Skip to content
Snippets Groups Projects
Commit fe1131fa authored by sveseli's avatar sveseli
Browse files

add utility to remove catalog duplicates

parent 941fd347
No related branches found
No related tags found
No related merge requests found
# Interactively review the files listed in $e.diff: for each one show the
# storage stat info and the matching catalog entries, then delete the catalog
# entry whose id the operator types in (the file itself is kept in storage).
# NOTE(review): assumes $e.diff holds one file path per line -- confirm.
e=Cycle2018_2_Axalta
# The list is read on descriptor 3 so that the prompt below can still read the
# operator's answer from standard input.
while IFS= read -r f <&3 || [ -n "$f" ]; do
  echo
  echo "*******************************"
  echo "$f"
  # Path relative to the experiment: drop the first "<experiment>/" occurrence.
  efp=${f/"$e/"/}
  echo "$efp"
  dm-stat-file --experiment="$e" --relative-path="$efp"
  echo
  echo LIST
  dm-list-experiment-files --experiment="$e" "experimentFilePath:$efp"
  echo
  read -r -p "Enter file id to be deleted: " fileId
  # Never call dm-delete-file with an empty id (blank answer or end of input).
  if [ -z "$fileId" ]; then
    echo "No file id entered, skipping $efp"
    continue
  fi
  dm-delete-file --keep-in-storage --experiment="$e" --file-id="$fileId"
done 3< "$e.diff"
#!/bin/bash
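# Script for finding duplicate/bad entries in the DM Catalog.
# Nothing is deleted here: the dm-delete-file commands that would remove them
# are written to <work dir>/<experiment name>.deleted.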
usage() {
  # Print the command synopsis followed by one empty line.
  printf '%s\n' \
    "Usage:" \
    " $0 <experiment name> [<work dir>]" \
    ""
}
# ---- Arguments -------------------------------------------------------------
# $1: experiment name (required); $2: work directory (optional, default /tmp).
EXPERIMENT_NAME=$1
if [ -z "$EXPERIMENT_NAME" ]; then
  usage
  exit 1
fi
WORK_DIR=${2:-/tmp}
mkdir -p -- "$WORK_DIR" || exit 1
echo "Using work directory $WORK_DIR for experiment $EXPERIMENT_NAME"

# ---- Work files (absolute paths inside the work directory) ------------------
# .all       every catalog entry (id, path, size, md5 sum)
# .unique    the distinct experiment file paths
# .stat      scratch output of the most recent dm-stat-file call
# .good/.bad/.duplicate/.deleted  result lists, recreated empty on every run
FULL_LIST_FILE=$(realpath "$WORK_DIR/$EXPERIMENT_NAME.all") || exit 1
UNIQUE_LIST_FILE=$(realpath "$WORK_DIR/$EXPERIMENT_NAME.unique") || exit 1
STAT_FILE=$(realpath "$WORK_DIR/$EXPERIMENT_NAME.stat") || exit 1
GOOD_LIST_FILE=$(realpath "$WORK_DIR/$EXPERIMENT_NAME.good") || exit 1
DELETED_LIST_FILE=$(realpath "$WORK_DIR/$EXPERIMENT_NAME.deleted") || exit 1
BAD_LIST_FILE=$(realpath "$WORK_DIR/$EXPERIMENT_NAME.bad") || exit 1
DUPLICATE_LIST_FILE=$(realpath "$WORK_DIR/$EXPERIMENT_NAME.duplicate") || exit 1
rm -f -- "$GOOD_LIST_FILE" "$BAD_LIST_FILE" "$DELETED_LIST_FILE" "$DUPLICATE_LIST_FILE"
touch -- "$GOOD_LIST_FILE" "$BAD_LIST_FILE" "$DELETED_LIST_FILE" "$DUPLICATE_LIST_FILE"

# ---- Catalog listings -------------------------------------------------------
echo "Retrieving list of all catalog entries"
dm-list-experiment-files --experiment="$EXPERIMENT_NAME" \
  --display-keys=id,experimentFilePath,fileSize,md5Sum > "$FULL_LIST_FILE" || exit 1
echo "Retrieving list of unique file paths"
# pipefail (scoped to this subshell) makes a failing dm-list-experiment-files
# abort the script; without it only the last pipeline stage would be checked.
(
  set -o pipefail
  dm-list-experiment-files --experiment="$EXPERIMENT_NAME" --display-keys=experimentFilePath \
    | sort -u \
    | sed 's?experimentFilePath=??' > "$UNIQUE_LIST_FILE"
) || exit 1
nUnique=$(wc -l < "$UNIQUE_LIST_FILE" | awk '{print $1}')
nFiles=$(wc -l < "$FULL_LIST_FILE" | awk '{print $1}')
echo "Total number of catalog entries: $nFiles"
echo "Total number of unique files: $nUnique"

# ---- Storage file count -----------------------------------------------------
# The values are trimmed of surrounding blanks because they are used quoted
# below (a stray blank would otherwise become part of the host name or path).
storageHost=$(dm-get-experiment --experiment="$EXPERIMENT_NAME" --display-keys=storageHost \
  | sed -e 's?storageHost=??' -e 's/^[[:blank:]]*//' -e 's/[[:blank:]]*$//')
storageDirectory=$(dm-get-experiment --experiment="$EXPERIMENT_NAME" --display-keys=storageDirectory \
  | sed -e 's?storageDirectory=??' -e 's/^[[:blank:]]*//' -e 's/[[:blank:]]*$//')
if [ -n "$storageHost" ] && [ -n "$storageDirectory" ]; then
  # %q escapes the directory for the remote shell that parses the command line.
  nFilesInStorage=$(ssh "$storageHost" "find $(printf '%q' "$storageDirectory") -type f | wc -l")
else
  # Without both values the remote find would count the wrong directory.
  echo "Could not determine storage host/directory for experiment $EXPERIMENT_NAME" >&2
  nFilesInStorage=unknown
fi
echo "Total number of files in storage: $nFilesInStorage $storageDirectory"
echo
# Print $1 with leading and trailing blanks (spaces and tabs) removed.
# Done with parameter expansion so the text is never word-split or globbed.
trim_blanks() {
  local s=$1
  s="${s#"${s%%[![:blank:]]*}"}"
  s="${s%"${s##*[![:blank:]]}"}"
  printf '%s\n' "$s"
}

# Print the value stored under key $2 in catalog line $1, where the line is a
# sequence of blank-separated "key=value" tokens; prints an empty line when
# the key is absent. Looking values up by key name instead of column position
# keeps them correct when a file path itself contains blanks.
catalog_field() {
  local line=$1 key=$2
  local re="^(.*[[:blank:]])?${key}=([^[:blank:]]*)"
  if [[ $line =~ $re ]]; then
    printf '%s\n' "${BASH_REMATCH[2]}"
  else
    printf '\n'
  fi
}

# For every unique file path: stat the file in storage, then compare each
# catalog entry for that path against the stat result. The first matching
# entry is recorded as good, later matching ones as duplicates, and entries
# with a different size/md5 sum as bad. For duplicate and bad entries the
# dm-delete-file command is only written to $DELETED_LIST_FILE, not executed.
fCount=0
# The path list is read on descriptor 3 so commands inside the loop keep the
# script's standard input; IFS is cleared for the read only, never globally.
while IFS= read -r f <&3; do
  fCount=$((fCount + 1))
  f=$(trim_blanks "$f")
  echo "**********"
  echo "Working on file: \"$f\" ($fCount / $nUnique)"
  dm-stat-file --experiment="$EXPERIMENT_NAME" --relative-path="$f" --md5sum > "$STAT_FILE" || exit 1
  # Only the output after the "STAT INFO" line is parsed (2nd and 3rd columns).
  fileSize=$(sed -e '1,/STAT INFO/d' "$STAT_FILE" | awk '{print $2}' | sed 's?fileSize=??')
  md5Sum=$(sed -e '1,/STAT INFO/d' "$STAT_FILE" | awk '{print $3}' | sed 's?md5Sum=??')
  echo "File size: $fileSize, md5Sum: $md5Sum"
  # -F: the path is matched literally, not as a regular expression.
  entries=$(grep -F -- "experimentFilePath=$f " "$FULL_LIST_FILE")
  nEntries=$(grep -c -F -- "experimentFilePath=$f " "$FULL_LIST_FILE")
  echo "There are $nEntries catalog entries"
  goodId=""
  while IFS= read -r entry; do
    # An empty match list still yields one empty here-string line.
    [[ -n "$entry" ]] || continue
    # $id is the first token of the catalog line, i.e. the whole "id=<value>"
    # pair (assuming the listing prints key=value tokens in display-key
    # order), so "--file-$id" below expands to "--file-id=<value>".
    read -r id _ <<< "$entry"
    catFileSize=$(catalog_field "$entry" fileSize)
    catMd5Sum=$(catalog_field "$entry" md5Sum)
    if [[ "$catFileSize" == "$fileSize" && "$catMd5Sum" == "$md5Sum" ]]; then
      echo "Catalog info is correct for $f, $id"
      if [[ -z "$goodId" ]]; then
        echo "File $id is marked as good"
        echo "$f $id" >> "$GOOD_LIST_FILE"
        goodId=$id
      else
        echo "File $id is marked as duplicate of $goodId"
        echo "$f $id" >> "$DUPLICATE_LIST_FILE"
        echo "dm-delete-file --keep-in-storage --experiment=$EXPERIMENT_NAME --file-$id" >> "$DELETED_LIST_FILE"
      fi
    else
      echo "Catalog info is not correct for $f, file size: $catFileSize, md5Sum: $catMd5Sum"
      echo "$f $id" >> "$BAD_LIST_FILE"
      echo "dm-delete-file --keep-in-storage --experiment=$EXPERIMENT_NAME --file-$id" >> "$DELETED_LIST_FILE"
    fi
  done <<< "$entries"
done 3< "$UNIQUE_LIST_FILE"
# Final report. Each "wc -l" prints the line count followed by the list file
# path, so the report also shows where every list was written.
echo
echo "**********"
echo
echo "Total number of files in storage : $nFilesInStorage $storageDirectory"
echo "Number of all catalog entries : $(wc -l "$FULL_LIST_FILE")"
echo "Number of unique catalog entries : $(wc -l "$UNIQUE_LIST_FILE")"
echo "Number of good catalog entries : $(wc -l "$GOOD_LIST_FILE")"
echo "Number of bad catalog entries : $(wc -l "$BAD_LIST_FILE")"
# The duplicate list is written during the run, so report it as well.
echo "Number of duplicate catalog entries : $(wc -l "$DUPLICATE_LIST_FILE")"
echo "Number of deleted catalog entries : $(wc -l "$DELETED_LIST_FILE")"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment