#!/bin/bash
# Script for removing duplicate/bad entries from DM Catalog
#
# Usage:
#     dm_remove_catalog_duplicates.sh <experiment name> [<work dir>]
#
# For every unique file path cataloged for the experiment, the size and md5
# sum reported by dm-stat-file are compared with each catalog entry for that
# path. The first matching entry is recorded as good, further matching
# entries as duplicates, and non-matching entries as bad.
#
# This script does not delete anything itself: for every duplicate or bad
# entry it appends a ready-to-run dm-delete-file command line to
# <work dir>/<experiment name>.deleted.
#
# Files written to the work directory (default: /tmp), all named
# <experiment name>.<suffix>:
#     all        catalog entries (id, experimentFilePath, fileSize, md5Sum)
#     unique     unique experiment file paths
#     stat       dm-stat-file output for the file currently being checked
#     good       "<path> <id>" for the first correct entry of each path
#     duplicate  "<path> <id>" for additional correct entries
#     bad        "<path> <id>" for entries whose size/md5 sum do not match
#     deleted    generated dm-delete-file commands

# Without pipefail, "a | b || exit 1" only notices a failure of b.
set -o pipefail

# Print usage information.
usage() {
    echo "Usage:"
    echo " $0 <experiment name> [<work dir>]"
    echo ""
}

# Print $1 with leading and trailing blanks (spaces/tabs) removed.
# Done with parameter expansion so the value is never word-split or
# glob-expanded, which would alter paths containing spaces or wildcards.
trim() {
    local s=$1
    s=${s#"${s%%[![:blank:]]*}"}
    s=${s%"${s##*[![:blank:]]}"}
    printf '%s' "$s"
}

# Read text on stdin and print the value of the first whitespace-delimited
# "<key>=<value>" token whose key is $1. Looking the token up by key rather
# than by position keeps working when a file path contains spaces.
extract_value() {
    awk -v key="$1=" '
        {
            for (i = 1; i <= NF; i++) {
                if (index($i, key) == 1) {
                    print substr($i, length(key) + 1)
                    exit
                }
            }
        }'
}

EXPERIMENT_NAME=$1
if [[ -z "$EXPERIMENT_NAME" ]]; then
    usage
    exit 1
fi
WORK_DIR=${2:-/tmp}
mkdir -p -- "$WORK_DIR" || exit 1
echo "Using work directory $WORK_DIR for experiment $EXPERIMENT_NAME"

WORK_DIR_ABS=$(realpath "$WORK_DIR") || exit 1
FULL_LIST_FILE="$WORK_DIR_ABS/$EXPERIMENT_NAME.all"
UNIQUE_LIST_FILE="$WORK_DIR_ABS/$EXPERIMENT_NAME.unique"
STAT_FILE="$WORK_DIR_ABS/$EXPERIMENT_NAME.stat"
GOOD_LIST_FILE="$WORK_DIR_ABS/$EXPERIMENT_NAME.good"
DELETED_LIST_FILE="$WORK_DIR_ABS/$EXPERIMENT_NAME.deleted"
BAD_LIST_FILE="$WORK_DIR_ABS/$EXPERIMENT_NAME.bad"
DUPLICATE_LIST_FILE="$WORK_DIR_ABS/$EXPERIMENT_NAME.duplicate"

# Start every run with empty result files.
rm -f -- "$GOOD_LIST_FILE" "$BAD_LIST_FILE" "$DELETED_LIST_FILE" "$DUPLICATE_LIST_FILE"
touch -- "$GOOD_LIST_FILE" "$BAD_LIST_FILE" "$DELETED_LIST_FILE" "$DUPLICATE_LIST_FILE"

echo "Retrieving list of all catalog entries"
dm-list-experiment-files --experiment="$EXPERIMENT_NAME" \
    --display-keys=id,experimentFilePath,fileSize,md5Sum > "$FULL_LIST_FILE" || exit 1
echo "Retrieving list of unique file paths"
dm-list-experiment-files --experiment="$EXPERIMENT_NAME" \
    --display-keys=experimentFilePath \
    | sort -u \
    | sed 's?experimentFilePath=??' > "$UNIQUE_LIST_FILE" || exit 1

nUnique=$(wc -l < "$UNIQUE_LIST_FILE" | awk '{print $1}')
nFiles=$(wc -l < "$FULL_LIST_FILE" | awk '{print $1}')
echo "Total number of catalog entries: $nFiles"
echo "Total number of unique files: $nUnique"

storageHost=$(trim "$(dm-get-experiment --experiment="$EXPERIMENT_NAME" \
    --display-keys=storageHost | sed 's?storageHost=??')")
storageDirectory=$(trim "$(dm-get-experiment --experiment="$EXPERIMENT_NAME" \
    --display-keys=storageDirectory | sed 's?storageDirectory=??')")
if [[ -n "$storageHost" && -n "$storageDirectory" ]]; then
    # The directory is shell-quoted because the remote side re-parses the
    # command string.
    nFilesInStorage=$(ssh "$storageHost" \
        "find $(printf '%q' "$storageDirectory") -type f | wc -l")
else
    # An empty directory would make the remote find count files in the
    # login directory instead, so report the count as unknown.
    echo "Could not determine storage location for experiment $EXPERIMENT_NAME" >&2
    nFilesInStorage=unknown
fi
echo "Total number of files in storage: $nFilesInStorage $storageDirectory"
echo

fCount=0
# The list is read on file descriptor 3 so that commands run inside the loop
# cannot consume it through their standard input.
while IFS= read -r line <&3; do
    fCount=$((fCount + 1))
    f=$(trim "$line")
    echo "**********"
    echo "Working on file: \"$f\" ($fCount / $nUnique)"
    dm-stat-file --experiment="$EXPERIMENT_NAME" --relative-path="$f" --md5sum > "$STAT_FILE" || exit 1

    # Only the part of the output after the "STAT INFO" line is parsed.
    statInfo=$(sed -e '1,/STAT INFO/d' "$STAT_FILE")
    fileSize=$(extract_value fileSize <<< "$statInfo")
    md5Sum=$(extract_value md5Sum <<< "$statInfo")
    echo "File size: $fileSize, md5Sum: $md5Sum"

    # Fixed-string match: the path may contain regex metacharacters. The
    # trailing space keeps "a/b" from also matching "a/bc".
    pattern="experimentFilePath=$f "
    nEntries=$(grep -c -F -- "$pattern" "$FULL_LIST_FILE")
    echo "There are $nEntries catalog entries"

    goodId=""
    while IFS= read -r entry; do
        # First token of a catalog line is "id=<value>"; it is kept in that
        # form so that "--file-$id" below expands to "--file-id=<value>".
        read -r id _ <<< "$entry"
        catFileSize=$(extract_value fileSize <<< "$entry")
        catMd5Sum=$(extract_value md5Sum <<< "$entry")
        if [[ "$catFileSize" == "$fileSize" && "$catMd5Sum" == "$md5Sum" ]]; then
            echo "Catalog info is correct for $f, $id"
            if [[ -z "$goodId" ]]; then
                echo "File $id is marked as good"
                echo "$f $id" >> "$GOOD_LIST_FILE"
                goodId=$id
            else
                echo "File $id is marked as duplicate of $goodId"
                echo "$f $id" >> "$DUPLICATE_LIST_FILE"
                echo "dm-delete-file --keep-in-storage --experiment=$EXPERIMENT_NAME --file-$id" >> "$DELETED_LIST_FILE"
            fi
        else
            echo "Catalog info is not correct for $f, file size: $catFileSize, md5Sum: $catMd5Sum"
            echo "$f $id" >> "$BAD_LIST_FILE"
            echo "dm-delete-file --keep-in-storage --experiment=$EXPERIMENT_NAME --file-$id" >> "$DELETED_LIST_FILE"
        fi
    done < <(grep -F -- "$pattern" "$FULL_LIST_FILE")
done 3< "$UNIQUE_LIST_FILE"

echo
echo "**********"
echo
echo "Total number of files in storage : $nFilesInStorage $storageDirectory"
echo "Number of all catalog entries : $(wc -l "$FULL_LIST_FILE")"
echo "Number of unique catalog entries : $(wc -l "$UNIQUE_LIST_FILE")"
echo "Number of good catalog entries : $(wc -l "$GOOD_LIST_FILE")"
echo "Number of bad catalog entries : $(wc -l "$BAD_LIST_FILE")"
echo "Number of deleted catalog entries : $(wc -l "$DELETED_LIST_FILE")"