In this video I use a number of different command line utilities to identify duplicate files.
Print Working Directory
# Print the absolute path of the current working directory.
pwd
Find All Files in Current Directory
# Recursively list everything (files, directories, symlinks) under the current directory.
find ./
Find Only Files (no directories)
# -type f restricts the results to regular files (skips directories, symlinks, etc.).
find ./ -type f
Find Only Non-empty Files (no directories)
# -not -empty drops zero-length files (no point hashing them for duplicate detection).
# Note: -not and -empty are GNU/BSD find extensions; POSIX spells negation as "!".
find ./ -type f -not -empty
Find Only Non-empty Files and Exclude Files in Library Directory (no directories)
# Exclude everything under ./Library.
# Use "find ." (no trailing slash) so results print as "./Library/..." on both
# GNU and BSD find. With "find ./", BSD find prints ".//Library/..." while GNU
# find prints "./Library/...", so a hard-coded ".//Library/*" pattern only
# matches on one platform and the exclusion silently fails on the other.
find . -type f -not -empty -not -path './Library/*'
Get md5sum of Find Results (Mac)
# Hash every matching file with the Mac "md5" tool; -r prints "hash path",
# matching the field order of Linux md5sum so later pipeline steps work on both.
# "find ." (instead of "find ./") makes the ./Library/* exclusion match on both
# GNU and BSD find (BSD prints ".//..." paths for a "./" starting point).
find . -type f -not -empty -not -path './Library/*' -exec md5 -r {} \;
Get md5sum of Find Results (Linux)
# Linux equivalent: md5sum already prints "hash  path".
# "find ." (instead of "find ./") makes the ./Library/* exclusion match on both
# GNU and BSD find — GNU find prints "./Library/..." (single slash), so the
# original ".//Library/*" pattern excluded nothing on Linux.
find . -type f -not -empty -not -path './Library/*' -exec md5sum {} \;
Note: use "md5 -r" on Mac and "md5sum" on Linux in each of the commands below.
Write md5sums to File
# Same hash listing (Mac syntax shown), redirected into file_list.txt for the
# later sort/count steps. "find ." keeps the ./Library/* exclusion portable
# across GNU and BSD find.
find . -type f -not -empty -not -path './Library/*' -exec md5 -r {} \; > file_list.txt
Count the Number of Lines in file_list.txt
# Prints "<line count> file_list.txt"; one line per hashed file.
wc -l file_list.txt
Get list of md5sums for JPEG (.jpg) Files
# Only hash .jpg files; -iname matches case-insensitively (.JPG, .Jpg, ...).
# NOTE: '*.jpg' does not match a '.jpeg' extension — add "-o -iname '*.jpeg'"
# (parenthesized) if those should be included too.
# "find ." keeps the ./Library/* exclusion portable across GNU and BSD find.
find . -type f -not -empty -not -path './Library/*' -iname '*.jpg' -exec md5 -r {} \; > file_list.txt
View first 10 Lines of a File
head -n 10 file_list.txt  # show only the first 10 lines
View first 20 Lines of File
head -n 20 file_list.txt  # show only the first 20 lines
Sort file_list.txt
# Lexicographic sort puts identical hashes on adjacent lines — a prerequisite
# for counting duplicates with "uniq -c" later.
sort file_list.txt > file_list_sort.txt
Output file_list_sort.txt to Terminal
cat file_list_sort.txt  # dump the sorted "hash filename" lines to the terminal
Isolate the md5sums
# Keep only field 1 (the hash). cut can read the file directly — piping it in
# with "cat file |" spawns an extra process for nothing.
cut -d ' ' -f1 file_list_sort.txt
Count Each md5sum
# uniq -c prefixes each distinct hash with how many times it appears
# (input is already sorted, as uniq requires). cut reads the file directly —
# no "cat file |" needed.
cut -d ' ' -f1 file_list_sort.txt | uniq -c
Remove md5sums of Files that only have 1 Copy
# Drop hashes that appear exactly once. uniq -c right-pads its count column
# (padding width differs between GNU and BSD), so match any run of leading
# spaces: '^ *1 '. The stricter '^ 1 ' assumes exactly one leading space and
# can fail to filter anything. cut reads the file directly — no cat needed.
cut -d ' ' -f1 file_list_sort.txt | uniq -c | grep -v '^ *1 '
Swap md5sum and Count
# awk swaps the two fields so each line becomes "hash count" (hash first makes
# the later join on field 1 possible). Uses '^ *1 ' because uniq -c pads its
# count column with a platform-dependent number of spaces; cut reads the file
# directly instead of via cat.
cut -d ' ' -f1 file_list_sort.txt | uniq -c | grep -v '^ *1 ' | awk '{ t=$1 ; $1=$2; $2=t; print }'
Sort md5sum and Count List by Hash
# Same "hash count" list, sorted by hash so it can be fed to join (which
# requires sorted input). '^ *1 ' tolerates uniq -c's platform-dependent count
# padding; cut reads the file directly instead of via cat.
cut -d ' ' -f1 file_list_sort.txt | uniq -c | grep -v '^ *1 ' | awk '{ t=$1 ; $1=$2; $2=t; print }' | sort
Write Results to File
# Persist the sorted "hash count" list (hashes with 2+ occurrences only) for
# the join step. '^ *1 ' tolerates uniq -c's platform-dependent count padding;
# cut reads the file directly instead of via cat.
cut -d ' ' -f1 file_list_sort.txt | uniq -c | grep -v '^ *1 ' | awk '{ t=$1 ; $1=$2; $2=t; print }' | sort > file_counts.txt
Join File List to md5sum Counts
# Merge the duplicate counts with the full "hash filename" list, matching on
# the hash. Field 1 is join's default key for both inputs (so "-1 1 -2 1" is
# implied), and both files are already sorted on it, as join requires.
# Output lines look like: "hash count filename".
join file_counts.txt file_list_sort.txt
Sort Joined Results
# Sort the joined "hash count filename" lines numerically by count, then by
# hash. Attach the n flag to the count key only (-k2,2n): a global -n would
# also compare the hash key numerically, collapsing non-numeric hashes to 0
# and destroying the secondary ordering.
join -1 1 -2 1 file_counts.txt file_list_sort.txt | sort -k2,2n -k1,1
Save Joined Results to a File
# Save the final duplicate report (hash, copy count, filename) sorted by count
# then hash. -k2,2n scopes the numeric comparison to the count key; a global
# -n would also treat the hash key as a number and break the secondary sort.
join -1 1 -2 1 file_counts.txt file_list_sort.txt | sort -k2,2n -k1,1 > file_dups.txt