#!/bin/sh
# /home/jhs/bin/.sh/dups
# http://www.berklix.com/~jhs/bin/.sh/dups
# Bourne shell script by Julian H. Stacey 2021-09-17
# DESCRIPTION:
#  Find all duplicate files in current directory & under sub directories,
#  & output a suggested script of commands to strip duplicates.
#  Do Not run the output script blind, but first review it, eg:
#	 cd ~ ; dups > tmp/dups2edit ; vi tmp/dups2edit ; sh tmp/dups2edit
#  as tmp/dups2edit will need half the filenames re-ordered,
#  so it does not strip the wrong duplicate file,
#  eg tmp/dups2edit might suggest:
#	cmpd -d \
#	sources/master/valuable_source.c \
#	sources/scratch_temp/experiment.c \
#
#  which if run would delete your original source, not the junk temp.

# NEEDS: http://www.berklix.com/~jhs/src/bsd/jhs/bin/public/cmpd
# BUGS: Not tested on files with spaces or other punctuation in file names, see
#	http://www.berklix.com/~jhs/bin/.sh/nospace

# Example:
#	cd ~/mail ; make clean ; dups | \
#	 grep -v .xmhcache | grep -v .mh_sequences | grep -v , \
#	 > ~/tmp/dups_script_to_vi
#  The "make clean" and "| grep -v ," are to reduce output noise,
#  from exmh transient file names that have been renamed to ,[0-9]+ ,
#  before exmh moves them from one directory to another.

# History:
#   I have run this on my large home directory & my pictures archive,
#   without problem, to seek out duplicates, that I have then Manually
#   decided which to strip against which with cmpd.

# Notes:
#  I seem to recall hearing it is possible for md5 to not be unique
#  between 2 different files ? Whether that is true or not, No problem,
#  even if md5 thinks they are the same, just in case they are not,
#  or have changed since, cmpd will only delete a file if the content
#  is exactly the same as a reference file.

# Improvements done beyond the first prototype:
# - md5 only gets run once per file to save CPU
# - The list of filenames is now stored in a file, not an environment variable,
#   to avoid exhaustion of max variable size,
#   if eg scanning a whole big home directory.


# Print a warning on stderr, so that stdout carries only the suggested
# script, then wait 5 seconds before the scan starts.
# The text is written through the already open descriptor 2 (>&2) rather
# than by re-opening /dev/stderr by name: re-opening can truncate a log
# file that stderr was redirected to, and needs the device node to exist.
cat >&2 <<EOF
If running $0 on ~/._ first make clean,
else there will be a lot of chaff from temporary files,
& copies of stuff installed into ~/public_html
EOF
sleep 5

# Two scratch lists, named after the PID of this shell:
#   ~/tmp/.dups.$$.short	one md5 per line
#   ~/tmp/.dups.$$.full		one "path md5" pair per line
# They are removed whenever the script exits (trap 0).  HUP, INT, QUIT
# and TERM (1 2 3 15) turn into a failing exit, which in turn runs the
# exit trap, so an interrupted run never looks like a successful one.
trap "rm -f ~/tmp/.dups.$$.short ~/tmp/.dups.$$.full" 0
trap 'exit 1' 1 2 3 15

# Create both lists now & give up at once if that is not possible
# (eg ~/tmp missing), rather than checksumming the whole tree for nothing.
touch ~/tmp/.dups.$$.short ~/tmp/.dups.$$.full || {
	echo "$0: cannot create scratch files in ~/tmp" >&2
	exit 1
}
# dups_record PATH SUM SHORTLIST FULLLIST
#  Append SUM to the file SHORTLIST & "PATH SUM" to the file FULLLIST.
#  Nothing is recorded if SUM is empty (no checksum was obtained) or if
#  SUM is d41d8cd98f00b204e9800998ecf8427e, the md5 of a zero size file:
#  Zero size files are often special case dummies not to be removed.
dups_record() {
	case $2 in
	'' | d41d8cd98f00b204e9800998ecf8427e)
		return 0
		;;
	esac
	printf '%s\n' "$2" >> "$3"
	printf '%s %s\n' "$1" "$2" >> "$4"
}

# Checksum every regular file under the current directory, except the
# names excluded below, & record the results in the two scratch lists.
# - find -s walks each directory in lexicographic order (BSD find).
# - -name is compared with the last path component only, so a pattern
#   containing "/" can never match; whole directories are therefore
#   excluded with -path.  (No -path is needed for thumbnails:
#   -name '*-thumb.jpg' already applies in every directory.)
# - Paths are read one per line with "read -r" & always quoted, so names
#   containing spaces, globbing characters or backslashes stay intact.
#   Names containing a newline are still not supported.
# - md5 is run once per file; a file md5 cannot read is skipped.
find -s . -type f \
	! -name ',[0-9]*' \
	! -name .just_com \
	! -name .mh_sequences \
	! -name '20[0-9][0-9]*-1-0.html' \
	! -name Makefile \
	! -name '*-thumb.jpg' \
	! -path '*/.backup/*' \
	! -path '*/.xvpics/*' \
	! -path '*/passwd/*' \
	! -path '*/neat-round.theme/*' \
	! -name .xmhcache \
	! -name adr.fax \
	! -name adr.ltr \
	! -name adr1 \
	! -name adr2 \
	! -name back.png \
	! -name bot.png \
	! -name cmt \
	! -name co \
	! -name dept \
	! -name dummy.html \
	! -name dummy.pdf \
	! -name dummy.txt \
	! -name editdata.html \
	! -name email \
	! -name fax \
	! -name filelist.html \
	! -name index.html \
	! -name index.shtml \
	! -name left.png \
	! -name liner.txt \
	! -name makefile \
	! -name 'msg.*' \
	! -name name1 \
	! -name name2 \
	! -name next.png \
	! -name page.html \
	! -name 'pict0[0-9][0-9][0-9]-1-0.html' \
	! -name prev.png \
	! -name ref.fax \
	! -name ref.ltr \
	! -name right.png \
	! -name stacey_e.txt \
	! -name stacey_g.txt \
	! -name tel \
	! -name top.png \
	! -name wpa_supplicant.conf.master \
	! -name '*passwd*' \
	-print |
while IFS= read -r i ; do
	# stdin is the list of paths; keep md5 away from it.
	XX=$(md5 -q "$i" < /dev/null) || continue
	dups_record "$i" "$XX" ~/tmp/.dups.$$.short ~/tmp/.dups.$$.full
	done

# dups_emit_group SUM FULLLIST
#  Print one suggested command for the files in FULLLIST ("path md5" per
#  line) whose md5 is SUM:
#	cmpd -d \
#	path1 \
#	path2 \
#	<empty line>
#  - Only the trailing " SUM" field is matched & removed, so a file whose
#    own name contains the checksum keeps its name.
#  - Any character of a path other than letters, digits & _ . / , + : @ % = -
#    gets a backslash in front of it, so the printed script is safe to
#    give to sh even for names with spaces, & ; $ etc.
#    Paths made only of the listed characters are printed unchanged.
dups_emit_group() {
	printf '%s\n' 'cmpd -d \'
	grep -- " $1\$" "$2" |
		sed -e "s/ $1\$//" \
		    -e 's|[^[:alnum:]_./,+:@%=-]|\\&|g' \
		    -e 's/$/ \\/'
	echo
}

# Every md5 that was recorded more than once marks a group of duplicates.
sort ~/tmp/.dups.$$.short | uniq -d |
while IFS= read -r i ; do
	dups_emit_group "$i" ~/tmp/.dups.$$.full
	done
rm -f ~/tmp/.dups.$$.short ~/tmp/.dups.$$.full
