#!/bin/sh
# ~/bin/.sh/grepspam
# by http://www.berklix.com/~jhs/bin/.sh/grepspam
# also linked as grepspamline for manual output of more info

# Used by public_html/dots/Makefile & also called manually to find which
# line in a spam file contained a phrase from a list of phrases.

# Call as: grepspam /path/name/of/mail/falsely/identified/with/a/spam/phrase
#	To identify which phrase to remove from list of spam phrases.
#	either because it mis identified a genuine mail as spam,
#	or because it has a failing illegal syntax for grep.

#	Analyses a mail file (path name in $1), (a file that has usually
#	been filtered by procmail in to ~/mail/spam/phrases/ )
#	usually to find
#	which phrase[s] in $phrases caused procmail to detect the mail as spam.
#	Usual use is for autopsy, after an innocent mail gets mis-filed as spam.

# Call as grepspam /dev/null
#	To identify any syntax errors in phrases file (takes a while!)

# To find which phrase lines cause "grepspam /dev/null" to error. do this:
#  Uncomment CheckPhrase & then run script & then "grepspam /dev/null" and
#  Control D and vi -c/'egrep: Unmatched' typescript
# JJLATER Add code to do a case on $0 whether to emit CheckPhrase.

# Limitation/ Bug: 
#  If spam is received with multiple spaces between every word, eg
#	"this   spam   is   often   sent" 
#  then if the phrase is listed in ~/.procmailrc_phrases_src
#	"this spam is often sent" 
#  the spam mail will be caught by ~/.procmailrc_phrases_inc
#	"this[[:space:]]+spam[[:space:]]+is[[:space:]]+often[[:space:]]+sent"
#  However if a genuine non spam is falsely caught by the spam filter,
#  'grepspam ~/mail/spam/genuine/1' will fail to detect which phrase in
#  ~/.procmailrc_phrases_src detected the mail as spam.
#  This is not a new problem, as previously, eg phrase in
#  ~/.procmailrc_phrases_src "buy our cheap (jewellery|watches)" would
#  catch spam, but could not automatically be reverse matched by grepspam
#  from a mail with "buy our cheap watches".
#  Here's a command to run inside vi while debugging, chasing 
#	procmail: Invalid regexp
#	:1,$ s/(\[\[:space:\]\]|_)+/ /g
#  the ( & ) need delimiting with Control V

# Name this script was invoked under (grepspam, or the grepspamline link).
# "$0" is quoted so that an install path containing spaces is not split
# into a path plus a bogus suffix argument for basename.
base=$(basename "$0")

# Mail file to analyse, taken from the first argument (may still be empty
# at this point).
MailFile=$1

# Space separated list of files holding the spam phrases, one phrase per line.
phrases="$HOME/.DOTS/.procmailrc_phrases_src"
#	used by ~/public_html/dots/Makefile

# phrases="$phrases $HOME/.DOTS/.procmailrc_domains2block_src"
#	.procmailrc_domains2block_src is no longer appended,
#	as also no longer processed into
#	.procmailrc_phrases_inc by ~jhs/public_html/dots/Makefile

# Refuse to run without a mail file argument.
if [ -z "$MailFile" ]; then
	echo "Error. Specify a mail file to scan EG ~/mail/spam/phrases/1"
	exit 1
fi
# -e rather than -f, so that /dev/null (used to syntax check the phrase
# list) is accepted.  "$MailFile" is quoted so a path containing spaces is
# tested as one operand; unquoted it made [ fail with "too many arguments"
# and the missing file went undetected.
if [ ! -e "$MailFile" ]; then
	echo "Error. Not a file: $MailFile"
	exit 1
fi

# Ensure ${HOME}/.procmailrc_phrases_shrunk is up to date.
# 2017-12-05 no longer necessary since I switched from
#	.procmailrc_phrases_shrunk to .procmailrc_phrases_src
# (cd $HOME/public_html/dots ; make grepspam_hook ) > /dev/null

# Every phrase list must be readable before any scanning starts.
# $phrases is deliberately unquoted here: it is a space separated list.
# "$i" is quoted so each resulting path is tested as exactly one operand.
for i in $phrases ; do
	if [ ! -r "$i" ]; then
		echo "Error. Cannot read list of spam phrases: $i"
		exit 1
	fi
done

# Aggregate all spam phrases, maybe later I will file separately by language,
# or import external lists etc.
# Scratch file holding the concatenated phrase lists.  The name carries a
# UTC time stamp and the PID of this shell.
tmpfile="$HOME/tmp/.grepspam_erasable.$(date -u +%Y-%m-%dT%H:%M:%SZ).$$.tmp"

# Remove the scratch file however the script ends.  HUP, INT and TERM are
# turned into a normal exit so that the exit trap runs for them as well.
trap 'rm -f -- "$tmpfile"' 0
trap 'exit 1' 1 2 15

# $phrases is deliberately unquoted: it is a space separated list of files.
# Stop at once if the copy fails (eg $HOME/tmp is missing or not writable)
# rather than scanning with an absent or partial phrase list.
cat $phrases > "$tmpfile" || {
	echo "Error. Cannot create temporary phrase file: $tmpfile"
	exit 1
}

# From here on, split words on newlines only, so that text containing
# spaces stays in one piece when it is read or expanded.
export IFS='
'

# Show each line from spam phrases list that matches in the mail.
#	Do not enter spam phrases starting with a minus, ie '-',
#	as grep produces an error report.

#	IF A SYNTAX ERROR IS REPORTED,
#		such as
#			egrep: Unmatched ( or \(
#		To find the error, Uncomment the CheckPhrase: line, Then:
#			cd ~/tmp; script
#			grepspam /dev/null
#			^D
#			vi typescript
#	 Problem: A spam phrase in list of
#		highest quality of rep\\|ica
#	 with ('for i in') immediately below, shows
#	 false matches on string "ica" in mail with words such as:
#		certificate, helvetica;
#	 although the .procmailrc_phrases_inc does the right thing with
#		* highest quality of rep\|ica
#	because ~/public_html/dots/Makefile reduces \\ to \
#	when converting
#		from ~/.procmailrc_phrases_shrunk
#		to   ~/.procmailrc_phrases_inc

# for i in `cat $tmpfile` ; do
#	# echo "CheckPhrase: $i"
#	nice egrep -q -i "$i" $MailFile && echo "SpamPhrase: $i"
#	# No "&& exit" as we want all matching phrases, not just the first.
#	done

# 	New 'while' below converts \\ to \ so it doesn't cause a false match.

# report_phrase PHRASE
#	Search $MailFile for PHRASE, taken as a case insensitive extended
#	regular expression (grep -E, formerly egrep).  If at least one line
#	matches, print:
#		SpamPhrase: XXX PHRASE XXX
#		From Line: <matching lines joined by single spaces>
#	When the script runs under any name other than grepspam (eg the
#	grepspamline link), the matching lines are also printed first, one
#	per line.
#	Reads globals $MailFile and $base.  Prints nothing on stdout when no
#	line matches; a grep complaint about a bad PHRASE goes to stderr.
#	Always returns 0, so one bad phrase does not end the scan.
report_phrase() {
	# A single grep run supplies both the verdict (exit status) and the
	# matching lines.  -e keeps a PHRASE that starts with '-' from being
	# taken as a grep option.
	matched_lines=$(nice grep -E -i -e "$1" -- "$MailFile") || return 0
	if [ "$base" != grepspam ]; then
		printf '%s\n' "$matched_lines"
	fi
	# printf with quoted arguments: '*', '?' and '[' in the phrase or in
	# the mail are printed literally, not expanded as file name patterns.
	printf 'SpamPhrase: XXX %s XXX\n' "$1"
	printf 'From Line: %s\n' \
		"$(printf '%s\n' "$matched_lines" | paste -s -d ' ' -)"
}

# No -r on read: it is read's own backslash processing that turns \\ from
# the phrase list into \ before the phrase reaches grep.
# The "|| [ -n ... ]" part also scans a last line lacking a final newline.
# JJLATER add a -d debug flag instead of uncommenting the CheckPhrase line.
while read phrase || [ -n "$phrase" ]; do
	## echo CheckPhrase: $phrase
	report_phrase "$phrase"
	# Avoid an immediate exit 1 which would mean:
	#	- only listing the first spam phrase
	#	- not proceeding to clean up with rm $tmpfile
done < "$tmpfile"

# JJLATER I should switch the Makefile to use a syntax like 'for'
# further above, rather than 'while read' immediately above, as further
# above does not lose a single backslash, so I would not then need
# double backslash in .procmailrc_phrases_src

# Remove the scratch copy of the phrase lists.  Quoted, and with --, so the
# path is passed to rm as exactly one operand and never as an option.
rm -- "$tmpfile"
