#!/usr/bin/perl -w # # copyright 2003, gabriel rosenkoetter # Feel free to use, modify, and redistribute this as you like with the # caveat that you must retain the above copyright notice (that includes # adding it into derived works, kids). I wouldn't mind hearing from you # (gr@eclipsed.net) if you find it useful or modify/extend it. # # Note that unless you have fold_header set to 0 (which is NOT the # default!!!), this doesn't have a prayer of working. I could probably # deal better with that, but I wrote it for my own use and I'm lazy. # # Feed me a bunch of SpamAssassin-tagged messages in mbox format like # so: # # ./sa-analyze.pl < mbox # # Then make decisions about your rule-weighting. use strict; # We want to keep statistics both by message and over all the test output # so that we can look at both how many and which tests a given message # triggered (probably to judge whether to feed that message to sa-learn # as spam) and at how tests faired over the corpus (probably to change # the ranking of that individual test based on the spam a given users # actually gets). my (%messages, %tests); my ($hits, $testline, $test); my %stats; $stats{total} = 0; $stats{peak} = 0; $stats{peakmid} = undef; $stats{min} = 1000; $stats{minmid} = undef; my $messageid = undef; my $subject = undef; my $from = undef; my $sample_rate = 5; while (<>) { # When we see a new mbox From line, we're looking at a new # message, so stash what we've got and start anew. # # The exceptions are the first and last message. On the first message, # we don't yet have anything to store. On the last we won't see another # From line. So we store nothing if we don't have a $messageid yet, # and we store once more after we run out of input (outside this # loop). #if (/^From / && ($messageid ne 'null')) { # # XXX only but if we get a postmaster bounce in here, it horks # things right up (since we see the Message-ID from it but not # the other bits... certainly can't rely on an X-Spam-Status). if (/^From / && (defined($messageid))) { $messages{$messageid}{hits} = $hits; if ($testline) { foreach $test (split/,/, $testline) { if ($messages{$messageid}{tests}) { $messages{$messageid}{tests}++; } else { $messages{$messageid}{tests} = 1; } } } else { print STDERR "Do you maybe have fold_header set to 1?\n"; } $subject = 'NONE' if ($subject =~ m/^ *$/); $from = 'NONE' if ($from =~ m/^ *$/); print STDERR "[$messageid] [$subject] [$from]\n"; $messages{$messageid}{subject} = $subject; $messages{$messageid}{from} = $from; # XXX What could I possibly have been thinking here? # $messages{$messageid}{tests} = %tests; $messageid = undef; $subject = undef; $from = undef; } # When we see a MessageID, we start storing statistics on it. # Unfortunately some MTAs say "Message-ID" and some say "Message-Id". # The fact that some spamming assholes hide their real Message-IDs # away behind X-Message-ID or something like that doesn't matter # here: we're just using the Message-ID as a unique token. Which, # um, it'd really better be... are you LISTENING to me, Microsoft? # Yeah, thought not. Oh well. At least Novell is... if (/^Message-ID: /i) { $messageid = (split())[1]; } # When we see an X-Spam-Status line, we want to parse it into # temporary vars which we'll eventually dump in the relevant hashes # (by incrementing in %tests here and by throwing the results in # %messages when we get to the next message). if (/^X-Spam-Status: /) { ($hits, $testline) = (split())[2,4]; # Need to drop the tags; we know what they are because of our # data structures. $hits =~ s/hits=//; $stats{total} += $hits; if ($hits > $stats{peak}) { $stats{peak} = $hits; $stats{peakmid} = $messageid; } if ($hits < $stats{min}) { $stats{min} = $hits; $stats{minmid} = $messageid; } if ($testline) { $testline =~ s/tests=//; foreach $test (split(/,/, $testline)) { if ($tests{$test}) { $tests{$test}++; } else { $tests{$test} = 1; } } } else { print STDERR "Do you maybe have fold_header set to 1?\n"; } } # Message-IDs are all well and good for uniquness, but Subject and # From lines are what we really care about within an MUA. if (/^Subject: / && (! (defined($subject)))) { chomp($subject = (split(/ /, $_, 2))[1]); $subject = ' ' if ($subject =~ m/^$/); } if (/^From / && (! (defined($from)))) { chomp($from = (split(/ /, $_, 2))[1]); $from = ' ' if ($from =~ m/^$/); } } # Store information from the last message. $messages{$messageid}{hits} = $hits; if ($testline) { foreach $test (split/,/, $testline) { if ($messages{$messageid}{tests}) { $messages{$messageid}{tests}++; } else { $messages{$messageid}{tests} = 1; } } } else { print STDERR "Do you maybe have fold_header set to 1?\n"; } print "Messages analyzed: ", scalar(keys %messages), "\n"; print "Mean average hits: ", $stats{total} / (scalar(keys %messages)), "\n"; print "Peak hits: $stats{peak} in\n Message-ID: $stats{peakmid}\n"; print " Subject: $messages{$stats{peakmid}}{subject}\n"; print " From: $messages{$stats{peakmid}}{from}\n"; print "Min. hits: $stats{min} in\n Message-ID: $stats{minmid}\n"; print " Subject: $messages{$stats{minmid}}{subject}\n"; print " From: $messages{$stats{minmid}}{from}\n"; print "Tests matched in a statistically-relevant ($sample_rate\% or more)"; print " number of messages:\n"; # Sorting based on value using an inline function... this is totally # grotty syntax, if you ask me, but it's the Right Perl Way to do this. # See perldoc -f sort. foreach $test (sort {$tests{$b} <=> $tests{$a}} keys %tests) { print " $test: $tests{$test}\n" if ($tests{$test} >= ($sample_rate * scalar(keys %messages)) / 100); } # So, right now, we're not doing jack with %messages, really. If I end up # wanting to, it'd be pretty easy to throw in a CLI to query for information # about given rules (or rules above a given frequency) throuigh %messages.