On Thu, 22 Jan 2004, Vermyndax wrote:

> Greetings all...
>
> I am trying to implement a way to generate statistics for Spamassassin.
> I've tried numerous perl scripts but most of them return all zeros for
> the stats.  The biggest example I can think of is the sa-stats.pl
> script.  No matter what I do, the script returns all zeros.  Is it
> because it's having a problem parsing out my log file?  Can any of you
> perl experts give me a little help?  Or can any of you point me in the
> direction of some fantastic stats generator for SpamAssassin?

I fixed a couple of things in spamstats0.4b5.pl a while ago.
See the attached version and give it a try.

regards,
Matthias
#!/usr/bin/perl -w
## FIXME : remove the -w above when you find the code is fixed !
#
#This is spamstats.pl v0.4b5
#
#
#Changelog
#0.4b5 11 August 2003
#Fixed the bug when a month starting with a zero is entered as start/enddate.
#
#0.4b4 10 June 2003
# Fixed the infile == 0 bug, thanks to Yen-Ming Lee
# Fixes sendmail parsing when email is delivered through procmail, raised by
# Dirk Kuypers
#
#
#0.4b3 2 June 2003
#Applied patches from Bob Apthorpe for :
#  * more elegant fix of the two digits month intput problem
#  * better input handling, now files to process can be specified in @ARGV
#    without the --file switch
#  * Added documentation and scripts to graph spamstats output with cricket.
#
#0.4b2 30 May 2003
#Regexp bugfix in exim mailer_in handling
#Regexp bugfix in spamd ("processing message" seems to have changed to
#"checking message") on some setups.
#Updated README into a more english (and less french) syntax
#
#0.4b1 19 May 2003
#This is a very tiny bugfix.
#Fixes parsing mistakes on sendmail setups that relay emails as outputs.
#Emails were undetected on those setups.
#
#0.4b 10 Mar 2003
#WARNING : this release changes the default behaviour of spamstats calculations !!
#From this version on spamstats counts spams and non-spams per recipient, not
#per mailer ID.
#(Until this version, a multirecipient message sent to both "[EMAIL PROTECTED]"
#and "[EMAIL PROTECTED]" counted only as one spam. From now on it counts as two.)
#New option : -agglo-recipients  uses spamstats "old" mode : one count per
#mailer ID, not per recipient.
#WARNING : FOR NOW EXIM USERS PROBABLY WANT TO USE THIS OPTION, ON SOME EXIM CONFIGS 
#          THERE ARE RISKS LOG ANALYSIS BE BROKEN IF NOT USED!
#Applied patch from Jim Breton <[EMAIL PROTECTED]> for a better display.
#
#0.4 25 Feb 2003
#[Probably very incomplete] sendmail support
#Only sendmail regexp were added, no code modification !
#This is not a very important release in terms of work. Hopefully it is in terms
#of capabilities :-)
#
#0.3b2 30 Jan 2003
#Fix a problem where script will issue warnings when parsed log file is empty or 
#contains no reference to used mailer (only contains spamd messages).
#
#0.3b 04 Jan 2003
#Added a (hopefully) useful time filter specification to be used : duration
#specification.
#
#0.3a 29 Dec 2002
#Date/Time filter now works.
#Some tiny code cleanup.
#HTML output support.
#
#0.3alpha 17 Dec 2002
#Exim support
#Some work on date/time filtering support, far from complete. These options
#are useless for now.
#
#0.2f 26 Nov 2002
#If one input file does not exist, mentions which!
#
#0.2e 26 Nov 2002
#Option "-noabsolute" makes spamstats not complain if argument log file names
#are not absolute.
#Now reports total Volume of Spam and Volume of clean messages in general statistics.
#
#0.2d
#Local recipients were not counted, only relayed ones.
#Regexp was modified to just match both.
#Thanks to Jon Gabrielson for bug report
#
#0.2c
#No more lower/upper case distinction in top recipients classification
#Thanks to Kenneth Nerhood for bug report
#
#0.2b
#Fixes stupid bug from 0.2 where spamd process had to run as user "spamd"
#Thanks to Kenneth Nerhood for bug report
#
#
#Parses a Postfix and spamd log file (or several) and extracts top Spam receivers.
#Also displays spam statistics.
#
#Author : Vincent Deffontaines <[EMAIL PROTECTED]>
#Script Basis, Postfix support Copyright : Vincent Deffontaines 
#                                          KDX (www.kdx.fr)
#                                          Council of Europe (www.coe.int)
#
#Exim support                  Copyright : Vincent Deffontaines.
#Sendmail support                  Copyright : Vincent Deffontaines.
#
#Please send me contributions/ modifications/ comments that could be useful to
#this script!
#Other mailers than Postfix/Exim shouldn't be hard to support.
#Author will help and include modifications to this script as long as mailers
#are free software.
#
#This program is free software; you can redistribute it and/or
#modify it under the terms of the GNU General Public License
#as published by the Free Software Foundation; version 2
#of the License.
#
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License
#along with this program; if not, write to the Free Software
#Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
use strict;
use Getopt::Long;
#Only useful for Debugging, useless if you don't hack through this code :-)
#use Data::Dumper;

use Compress::Zlib;

#Log files to analyse (file name => 1)
my %infile = ();

#Command line switches, all disabled by default
my ($number, $help, $nogeneral, $debug, $noabsolute) = (0, 0, 0, 0, 0);
my ($html, $duration, $agglo_rcpt) = (0, 0, 0);

#Number of input errors found during the sanity checks
my $error = 0;

#Time filter bounds; "none" means the option was not given
my ($startdate, $starttime) = ("none", "none");
my ($enddate, $endtime) = ("none", "none");
my ($skipstarttest, $skipendtest) = (0, 0);

#Mailer whose log format is being parsed; detected from the log lines themselves
my $mailerlogtype;

#Markup fragments used around the output; all empty in plain text mode
my %html_tags = (
   'br'       => '',
   'b'        => '',
   'i'        => '',
   'html'     => '',
   'body'     => '',
   'endtag'   => '',
   'starttag' => '',
);

my $spam_percent = 0;
my $clean_percent = 0;


#Parse regexp definitions for each mailer and for spamd.
#Layout : $Defs{'mailer_in'}{mailer}  matches a message entering the mailer
#         $Defs{'mailer_out'}{mailer} matches a delivery to one recipient
#         $Defs{'spamd_in'}{mailer}   matches spamd starting to check a message
#         $Defs{'spamd_clean'} / $Defs{'spamd_spam'} match spamd verdicts
#Every pattern starts with the same syslog header : month, day, time, host name.
my $syslog_header = '^\s*[a-zA-Z]{3}\s+\d+\s+\d+:\d+:\d+\s+\S+\s+';
#Time stamp exim adds on its own after the syslog header
my $exim_stamp = '\d{4}-\d{2}-\d{2}\s+\d+:\d+:\d+\s+';
#Tail shared by both spamd verdict lines : (score/threshold) for user:uid in N seconds, N bytes.
my $spamd_result = '\s*\(([^\/]+)\/[^\)]+\)\s+for\s+\S+\d+\s+in\s+(\S+)\s+seconds,\s+(\d+)\s+bytes\.';

my %Defs = ();

#Captures : (cleanup pid, queue id, message-id)
$Defs{'mailer_in'}{'postfix'} =
   $syslog_header.'postfix\/cleanup\[(\d*)\]:\s+([^:]+):\s*message-id=(.*)$';

#NOTE(review): in the copy this was taken from, the part of the two exim
#patterns between "<" / "=>\s+(" and the following "(" / ")" had been replaced
#by a mail archive's "[EMAIL PROTECTED]" address obfuscation. The "\S+@\S+"
#pieces below are a best-effort reconstruction - verify against the upstream
#spamstats 0.4b5 source before trusting exim results.
#Captures : (exim pid, exim message id, rest of the line after the sender)
$Defs{'mailer_in'}{'exim'} =
   $syslog_header.'exim\[(\d+)\]:\s+'.$exim_stamp.'(\S+)\s+<=\s+\S+@\S+\s+(.*)$';

#Captures : (sendmail pid, queue id, message-id without the angle brackets)
$Defs{'mailer_in'}{'sendmail'} =
   $syslog_header.'sendmail\[(\d+)\]:\s+(\S+):\s+from=<[^>]*>,\s+size=\d+,\s+class=\S+,\s+nrcpts=\d+,\s+msgid=<([^>]+)>.*,\s+proto=\S+,\s+daemon=\S+,\s+relay=.*$';

#Captures : (spamd pid, message-id). Postfix logs the message-id with its angle
#brackets, so they are kept here; exim and sendmail log it without them.
$Defs{'spamd_in'}{'postfix'} =
   $syslog_header.'spamd\[(\d+)\]:\s+(?:processing|checking)\s+message\s*(.*)\s+for\s+\S+';
$Defs{'spamd_in'}{'exim'} =
   $syslog_header.'spamd\[(\d+)\]:\s+(?:processing|checking)\s+message\s*<(.*)>\s+for\s+\S+';
$Defs{'spamd_in'}{'sendmail'} =
   $syslog_header.'spamd\[(\d+)\]:\s+(?:processing|checking)\s+message\s*<(.*)>\s+for\s+';

#Captures : (spamd pid, score, analysis time in seconds, size in bytes)
$Defs{'spamd_clean'} =
   $syslog_header.'spamd\[(\d+)\]:\s+clean\s+message'.$spamd_result;
$Defs{'spamd_spam'} =
   $syslog_header.'spamd\[(\d+)\]:\s+identified\s+spam'.$spamd_result;

#Captures : (queue id, recipient address)
#Sample of a relayed delivery, which this pattern deliberately does not match :
#'Mar 10 02:11:24 barrel postfix/smtp[20611]: 5A9BF22E04: to=<obfuscated>, relay=127.0.0.1[127.0.0.1], delay=2, status=sent (250 ok 1047280284 qp 20787)'
$Defs{'mailer_out'}{'postfix'} =
   $syslog_header.'postfix\/(?:pipe|local)\[\d+\]:\s+([^:]+):\s+to=<([^>]+)>';
#See the NOTE(review) above : the recipient capture is a reconstruction.
$Defs{'mailer_out'}{'exim'} =
   $syslog_header.'exim\[\d+\]:\s+'.$exim_stamp.'(\S+)\s+=>\s+(\S+@\S+)\s+';
$Defs{'mailer_out'}{'sendmail'} =
   $syslog_header.'sendmail\[\d+\]:\s+(\S+):\s+to=(?:\|.*ctladdr=<|<)([^>]+)>.*,\s+delay=\S+,\s+xdelay=\S+,\s+mailer=\S+,\s+pri=\d+.*,\s+dsn=\S+,\s+stat=\S+';

sub Print_Usage()
#Prints the command line help on STDOUT.
#Takes no argument. Each string below is one complete output line.
{
   print
      "{Exim/Postfix/Sendmail} & spamd logfile analyser. Extracts top N Spam receivers\n",
      "$0 [-help] [-debug][-file=/path/to/filename] [-file=...] [-number=...] [-nogeneral]\n",
      "   [-startdate=dd-mm] [-starttime=hh:mm:ss] [-enddate=dd-mm] [-endtime=hh:mm:ss]\n",
      "   [-duration=number of seconds] /path/to/file1 [/path/to/file2] [/path/to/file3.gz]\n",
      "GENERAL OPTIONS\n",
      "\t-debug\t\t\t: Displays informations that _might_ indicate problems while parsing.\n",
      "\t-help\t\t\t: Displays this help and exits.\n",
      "\t-file /path/file\t: Analyses mail log file for spam results (as logged by spamd) :\n",
      "\t\t\t\t  Several files can be asked for parsing at a time, including .gz files\n",
      "\t\t\t\t  Default /var/log/mail.log\n",
      #The line break after "options," was missing, gluing the next line to this one
      "\t\t\t\t  This switch is deprecated, simply specify filenames after all options,\n",
      "\t\t\t\t   without any switch.\n",
      "\t-number number\t\t: specifies number of top spam receivers to display (default : 0).\n",
      "\t-nogeneral\t\t: do not display general stats.\n",
      "\t-noabsolute\t\t: lets non-absolute named logfiles be processed.\n",
      "\t-html\t\t\t: HTML output\n",
      "TIME FILTER OPTIONS (no time filter used if no option specified)\n",
      "\t-startdate dd-mm\t: Process only data logged from that date\n",
      "\t\t\t\t  Default : today if starttime specified, else unused\n",
      "\t-enddate   dd-mm\t: Process only data logged until that date\n",
      "\t\t\t\t  Default : today if endtime specified, else unused\n",
      "\t-starttime hh:mm:ss\t: Process only data logged from that time (default time : 0:00:00)\n",
      "\t-endtime   hh:mm:ss\t: Process only data logged until that time (default time : current time)\n",
      "\t-duration  seconds\t: Work only on specified duration.\n",
      "\t\t\t\t  To be used with start XOR end{time/date}, obviously not with both.\n",
      "\t\t\t\t  Default : unused\n",
      "\t\t\t\t  Default if no other time switch : process n seconds until current time.\n",
      "\tWhy no year in dates input? Just because there is no year reported in postfix mail logs\n",
      "\tThis will obviously cause time filter problems around new year!\n",
      "\t-agglo-recipients\t: Old spamstats counting. One count by mail ID, not by actual recipient.\n",
      "\t\t\t\t  EXIM users WANT to set this for now!\n";
}

sub unify($$);

sub unify($$)
#Converts (value, unit) into a more human readable expression.
#Input : a volume and its unit ("bytes", "kbytes", "Mbytes", "Gbytes" or "Tbytes")
#Output : the list (volume, unit), the volume being divided by 1024 and the
#         unit raised one step for as long as the result stays above 5.
#An unknown unit is returned unchanged together with its volume.
{
   my ($volume, $unit) = @_;
   my @scale = ("bytes", "kbytes", "Mbytes", "Gbytes", "Tbytes");
   my ($rank) = grep { $scale[$_] eq $unit } 0 .. $#scale;
   defined $rank or return ($volume, $unit);
   #Stop at the largest known unit, or when one more step would give 5 or less
   while (($rank < $#scale) and (($volume / 1024) > 5))
   {
      $volume = $volume / 1024;
      $rank++;
   }
   return ($volume, $scale[$rank]);
}

sub check_date($)
#Checks given date is correct (expected format string: "d[d]-m[m]" or "none").
#The day is checked against the length of the month; 29-2 is accepted because
#the year is unknown.
#Returns 0 if correct, 1 if not.
{
   my $date = shift @_;
   $date eq 'none' and return 0;
   unless ($date =~ /^(\d{1,2})-(\d{1,2})$/)
   {
      return 1;
   }
   my $day = int($1); my $month = int($2);
   my @month_length = (31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31);
   if (($month < 1) or ($month > 12))
   {
      return 1;
   }
   if (($day < 1) or ($day > $month_length[$month - 1]))
   {
      return 1;
   }
   return 0;
}

sub check_time($)
#Checks given time is correct (expected format string: "h[h]:mm:ss" or "none").
#Hours go from 0 to 23; "24:00:00" is still accepted as an end-of-day mark,
#but no other time in hour 24 is.
#Returns 0 if correct, 1 if not.
{
   my $time = shift @_;
   $time eq 'none' and return 0;
   unless ($time =~ /^(\d{1,2}):(\d{2}):(\d{2})$/)
   {
      return 1;
   }
   my $hour = $1; my $minute = $2; my $second = $3;
   if (($hour == 24) and ($minute == 0) and ($second == 0))
   {
      return 0;
   }
   unless (($hour < 24) and ($minute < 60) and ($second < 60))
   {
      return 1;
   }
   return 0;
}

sub convert_date_time_to_epoch($$)
#Input : date "d[d]-m[m]", time "h[h]:mm:ss"
#Output : Pseudo epoch (no year included in input) : number of seconds counted
#         from the start of a non-leap year, so that results can be compared
#         and subtracted from one another.
#Returns -1 in case of trouble (unparsable date or time, month outside 1..12).
#This function will ALWAYS be BUGGY around new year days.
#It is also one day off after Feb 29 in a leap year.
#Both are due to the year not being logged.
{
   my ($date, $time) = @_;
   unless ($date =~ /^(\d{1,2})-(\d{1,2})$/)
   {
      return -1;
   }
   #int() removes a leading 0 if there is one
   my $day = int($1); my $month = int($2);
   unless ($time =~ /^(\d{1,2}):(\d{2}):(\d{2})$/)
   {
      return -1;
   }
   my $hour = int($1); my $minute = int($2); my $second = int($3);
   if (($month < 1) or ($month > 12))
   {
      return -1;
   }
   my @month_length = (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31);
   #Days elapsed before the given month starts. Adding only the length of the
   #month itself would rank a February date before the same day in January.
   my $days_before = 0;
   foreach my $past_month (1 .. $month - 1)
   {
      $days_before += $month_length[$past_month - 1];
   }
   return (($days_before + $day) * 24 * 3600 + $hour * 3600 + $minute * 60 + $second);
}

sub fill_zeros($)
#Input : a reference to a list of numbers
#Output : same numbers list, each preceded by a zero if it is a single digit.
#The referenced list itself is modified too.
{
   my $list = shift @_;
   foreach my $item (@$list)
   {
      #$item is an alias on the list element, so the list is updated in place
      $item =~ /^\d$/ and $item = "0".$item;
   }
   return @$list;
}

#Command line parsing. Log files can be given with -file (deprecated) or as
#plain arguments after the options.
my @infiles = ();
GetOptions( "file=s"  => \@infiles,
            "number=i"   => \$number,
            "help"  => \$help,
            "debug"  => \$debug,
            "noabsolute" => \$noabsolute,
            "nogeneral" => \$nogeneral,
            "html" => \$html,
            "startdate=s" => \$startdate,
            "enddate=s" => \$enddate,
            "starttime=s" => \$starttime,
            "endtime=s" => \$endtime,
            "duration=s" => \$duration,
            "agglo-recipients" => \$agglo_rcpt);

#Whatever GetOptions left in @ARGV is taken as a log file name
push @infiles, @ARGV if (@ARGV);
my $defmaillog = '/var/log/mail.log';
#Fall back on the default log file only when no file was asked for at all
push @infiles, $defmaillog if ($#infiles == -1 && -f $defmaillog);
foreach my $fn (@infiles) {
    if (-f $fn) {
        $infile{$fn} = 1;
    }
    else {
        #Say so instead of dropping the file silently : an empty file list
        #otherwise produces a report full of zeros without any explanation
        print STDERR "$fn : File does not exist or is not a regular file, skipping it!\n";
    }
}


if ($help)
{
   Print_Usage();
   exit 0;
}

#Sanity checks
#Each entry is : validator, value to check, name used in the error message
foreach my $check ([\&check_date, $startdate, "start date"],
                   [\&check_date, $enddate,   "end date"],
                   [\&check_time, $starttime, "start time"],
                   [\&check_time, $endtime,   "end time"])
{
   my ($validator, $value, $label) = @$check;
   next if ($validator->($value) == 0);
   print STDERR "Bad input format $label was entered\n";
   $error++;
}
unless ($duration=~/^\d+$/)
{
   print STDERR "Bad input : duration is supposed to be numeric\n";
   $error++;
}

#In HTML mode the tag names carry their closing ">", 'starttag' and 'endtag'
#provide the opening "<" or "</"
$html and
%html_tags=('br'=>'br>','b'=>'b>','i'=>'i>','html'=>'html>','body'=>'body>','endtag'=>'</','starttag'=>'<');

foreach my $file (sort keys %infile)
{
   #NOTE(review): this pattern is not anchored, so it only requires ONE allowed
   #character somewhere in the name and rejects next to nothing. Anchoring it
   #as is would refuse names containing "-" or "_", so decide which characters
   #are really allowed before tightening it.
   unless ($file =~ /[a-zA-Z\.\/ \\0-9]+/)
   {
      die "Illegal characters read in parameter file name!\n";
   }

   unless (-f $file)
   {
      print STDERR "$file : File does not exist!\n";
      $error ++;
   }
   if (($file !~/^\//) and ($noabsolute == 0))
   {
      print STDERR "$file : Path to file must be absolute, or you must specify the \"-noabsolute\" option\n";
      $error ++;
   }
}

#A bound of the time filter is only tested when its date or its time was given
if (($starttime eq "none") and ($startdate eq "none"))
{
        $skipstarttest = 1;
}
if (($enddate eq "none") and ($endtime eq "none"))
{
        $skipendtest = 1;
}

if (($duration>0) and ($skipstarttest == 0) and ($skipendtest == 0))
{
        print STDERR "Input redundancy : You may not specify starttime, endtime and duration\n";
        $error ++;
}
$error and exit 1;
        

#Missing dates default to today. Only day and month are kept, as mail logs
#carry no year.
my ($today_day, $today_month) = (localtime)[3,4];
my $today = $today_day."-".($today_month + 1);   #localtime counts months from 0
$startdate eq 'none' and $startdate = $today;
$enddate eq 'none' and $enddate = $today;

$starttime eq 'none' and $starttime = '00:00:00';
#Missing end time defaults to the current time, zero padded as hh:mm:ss
$endtime eq 'none' and $endtime = sprintf("%02d:%02d:%02d", (localtime)[2,1,0]);

#Opening tags of the report; prints nothing in plain text mode
print $html_tags{'starttag'}.$html_tags{'html'}.$html_tags{'starttag'}.$html_tags{'body'};
#print "Time filter used : From $startdate $starttime to $enddate $endtime\n";

my $epoch_start = convert_date_time_to_epoch($startdate, $starttime);
my $epoch_end = convert_date_time_to_epoch($enddate, $endtime);

if ($duration > 0)
{
   #The bound that was not given is derived from the other one. With no bound
   #given at all, the duration is counted back from the end (current time).
   if ($skipstarttest == 1)
   {
        $epoch_start = $epoch_end - $duration;
        $duration = 0;
        $skipstarttest = 0;
        $skipendtest = 0;
   }
   elsif ($skipendtest == 1)
   {
        $epoch_end = $epoch_start + $duration;
        $duration = 0;
        $skipstarttest = 0;
        $skipendtest = 0;
   }
}
#print "DEBUG : $epoch_start to $epoch_end\n";

$epoch_start > $epoch_end and print STDERR "WARNING : time filter seems incorrect : it starts after it ends! $epoch_start > $epoch_end\n";

#Month abbreviations found in syslog time stamps
my %mounths =
("Jan"=>1,"Feb"=>2,"Mar"=>3,"Apr"=>4,"May"=>5,"Jun"=>6,"Jul"=>7,"Aug"=>8,"Sep"=>9,"Oct"=>10,"Nov"=>11,"Dec"=>12);

#State of the file currently read
my $is_gz = 0;
my $gz;
#NOTE(review): this lexical hides the $gzerrno exported by Compress::Zlib, so
#an error message built from it stays empty.
my $gzerrno;

#Number of spams / clean messages per recipient address
my %spam = ();
my %clean = ();

my %mailer_table = ();   #mailer queue id => message id
my %spamd_table = ();    #message id => "spam" or "clean"
my %spamd_pid = ();      #spamd pid => message id being checked

#Totals over all spamd verdicts
my $spam_score = 0;
my $clean_score = 0;
my $spam_time = 0;
my $clean_time = 0;
my $basic_spam_nb = 0;
my $basic_clean_nb = 0;

my $spam_volume = 0;
my $clean_volume = 0;

my $incorrect_lines = 0; #Count unparsable lines
my $correct_lines = 0;   #Count parsable lines


#Processing
FILELOOP: foreach my $file (keys %infile)
#Reads every input file line by line, plain or gzipped, and fills :
#  %mailer_table / %spamd_pid / %spamd_table  (correlation tables)
#  %spam / %clean                             (counts per recipient)
#  the $basic_*, $*_score, $*_time and $*_volume totals
#The mailer type is detected again for each file.
{
   undef $mailerlogtype;
   my $first_date = "";
   my $last_date = "";
   my $line;
   my $linetime = 0;
   my $fh;
   #Message id stored for exim messages logged without any id
   my $no_id_marker = "I_have_no_id_за:-(";   #Hope this will never be a real id ...
   $is_gz = 0;
   if ($file=~/\.gz$/) #We have a gz file
   {
      #The package variable is named in full because a file-level "my $gzerrno"
      #hides the one exported by Compress::Zlib
      $gz = gzopen($file, "r") or die "Cannot open $file : $Compress::Zlib::gzerrno\n";
      $is_gz = 1;
   }
   else
   {
      #Three argument open : the file name can never be taken as a mode or a command
      open($fh, '<', $file) or die "Unable to open file $file : $!\n";
   }

   while (1)
   {
      my $got_line = $is_gz ? ($gz->gzreadline($line) > 0)
                            : defined($line = <$fh>);
      last unless $got_line;

      #Lines without a syslog style time stamp are only counted
      unless ($line =~ /^\s*([a-zA-Z]{3}\s+\d+\s+\d+:\d+:\d+)\s+\S+\s+/)
      {
         $incorrect_lines ++;
         next;
      }
      my $stamp = $1;
      #The first line naming a known mailer decides the log type of this file
      if ((not defined $mailerlogtype)
          and ($line =~ /^\s*[a-zA-Z]{3}\s+\d+\s+\d+:\d+:\d+\s+\S+\s+(exim|postfix|sendmail)/))
      {
         $mailerlogtype = $1;
      }

      $correct_lines ++;
      if ($first_date eq "") {$first_date = $stamp;}
      $last_date = $stamp;

      #Time filter : only computed when at least one bound is in use
      if (($skipstarttest * $skipendtest) == 0)
      {
         if ($stamp =~ /^([a-zA-Z]{3})\s+(\d+)\s+(\d+:\d+:\d+)$/)
         {
            my $month = $mounths{$1};
            #An unknown month name gives -1, as an unparsable date does
            $linetime = defined $month ? convert_date_time_to_epoch($2."-".$month,$3) : -1;
         }
      }
      next if (($skipstarttest != 1) and ($linetime < $epoch_start));
      next if (($skipendtest != 1) and ($linetime > $epoch_end));

      #Here is Mailer analysis section. Spamd analysis is below.
      #We are not running this code unless we know which mailer we are having:
      if (defined $mailerlogtype)
      {
         #Email IN : remember which message id travels under this queue id
         if ($line =~ /$Defs{'mailer_in'}{$mailerlogtype}/)
         {
            my ($queue_id, $message_id) = ($2, $3);
            if (defined $mailer_table{$queue_id})
            {
               delete $mailer_table{$queue_id};
               if ($debug)
               {
                  print $html_tags{'starttag'}.$html_tags{'br'};
                  print "INFO: A message \"id\" already existed as $queue_id. Deleted it from mailer_table before renew.\n";
               }
            }
            #Exim specific :-(
            #The third capture is the rest of the line; only a leading "id=..." is used
            if ($mailerlogtype eq 'exim')
            {
               $message_id = ($message_id =~ /^\s*id=(.*)$/) ? $1 : $no_id_marker;
            }
            $mailer_table{$queue_id} = $message_id;
            next;
         }

         #EMAIL SENT : credit the recipient with the verdict spamd gave the message
         if ($line =~ /$Defs{'mailer_out'}{$mailerlogtype}/)
         {
            my ($queue_id, $recipient) = ($1, lc($2));
            if (defined $mailer_table{$queue_id})
            {
               my $message_id = $mailer_table{$queue_id};
               #Exim specific code :-(
               #Without a message id, look for spamd entries whose key contains the exim id
               if (($mailerlogtype eq "exim") and ($message_id eq $no_id_marker))
               {
                  foreach my $key (keys %spamd_table)
                  {
                     #\Q..\E : the id is matched as plain text, never as a pattern
                     next unless ($key =~ /\Q$queue_id\E/);
                     if ($spamd_table{$key} eq "spam")
                     {
                        $spam{$recipient} ++;
                     }
                     elsif ($spamd_table{$key} eq "clean")
                     {
                        $clean{$recipient} ++;
                     }
                  }
               }
               #End exim specific code
               if (defined $spamd_table{$message_id})
               {
                  if ($spamd_table{$message_id} eq "spam")
                  {
                     $spam{$recipient} ++;
                  }
                  elsif ($spamd_table{$message_id} eq "clean")
                  {
                     $clean{$recipient} ++;
                  }
                  #Old counting mode : a message counts once, so forget it after its first delivery
                  if ($agglo_rcpt)
                  {
                     delete $spamd_table{$message_id};
                     delete $mailer_table{$queue_id};
                  }
               }
            }
            elsif ($debug)
            {
               print $html_tags{'starttag'}.$html_tags{'br'}.$html_tags{'starttag'}.$html_tags{'b'};
               print "CRITICAL : Warning : Mailer delivered a message it never received? id $queue_id";
               print $html_tags{'endtag'}.$html_tags{'b'}."\n";
            }
            next;
         }

         #spamd starts checking a message : remember which id this spamd pid works on
         if ($line =~ /$Defs{'spamd_in'}{$mailerlogtype}/)
         {
            $spamd_pid{$1} = $2;
            next;
         }
      }

      #spamd verdict, clean message or SPAM FOUND. Both are handled alike :
      #the verdict is stored under the message id this pid was working on, the
      #pid is released and the global totals are updated.
      foreach my $verdict ("clean", "spam")
      {
         next unless ($line =~ /$Defs{'spamd_'.$verdict}/);
         my ($pid, $score, $seconds, $bytes) = ($1, $2, $3, $4);
         if (defined $spamd_pid{$pid})
         {
            $spamd_table{$spamd_pid{$pid}} = $verdict;
            delete ($spamd_pid{$pid});
         }
         elsif ($debug)
         {
            print $html_tags{'starttag'}.$html_tags{'br'}.$html_tags{'starttag'}.$html_tags{'b'};
            print "CRITICAL : spamd sent an answer for a message it did not receive? pid $pid";
            print $html_tags{'endtag'}.$html_tags{'b'}."\n";
         }
         if ($verdict eq "clean")
         {
            $basic_clean_nb++;
            $clean_score = $clean_score + $score;
            $clean_time = $clean_time + $seconds;
            $clean_volume = $clean_volume + $bytes;
         }
         else
         {
            $basic_spam_nb++;
            $spam_score = $spam_score + $score;
            $spam_time = $spam_time + $seconds;
            $spam_volume = $spam_volume + $bytes;
         }
         last;
      }
   }

   #End of file reached : close it and report the period it covers
   if ($is_gz)
   {
      $gz->gzclose();
   }
   else
   {
      close $fh;
   }
   print $html_tags{'starttag'}.$html_tags{'br'};
   print "File $file : from $first_date to $last_date\n";
}

#Recipients grouped by the number of spams they received (count => list of addresses)
my %stats = ();
foreach my $key (keys %spam)
{
   push @{$stats{$spam{$key}}}, $key;
}

unless ($nogeneral)
{
   #General stats, all based on the verdicts logged by spamd.
   #Markup pieces; all of them are empty strings in plain text mode.
   my $br       = $html_tags{'starttag'}.$html_tags{'br'};
   my $bold_on  = $html_tags{'starttag'}.$html_tags{'b'};
   my $bold_off = $html_tags{'endtag'}.$html_tags{'b'};
   my $total_nb = $basic_spam_nb + $basic_clean_nb;

   #Prints a count with its share of all checked messages and returns that
   #share (0 when nothing was checked). The test is made on the divisor
   #itself, so an empty log gives "n/a" instead of a division by zero.
   my $print_count = sub
   {
      my ($label, $count) = @_;
      my $percent = 0;
      print $br;
      print $bold_on;
      if ($total_nb > 0)
      {
         $percent = 100 * $count / $total_nb;
         printf("%-40s:%10d (%6.2f%%)\n", $label, $count, $percent);
      }else{
         printf("%-40s:%10s\n", $label, "n/a");
      }
      print $bold_off;
      return $percent;
   };

   #Prints sum/count with two decimals, or "n/a" when count is zero
   my $print_average = sub
   {
      my ($label, $sum, $count, $suffix) = @_;
      print $br;
      printf("%-40s:", $label);
      print $bold_on;
      if ($count > 0)
      {
         printf("%10.2f", $sum / $count);
      }else{
         print "n/a";
      }
      print $bold_off;
      print $suffix;
   };

   #Prints a volume already scaled by unify(), followed by its unit
   my $print_volume = sub
   {
      my ($label, $volume, $unit) = @_;
      print $br;
      printf("%-40s:", $label);
      print $bold_on;
      printf("%10d", $volume); print " ";
      print $bold_off.$unit."\n";
   };

   print $br.$br;
   print "Total number of emails processed by the spam filter : ".$bold_on.$total_nb.$bold_off."\n";
   $spam_percent  = $print_count->("Number of spams", $basic_spam_nb);
   $clean_percent = $print_count->("Number of clean messages", $basic_clean_nb);

   $print_average->("Average message analysis time", $spam_time + $clean_time, $total_nb, " seconds\n");
   $print_average->("Average spam analysis time", $spam_time, $basic_spam_nb, " seconds\n");
   $print_average->("Average clean message analysis time", $clean_time, $basic_clean_nb, " seconds\n");

   #Spam with multiple recipients count only as one in the averages...
   $print_average->("Average message score", $spam_score + $clean_score, $total_nb, "\n");
   $print_average->("Average spam score", $spam_score, $basic_spam_nb, "\n");
   $print_average->("Average clean message score", $clean_score, $basic_clean_nb, "\n");

   my $unit;
   ($spam_volume,$unit)=unify($spam_volume,"bytes");
   $print_volume->("Total spam volume", $spam_volume, $unit);
   ($clean_volume,$unit)=unify($clean_volume,"bytes");
   $print_volume->("Total clean volume", $clean_volume, $unit);
}

#Top spammed addresses
if ($number)
{
   print $html_tags{'starttag'}.$html_tags{'br'};
   print "Recipients with highest number of spams : (top $number)\n";
   #Number of addresses still to be listed. The list stops exactly there,
   #even in the middle of a group of recipients sharing the same count.
   my $remaining = $number;
   COUNT: foreach my $key (sort {$b <=> $a} keys %stats)
   {
        $remaining <= 0 and last;
        print $key." spams : \n";
        #Sorted so that the report is the same from one run to the next
        foreach my $email (sort @{$stats{$key}})
        {
                print "\t".$email."\n";
                $remaining--;
                $remaining <= 0 and last COUNT;
        }
   }
}

#More than one unparsable line for ten parsable ones (or no parsable line at
#all) : the input is probably not a mail log
if (($correct_lines == 0) or (($incorrect_lines / $correct_lines) > 0.1))
{
   print $html_tags{'starttag'}.$html_tags{'br'};
   print $html_tags{'starttag'}.$html_tags{'br'};
   print "INFO: It seems at least one input file contains other things that {exim/postfix} or spamd lines!\n";
}

#Closing tags of the report; prints nothing in plain text mode
print $html_tags{'endtag'}.$html_tags{'body'};
print $html_tags{'endtag'}.$html_tags{'html'};

Reply via email to