#!/usr/bin/perl -w
use strict;
use POSIX qw(strftime);
use Time::Local;

$|=1;

use Mail::SpamAssassin;
use Getopt::Long;

# SA-Collect.pl V0.1: collecting ham mails for SA-Train.pl
# (C) 2005 by Alexander K. Seewald <alex@seewald.at>
# This information and newest version always available at
#   alex.seewald.at/spam.html
#
# This script collects a set of most recent mails from
# mailfolder directories. Only mbox folders (both compressed
#  and uncompressed) are supported. Use this to get a sample
# of ham mails which is as large as your collected spam mailbox.
# More details see SA-Train.pl
#
# Bugs, Feature extension requests, comments and everything
# else to alex@seewald.at.

my %mon2num = ( "Jan" => 0,
                "Feb" => 1,
                "Mar" => 2,
                "Apr" => 3,
                "May" => 4,
                "Jun" => 5,
                "Jul" => 6,
                "Aug" => 7,
                "Sep" => 8,
                "Oct" => 9,
                "Nov" => 10,
                "Dec" => 11 );

my $help=0;
my $verbose=0;
my $exclude="inbox,templates,drafts,trash,sent,unsent messages,junk,spam";
my $n=undef;
my $compressed=0;

# option for random selection instead of most recent??

my $result = GetOptions ( "v" => \$verbose,
                          "x=s" => \$exclude,
                          "n=n" => \$n,
                          "z" => \$compressed,
                          "help" => \$help );

if ($#ARGV == -1 || $help || !defined($n)) {
  print "Usage: SA-Collect.pl [maildirs] [-x excl1,excl2,..] -n #count [-h] [-v] [-z] > all.mbx\n";
  print "  Select #count most recent mails from all folders in maildirs\n";
  print "  The combined mailbox is output on stdout. It makes sense to run\n";
  print "  stdout through spamassassin -dx, so you can run SA-Train.pl -n\n";
  print "maildirs  gives a set of mail directory to search through.\n";
  print "-x ..     exclude these comma-separated mail folders.\n";
  print "  (default: inbox,templates,drafts,trash,sent,unsent messages,junk,spam )\n";
  print "  (This is a reasonable for collecting ham mails from Mozilla mail folders)\n";
  print "-n count  the desired number of mails to output (must be given)\n";
  print "-z        also consider compressed mail folders.\n";
  print "-v        verbose output. Verbose messages are sent to stderr.\n";
  print "-h        show this help (usage)\n";
  exit(0);
}

my @excludes = split(/,/,$exclude);

open (FILE,'find '.join(" ",@ARGV).' -type f -print |');
my @mailboxes = <FILE>;
close(FILE);
print STDERR $#mailboxes+1," files found. Checking for mailboxes... " if $verbose;

my @mbxs; my $exc=0;
foreach my $mailbox (@mailboxes) {
  chomp $mailbox; my $isExc=0;
  my $mailbox_=$mailbox;
  if ($compressed) {
    $mailbox_ =~ s/\.Z$//;
    $mailbox_ =~ s/\.gz$//;
  }
  foreach my $exc (@excludes) {
    if (length($mailbox_)>=length($exc) && substr($mailbox_,length($mailbox_)-length($exc),length($exc)) eq $exc) { $isExc=1; last; }
  }
  if (!$isExc) {
    my $fn = $mailbox;
    if ($compressed) { $fn = "zcat -f $mailbox |"; }
    if (open(FILE,$fn)) {
      $_=<FILE>; if (defined($_) && /^From /) { push @mbxs,$mailbox; }
      close(FILE);
    } 
  } else { $exc++; }
}

print STDERR $#mbxs+1," suitable mailboxes found ($exc excluded).\n" if $verbose;
if ($#mbxs == -1) { die "No suitable mailboxes found."; }

@mailboxes=@mbxs; @mbxs=();

# now, collect the most recent mails from all mailboxes
my $cnt=0; my @mails; my $first=1; my $allCnt=0;
foreach my $mailbox (@mailboxes) {
  my $fn=$mailbox;
  if ($compressed) { $fn = "zcat -f $mailbox |"; }
  open(FILE,$fn); my $mail="";
  while (<FILE>) {
    my $cline=$_;
    if (/^From / && $mail ne "") {
      my $t=mail2time($mail);
      if (defined($t)) {
        if ($cnt<$n) {  # fill buffer first
          push @mails,[$t,$mail];
          $cnt++;
        } else { # now, $cnt==$n
          if ($first) {  # sort buffer
            my @tmp = sort { $a->[0] < $b->[0] } @mails;
            @mails = @tmp; @tmp=(); $first=0;
          }
          if ($t > $mails[0]->[0]) {  # ignore if too old
            my $pos=1;
            while ($pos<$n && $t > $mails[$pos]->[0]) { $pos++; }
            splice @mails,0,1; # remove first element, eq. shift @mails;
            splice @mails,$pos-1,0,[$t,$mail]; # insert new element before $pos
            if (@mails != $n) { die "This should not happen."; }
          }
        }
      }
      $mail=""; $allCnt++;
    }
    $mail.=$cline;
  }
  close(FILE);
}

# output all mails to stdout
foreach my $m (@mails) {
  print $m->[1],"\n";
}

print STDERR "$n of $allCnt mails sampled (",sprintf("%.1f%%)\n",$n/$allCnt*100) if $verbose;

sub mail2time {
  my $mail = shift @_;

  my @lines = split(/\n/,$mail);
  my $dateF=undef; my $date=undef;
  while (@lines) {
    $_ = shift @lines;
    if (/^From / && !defined($dateF)) {
      my @z=split(/  */,$_); shift @z; shift @z;
      $dateF=join(" ",@z); chomp $dateF; $first=0;
    }
    if (/^Received:/ && !defined($date)) {
      my $x=$_; $_=shift @lines;
      while (/^[ \t]/) { $x.=$_; $_=shift @lines; }
      my @lines2=split(/\n/,$x);
      my $lastline=pop @lines2;
      if ($lastline =~ /\;/) { (my $k, $lastline)=split(/\;/,$lastline,2); }
      $date=$lastline; $date =~ s/^[ \t]//g; $date =~ s/[ \t]$//g;
      if ($date =~/[\x00-\x1F\x80-\xFF]/) { $date=undef; }
    }
  }

  my $timeF=undef; $timeF=header2time($dateF) unless (!defined($dateF));
  my $time;
  if (defined($date)) {
    $time=header2time($date);
  } else {
    $time=undef;
  }
  if (!defined($time) || $timeF<$time || ($timeF-$time)>3600*24*14) {
    $time=$timeF;
  }

  return $time;
}

sub header2time {
  my $header=shift @_;
  $header=~ s/^[ \t]//g;
  $header=~ s/[ \t]$//g;
  my @d=split(/[ \t][ \t]*/,$header);
  my ($sec,$min,$hours,$mday,$mon,$year);
  my $leap="";
  if ($d[0] =~ /^[a-zA-Z]*,$/) {
    $mday=$d[1];
    $mon=$mon2num{$d[2]};
    $year=$d[3];
    ($hours,$min,$sec)=split(/:/,$d[4],3);
    shift @d; shift @d; shift @d; shift @d; shift @d;
    $leap=join(" ",@d);
  } else {
    if ($d[0] =~ /^[a-zA-Z]*$/) { shift @d; }
    foreach my $x (@d) {
      if (defined($mon2num{$x})) { $mon=$mon2num{$x}; }
      elsif ($x =~ /^[0-9][0-9][0-9][0-9]$/) { $year=$x; }
      elsif ($x =~ /^[1-9][0-9]*$/) { $mday=$x; }
      elsif ($x =~ /^[0-9][0-9]\:[0-9][0-9]\:[0-9][0-9]$/) {
        ($hours,$min,$sec)=split(/:/,$x,3);
      } else { $leap.=$x." "; }
    }
  }
  my $t=timelocal($sec,$min,$hours,$mday,$mon,$year-1900);
  return $t;
}
