#!/usr/bin/perl
use strict;
require 5.006;

use Mail::SpamAssassin;
use Mail::SpamAssassin::ArchiveIterator;
use Algorithm::SVM 0.12;
use Algorithm::SVM::DataSet;
use Getopt::Long;

# SA-Train.pl V0.27: training a full model for SpamAssassin
# (C) 2005-06 by Alexander K. Seewald <alex@seewald.at>
# This information and newest version always available at
#   alex.seewald.at/spam.html
#
# SA-Train is described in the following paper. By default, the
# script uses the training method "simple", which yields similar
# results and runs much faster.
#
# Seewald A.K.: An Evaluation of Naive Bayes Variants
#   in Content-Based Learning for Spam Filtering. Technical
#   Report, Österreichisches Forschungsinstitut für Artificial
#   Intelligence, Wien, TR-2005-20, 2005
#
# Option -S emulates roughly what the SA development team uses
# to arrive at their score set. Option -B is a simple benchmark
# approach which just trains the Naive Bayes model and outputs
# static scores for the BAYES_* rules. The latter seems to work
# surprisingly well.
#
# How mails should be collected for this tool at an institute
# - i.e. generating a model for a large(r) number of users
# * empty all files where SpamAssassin puts automatically
#   recognized spam mails (possibly backup first, so no
#   misclassified mails get lost)
# * ask all institute members to collect spam that comes
#   through, into separate mailboxes (e.g. spam)
# After 1-2 weeks, put all SA-recognized mails and members-
# collected mails into one mailbox per user. Check this
# mailbox extensively for misclassified mails!
# Use SA-Collect.pl script to get the same number of ham mails
# from non-spam mailboxes. This ensures a Spam-to-Ham ratio of
# 1.0. Again, check this mailbox extensively for misclassifications!
# Train via -x 2, 5 or 10 - depending on how long you can
# wait. ;-)  Afterwards, full training (default, -x 0/1)
# will give user_prefs and bayes_* files for SpamAssassin.
# You will need at least 500 mails - the more, the better.
# SA-Train at OFAI (www.ofai.at) uses around 50,000 mails
# and offers excellent performance for 6-12 months.
#
# Bugs, Feature extension requests, comments and everything
# else to alex@seewald.at


# Flush STDOUT after every write so that progress dots and status lines
# appear immediately.
$|=1;

# Command line settings and their defaults.
my $verbose=0;
my $spamFolders=undef;
my $hamFolders=undef;
my $dontRemoveMarkup=0;
my $folds=1;    # for CV; default: just train everything
my $testfold=1; # test fold
my $complexity=1.0;
my $bayesOnly=0;
my $smoOnly=0;
my $progress=0; # output dots for progress
my $outputScoreFile=undef; # output full scores for CV into this file
my $outputDataFile=undef; # output full datafile (data+y)
my $outputDataTFile=undef; # output full dataTestfile (data+y)
my $outputWeightsFile=undef; # output weights+bias (w+m_b)
my $help=0; # output help

my $result = GetOptions ( "v" => \$verbose,
                          "spam=s" => \$spamFolders,
                          "ham=s"  => \$hamFolders,
                          "n" => \$dontRemoveMarkup,
                          "x=n" => \$folds,
                          "c=f" => \$complexity,
                          "B" => \$bayesOnly,
                          "S" => \$smoOnly,
                          "h" => \$help,
                          "progress" => \$progress,
                          "o=s" => \$outputScoreFile,
                          "d=s" => \$outputDataFile,
                          "t=s" => \$outputDataTFile,
                          "w=s" => \$outputWeightsFile );

# The command line is unusable when
#  - GetOptions reported a problem (unknown option, malformed value),
#  - the fold count or the complexity parameter is out of range,
#  - a spam or ham mailbox list is missing,
#  - -B and -S are combined, or
#  - a test data file (-t) is requested without cross-validation.
my $badUsage = !$result
            || $folds < 0 || $folds > 10
            || $complexity <= 0
            || !defined($spamFolders) || !defined($hamFolders)
            || ($bayesOnly && $smoOnly)
            || ($folds<=1 && $outputDataTFile);

if ($help || $badUsage) {
  print "Usage: SA-Train.pl [-spam mbox1,mbox2,..] [-ham mboxA,mboxB,..] [-v] [-p] [-n] [-x folds] [-c lambda] [-o scorefile]\n  [-d datafile] [-w weightsfile] [-B|-S]\n\n",
        "-spam  comma-separated list of spam mailboxes\n",
        "-ham   comma-separated list of ham mailboxes\n",
        "-v     verbose - output status messages\n",
        "-p     progress - output dots for each step\n",
        "-n     do not remove SpamAssassin markup\n",
        "-x #f  number of folds for CV (default: full training)\n",
        "-c #l  lambda complexity parameter for linear SVM (default 1.0)\n",
        "       (try values between 0.001 and 1000 to get better ham error)\n",
        "-B     Only train NaiveBayes, use static weights on BAYES_* rules\n",
        "-S     Only train SMO, do not train any mails via NaiveBayes\n",
        "-o fn  output full scores into filename\n",
        "-d fn  output data file into filename\n",
        "-t fn  output test data file into filename (only for CV)\n",
        "-w fn  output weights+bias file into filename\n",
        "-h     Output this help (usage)\n",
        "\nOn a recent machine, a mailbox with 77286 mails and fivefold CV\n",
        "(-x 5) takes about 19h and 480M of main memory.\n",
        "\n";
  # Explicitly requested help is a success; anything else is a usage error.
  exit($help ? 0 : 1);
}

# ham/spam folder filenames are assumed to be comma-separated
my @sF = split(/,/,$spamFolders);
my @hF = split(/,/,$hamFolders);

# Concatenate all spam mailboxes into one temp file and all ham mailboxes
# into another, optionally piping each through "spamassassin -dx" to
# remove markup, and count the lines starting with "From " on the way.
my $sCnt=0; my $hCnt=0;
my $txt = ($dontRemoveMarkup ? "C" : "Remove markup and c");
print $txt."reate temp files\n" if $verbose;

my ($sTmp,$hTmp);
foreach my $job ( [ \@sF, \$sTmp, \$sCnt ], [ \@hF, \$hTmp, \$hCnt ] ) {
  my ($files,$tmpRef,$cntRef) = @{$job};

  # NOTE(review): presumably secure_tmpfile returns nothing usable on
  # failure - confirm against the installed Mail::SpamAssassin::Util.
  my ($tmpName,$handle) = Mail::SpamAssassin::Util::secure_tmpfile();
  if (!defined($tmpName) || !defined($handle)) {
    unlink $sTmp if defined($sTmp);
    die "Cannot create temporary file";
  }
  $$tmpRef = $tmpName;

  foreach my $file (@{$files}) {
    # NOTE(review): the mailbox name is handed to the shell unquoted, so
    # shell metacharacters in it are interpreted. Left as is because
    # callers may rely on shell wildcards in mailbox names.
    my $cmd = "zcat -f $file ".($dontRemoveMarkup ? "" : "| spamassassin -dx ");
    my $in;
    if (!open($in,"-|",$cmd)) {
      my $err = $!;
      unlink grep { defined($_) } ($sTmp,$hTmp);
      die "Cannot run '$cmd': $err";
    }
    while (my $line = <$in>) {
      $$cntRef++ if $line =~ /^From /;
      print {$handle} $line;
    }
    print {$handle} "\n";
    # A failing pipeline is reported but not fatal; the mail count check
    # below still catches input that ended up empty.
    close($in) or warn "Warning: '$cmd' did not finish cleanly (status $?)\n";
  }

  if (!close($handle)) {
    my $err = $!;
    unlink grep { defined($_) } ($sTmp,$hTmp);
    die "Cannot write temporary file $tmpName: $err";
  }
}

# NOTE(review): the check uses 220 while the message says 200 - confirm
# which number is intended.
if (!$smoOnly && ($sCnt<220 || $hCnt<220)) {
  if (defined($sTmp)) { unlink $sTmp; }
  if (defined($hTmp)) { unlink $hTmp; }
  die "Need at least 200 ham mails and 200 spam mails for any method except Only train SMO (-S)";
}

# initialize SA
print "Initialize SpamAssassin\n\n" if $verbose;

# set rules_filename, site_rules_filename, and userprefs_filename
# Candidate directories, tried in order; the first one that contains
# *.cf or *.pre files wins.
my @rules = ( '/usr/share/spamassassin', '/usr/local/share/spamassassin' );
my @site_rules = ( '/usr/etc/mail/spamassassin', '/usr/etc/spamassassin', '/usr/local/etc/spamassassin', '/usr/pkg/etc/spamassassin', '/etc/mail/spamassassin', '/etc/spamassassin' );

my @fs=(); my $dir;
while (@rules && !@fs) {
  $dir = shift @rules;
  @fs = (glob("$dir/*.cf"),glob("$dir/*.pre"));
}
# Test what was found, not what is left to try: a hit in the last
# candidate directory leaves the candidate list empty.
if (not @fs) { die "No rulesets found."; }
my $rules_filename = $dir;

my @fs2=();
while (@site_rules && !@fs2) {
  $dir = shift @site_rules;
  @fs2 = (glob("$dir/*.cf"),glob("$dir/*.pre"));
}
if (not @fs2) { die "No site rulesets found."; }
my $site_rules_filename = $dir;

# The Bayes database is kept in the current directory under a name that
# carries this process id.
my $path=`pwd`; chomp $path; $path.="/bayes".$$;
my $config_text="bayes_auto_learn 0\nuse_auto_whitelist 0\nauto_whitelist_factor 0\nbayes_path $path\n";
if ($smoOnly) { $config_text.="use_bayes 0\nuse_bayes_rules 0\n"; }

# Temporary user preferences file handed to SpamAssassin below.
my $prefsFile = "up.$$";
open(my $prefsFh,'>',$prefsFile) or die "Cannot create $prefsFile: $!";
print {$prefsFh} $config_text;
close($prefsFh) or die "Cannot write $prefsFile: $!";

my $sa = Mail::SpamAssassin->new( { dont_copy_prefs => 1, userstate_dir => "/tmp", local_tests_only => 1 , rules_filename => $rules_filename, site_rules_filename => $site_rules_filename, userprefs_filename => $prefsFile } );
$sa->init(1);

# Collect the names of all tests that have a score in any of the four
# score sets.
my %allTests=();
for (my $i=0; $i<=3; $i++) {
  foreach my $test (keys %{$sa->{conf}->{scoreset}->[$i]}) {
    $allTests{$test}=1;
  }
}

# ---------------------------------------------------------------------
# Helpers used only by the training / cross-validation driver below.
# ---------------------------------------------------------------------

# Opens $path for writing and returns the handle; dies when that fails.
sub _openOut {
  my ($path) = @_;
  open(my $fh,'>',$path) or die "Cannot open $path for writing: $!";
  return $fh;
}

# Prints the two status lines about the mails handed to / learned by
# NaiveBayes training. Arguments are in the order trainBayes returns them.
sub _printBayesCounts {
  my ($sFcnt,$sLearned,$hFcnt,$hLearned) = @_;
  print "\n$sFcnt spams, $hFcnt hams submitted to NaiveBayes training (SH ratio = ",($hFcnt==0 ? "inf" : sprintf("%.1f",$sFcnt/$hFcnt)),")\n";
  print "$sLearned spams, $hLearned hams actually learned (SH ratio = ",($hLearned==0 ? "inf" : sprintf("%.1f",$sLearned/$hLearned)),")\n";
}

# Static model for -B: each BAYES_* rule is weighted with the midpoint of
# its probability interval, the bias is 0.5.
# Returns (ref to test-name => index hash, ref to weight array, bias).
# The rule names might change for SA releases.
sub _staticBayesModel {
  my @names  = qw(BAYES_00 BAYES_05 BAYES_20 BAYES_40 BAYES_50 BAYES_60 BAYES_80 BAYES_95 BAYES_99);
  my @bounds = (0.00,0.01,0.05,0.20,0.40,0.60,0.80,0.95,0.99,1.00);
  my (%test2idx,@weights);
  for my $i (0..$#names) {
    $test2idx{$names[$i]} = $i;
    push @weights,($bounds[$i]+$bounds[$i+1])/2;
  }
  return (\%test2idx,\@weights,0.5);
}

# Turns a test-name => index hash into a ref to the reverse mapping.
sub _invertIndex {
  my ($test2idx) = @_;
  my %idx2test;
  foreach my $k (keys %{$test2idx}) {
    $idx2test{$$test2idx{$k}}=$k;
  }
  return \%idx2test;
}

# Writes one line per mail: score, true label and - when $bayesProb is
# given - the Bayes probability, tab-separated.
sub _writeScores {
  my ($path,$tset,$scores,$bayesProb) = @_;
  my $fh = _openOut($path);
  for (my $i=0; $i<@{$tset}; $i++) {
    print {$fh} $$scores[$i],"\t",$$tset[$i]->label();
    print {$fh} "\t",$$bayesProb[$i] if defined($bayesProb);
    print {$fh} "\n";
  }
  close($fh) or die "Cannot write $path: $!";
}

# Writes a tab-separated table: a header with the test names and
# "true_class", then one row of $maxI attribute values plus label per mail.
sub _writeData {
  my ($path,$tset,$idx2test,$maxI) = @_;
  my $fh = _openOut($path);
  for (my $i=0; $i<$maxI; $i++) {
    print {$fh} $$idx2test{$i},"\t";
  }
  print {$fh} "true_class\n";
  foreach my $ds (@{$tset}) {
    print {$fh} join("\t",$ds->asArray($maxI)),"\t",$ds->label(),"\n";
  }
  close($fh) or die "Cannot write $path: $!";
}

# Writes two tab-separated lines: test names plus "bias", then the
# weights plus the bias value.
sub _writeWeights {
  my ($path,$weights,$m_b,$idx2test) = @_;
  my $fh = _openOut($path);
  for (my $i=0; $i<@{$weights}; $i++) {
    print {$fh} $$idx2test{$i},"\t";
  }
  print {$fh} "bias\n";
  for (my $i=0; $i<@{$weights}; $i++) {
    print {$fh} $$weights[$i],"\t";
  }
  print {$fh} $m_b,"\n";
  close($fh) or die "Cannot write $path: $!";
}

# ---------------------------------------------------------------------
# Driver: full training (-x 0/1) or cross-validation (-x 2..10).
# ---------------------------------------------------------------------

if ($folds<=1) {
  print "Training full model.\n" if $verbose;
  if (!$smoOnly) {
    print "Start trainBayes with spam file $sTmp, ham file $hTmp " if $verbose;
    # folds=1, testfold=1: no mail is held out, everything is trained
    my @counts=trainBayes($sa,$sTmp,$hTmp,1,1);
    _printBayesCounts(@counts) if $verbose;
  }

  my ($test2idx,$weights,$m_b);
  if (!$bayesOnly) {
    print "Running SpamAssassin tests " if $verbose;
    my ($tset,$maxI);
    ($tset,$test2idx,$maxI) = runSATrain($sa,$sTmp,$hTmp,1,1);
    print "\n" if $verbose;

    my $n = @{$tset};
    print "Running SMO on $n rows and $maxI columns with lambda=$complexity " if $verbose;
    ($weights,$m_b)=trainSMO($tset,$complexity,$maxI);
    print "\n" if $verbose;

    my ($hErr,$sErr,$full_scores)=chkError($tset,$weights,$m_b);
    print "Training set error: ham=",sprintf("%.3f%%",$hErr*100),", spam=",sprintf("%.3f%%",$sErr*100)," (should be almost zero)\n";

    my $idx2test = _invertIndex($test2idx);
    _writeScores($outputScoreFile,$tset,$full_scores,undef) if $outputScoreFile;
    _writeData($outputDataFile,$tset,$idx2test,$maxI) if $outputDataFile;
    _writeWeights($outputWeightsFile,$weights,$m_b,$idx2test) if $outputWeightsFile;
  } else {
    # static weights for onlyBayes
    ($test2idx,$weights,$m_b) = _staticBayesModel();

    print "Running SpamAssassin tests " if $verbose;
    # folds=1, testfold=0: every mail is part of the evaluated set
    my ($tsetT,$bayesProbT) = runSATest($sa,$sTmp,$hTmp,$test2idx,1,0,$bayesOnly);
    print "\n" if $verbose;
    my ($hErr,$sErr,$full_scores)=chkError($tsetT,$weights,$m_b);
    print "Training set error: ham=",sprintf("%.3f%%",$hErr*100),", spam=",sprintf("%.3f%%",$sErr*100)," (should be almost zero)\n";
    _writeScores($outputScoreFile,$tsetT,$full_scores,$bayesProbT) if $outputScoreFile;
  }
  createUserPrefs($weights,$m_b,$test2idx,$path,\%allTests,$bayesOnly,$smoOnly,($bayesOnly ? 10 : 1e+13));

  print "user_prefs$$".($smoOnly ? " has" : " and bayes$$\_* have")." been created in the current directory.\n";
  print "Please move to correct directory and remove $$ postfix.\n";
  print "Modify or remove bayes_path in the last line of user_prefs as appropriate.\n" unless ($smoOnly);
} else {
  # do CV!
  print "\thErr\tsErr\n" if not $verbose;
  my @hErr; my @sErr;
  for (my $testfold=0; $testfold<$folds; $testfold++) {
    print "Fold $testfold:";
    print "\n" if $verbose;
    if (!$smoOnly) {
      print "Start trainBayes with spam file $sTmp, ham file $hTmp " if $verbose;
      my @counts=trainBayes($sa,$sTmp,$hTmp,$folds,$testfold);
      _printBayesCounts(@counts) if $verbose;
    }

    my ($weights,$m_b,$test2idx,$maxI,$idx2test);
    if (!$bayesOnly) {
      print "Running SpamAssassin tests (training set)" if $verbose;
      # $tset lives only in this branch so the training rows are released
      # before the test fold is scored
      my $tset;
      ($tset,$test2idx,$maxI) = runSATrain($sa,$sTmp,$hTmp,$folds,$testfold);
      $idx2test = _invertIndex($test2idx);
      _writeData($outputDataFile.".$testfold",$tset,$idx2test,$maxI) if $outputDataFile;
      print "\n" if $verbose;
      my $n = @{$tset};
      print "Running SMO on $n rows and $maxI columns with lambda=$complexity " if $verbose;
      ($weights,$m_b)=trainSMO($tset,$complexity,$maxI);
      print "\n" if $verbose;
    } else {
      # static weights for onlyBayes; column count and names are set too,
      # so that a -t test data file gets a proper header and row width
      ($test2idx,$weights,$m_b) = _staticBayesModel();
      $maxI = @{$weights};
      $idx2test = _invertIndex($test2idx);
    }

    _writeWeights($outputWeightsFile.".$testfold",$weights,$m_b,$idx2test) if ($outputWeightsFile && !$bayesOnly);

    print "Running SpamAssassin tests (test set)" if $verbose;
    my ($tsetT,$bayesProbT) = runSATest($sa,$sTmp,$hTmp,$test2idx,$folds,$testfold,$bayesOnly);
    print "\n" if $verbose;
    _writeData($outputDataTFile.".$testfold",$tsetT,$idx2test,$maxI) if $outputDataTFile;

    my ($hErr,$sErr,$full_scores)=chkError($tsetT,$weights,$m_b);
    _writeScores($outputScoreFile.".$testfold",$tsetT,$full_scores,($bayesOnly ? $bayesProbT : undef)) if $outputScoreFile;

    print "\t",sprintf("%.3f%%",$hErr*100),"\t",sprintf("%.3f%%",$sErr*100),"\n" if not $verbose;
    print "Fold $testfold:\t",sprintf("%.3f%%",$hErr*100),"\t",sprintf("%.3f%%",$sErr*100),"\n" if $verbose;
    push @hErr,$hErr; push @sErr,$sErr;
  }
  print "\n" if $verbose;
  my ($hE,$hEs)=stdDev(@hErr);
  my ($sE,$sEs)=stdDev(@sErr);
  if ($verbose) {
    # repeat the per-fold results as one table
    print "\thErr\tsErr\n";
    for my $fold (0..$#hErr) {
      print "Fold $fold:\t";
      print sprintf("%.3f%%",$hErr[$fold] * 100),"\t",sprintf("%.3f%%",$sErr[$fold] * 100),"\n";
    }
  }
  print "-\t-\t-\n";
  print "Avg.\t",sprintf("%.3f%%",$hE*100),"\t",sprintf("%.3f%%",$sE*100),"\n";
  print "StD.\t",sprintf("%.3f%%",$hEs*100),"\t",sprintf("%.3f%%",$sEs*100),"\n";

  # the Bayes database of the last fold is of no further use
  unlink "bayes$$\_seen";
  unlink "bayes$$\_toks";
  unlink "bayes$$\_journal";
}

# remove the temporary mailboxes and the temporary preferences file
if (defined($sTmp)) { unlink $sTmp; }
if (defined($hTmp)) { unlink $hTmp; }
unlink "up.$$";


# Subroutines

sub trainBayes {
  # Trains the NaiveBayes model on all mails outside the test fold.
  # Arguments: ref. to initialized spamassassin, pathname to spam mbox
  # file, pathname to ham mbox file, no. of folds, fold to be used for
  # testing.
  # Returns (spams submitted, spams learned, hams submitted, hams learned).
  my ($sa,$sTmp,$hTmp,$folds,$testfold) = @_;

  # just to be on the safe side
  unlink "bayes$$\_seen";
  unlink "bayes$$\_toks";
  unlink "bayes$$\_journal";

  my %submitted = ( spam => 0, ham => 0 );
  my %learned   = ( spam => 0, ham => 0 );

  # for now, always clear NB model before training
  $sa->{bayes_scanner}->{store}->clear_database();

  $sa->init_learner( { caller_will_untie => 0 } );

  foreach my $kind ('spam','ham') {
    my $isSpam = ($kind eq 'spam') ? 1 : 0;
    my $mbox   = $isSpam ? $sTmp : $hTmp;
    my $seen   = 0;   # position of the current mail within this mbox

    my $walker = Mail::SpamAssassin::ArchiveIterator->new( { 'opt_j' => 0, 'opt_n' => 1, 'opt_all' => 1 } );
    $walker->set_functions(
      sub {
        # mails whose position falls into the test fold are skipped
        if ($seen % $folds != $testfold) {
          $submitted{$kind}++;
          my ($class,$id,$time,$dataref) = @_;
          my $mail   = Mail::SpamAssassin->parse($dataref);
          my $status = $sa->learn($mail,undef,$isSpam,0);
          $learned{$kind}++ if $status->did_learn()==1;
          $status->finish();
          $mail->finish();
          print "." if $progress;
        }
        $seen++;
      },
      sub { } );

    # during spam training(>5x): Parsing of undecoded UTF-8 will give
    # garbage when decoding entities at ..SpamAssassin/HTML.pm
    # line 182.
    # -> ignore
    eval { $walker->run($kind.":mbox:".$mbox) };
  }

  $sa->rebuild_learner_caches();
  $sa->finish_learner();

  return ($submitted{spam},$learned{spam},$submitted{ham},$learned{ham});
}

sub runSATrain {
  # Runs the SpamAssassin tests on all mails outside the test fold and
  # builds the training set: one data set object per mail, labelled "+1"
  # for spam and "-1" for ham, with attribute value 1 for every test hit.
  # Test names get column indices in order of first appearance.
  # Returns (ref. to the @tset array, ref. to the test-name => index hash,
  # number of distinct tests seen).
  my ($sa,$sTmp,$hTmp,$folds,$testfold) = @_;

  my (%test2idx,@tset);
  my $nextIdx = 0;

  foreach my $spec ( [ "spam", "+1", $sTmp ], [ "ham", "-1", $hTmp ] ) {
    my ($kind,$label,$mbox) = @{$spec};
    my $seen = 0;   # position of the current mail within this mbox

    my $walker = Mail::SpamAssassin::ArchiveIterator->new( { 'opt_j' => 0, 'opt_n' => 1, 'opt_all' => 1 } );
    $walker->set_functions(
      sub {
        if ($seen % $folds != $testfold) {
          my ($class,$id,$time,$dataref) = @_;
          my $mail   = Mail::SpamAssassin->parse($dataref);
          my $status = $sa->check($mail);
          my $row    = Algorithm::SVM::DataSet->new(Label => $label, Data => []);
          foreach my $name (split(/,/,$status->get_names_of_tests_hit())) {
            if (!defined($test2idx{$name})) {
              $test2idx{$name} = $nextIdx++;
            }
            $row->attribute($test2idx{$name},1);
          }
          push @tset,$row;

          $status->finish();
          $mail->finish();
          print "." if $progress;
        }
        $seen++;
      },
      sub { } );
    # errors raised while iterating are ignored
    eval { $walker->run($kind.":mbox:".$mbox) };
  }

  my $maxI = keys %test2idx;

  return (\@tset,\%test2idx,$maxI);
}

sub runSATest {
  # Runs the SpamAssassin tests on the mails of the test fold only.
  # Builds one data set object per mail ("+1" spam, "-1" ham); only tests
  # already present in %$test2idx are recorded, unknown tests are dropped.
  # With $bayesOnly set, the bayes_score field of each status object is
  # collected as well.
  # Returns (ref. to the @tset array, ref. to the array of Bayes scores).
  my ($sa,$sTmp,$hTmp,$test2idx,$folds,$testfold,$bayesOnly) = @_;

  my (@tset,@bayesProb);

  foreach my $spec ( [ "spam", "+1", $sTmp ], [ "ham", "-1", $hTmp ] ) {
    my ($kind,$label,$mbox) = @{$spec};
    my $seen = 0;   # position of the current mail within this mbox

    my $walker = Mail::SpamAssassin::ArchiveIterator->new( { 'opt_j' => 0, 'opt_n' => 1, 'opt_all' => 1 } );
    $walker->set_functions(
      sub {
        if ($seen % $folds == $testfold) {
          my ($class,$id,$time,$dataref) = @_;
          my $mail   = Mail::SpamAssassin->parse($dataref);
          my $status = $sa->check($mail);
          my $row    = Algorithm::SVM::DataSet->new(Label => $label, Data => []);
          foreach my $name (split(/,/,$status->get_names_of_tests_hit())) {
            next unless defined($$test2idx{$name});
            $row->attribute($$test2idx{$name},1);
          }
          push @tset,$row;

          push @bayesProb,$status->{bayes_score} if $bayesOnly;

          $status->finish();
          $mail->finish();
          print "." if $progress;
        }
        $seen++;
      },
      sub { } );
    # errors raised while iterating are ignored
    eval { $walker->run($kind.":mbox:".$mbox) };
  }

  return (\@tset,\@bayesProb);
}


sub trainSMO {
  # Trains a linear C-SVC on the given training set and reads the model
  # back as explicit weights.
  #   $tset - ref. to array of Algorithm::SVM::DataSet objects
  #   $cx   - complexity parameter C (1.0 when undefined)
  #   $maxI - number of attributes (columns)
  # Returns (ref. to array of $maxI weights, bias).
  my ($tset,$cx,$maxI) = @_;
  $cx = 1.0 unless defined($cx); # def. complexity parameter

  my $svm = Algorithm::SVM->new(Type => 'C-SVC', Kernel => 'linear', C => $cx);
  $svm->train(@{$tset});

  # Probe the model: first with no attribute set, then with exactly one
  # attribute set to 1 at a time. The difference between the two
  # predicted values is taken as that attribute's weight.
  my $probe  = Algorithm::SVM::DataSet->new(Label => '+1', Data => []);
  my $origin = $svm->predict_value($probe);
  my @weights_p;
  foreach my $col (0 .. $maxI-1) {
    $probe->attribute($col,1);
    $probe->attribute($col-1,0) if $col>0;   # clear the previous column
    push @weights_p,$svm->predict_value($probe)-$origin;
  }

  # the bias is returned with inverted sign
  return (\@weights_p,-$origin);
}

sub chkError {
  # check training/test set error
  #   $tset    - ref. to array of data set objects; each must offer
  #              asArray($n) (attribute values 0..$n-1) and label()
  #              (label -1 = ham, any other label = spam)
  #   $weights - ref. to array of attribute weights
  #   $m_b     - bias; score of a mail = sum(value*weight) - $m_b
  # A mail with score >= 0 is predicted to be spam.
  # Returns (ham error rate, spam error rate, ref. to array of scores).
  # A class without any mail in $tset gets error rate 0 instead of
  # causing a division by zero.
  my ($tset,$weights,$m_b) = @_;

  # sort weights by absolute numeric value ascending
  # (to get rid of numerical instability problems)
  # positive and non-positive terms are kept apart; key -1 stands for
  # the bias term -$m_b
  my %wp; my %wn;
  for (my $j=0; $j<@{$weights}; $j++) {
    if ($$weights[$j]>0) {
      $wp{$j}=$$weights[$j];
    } else {
      $wn{$j}=$$weights[$j];
    }
  }
  if (-$m_b>0) { $wp{-1}=-$m_b; } else { $wn{-1}=-$m_b; }
  my @permP = sort { abs($wp{$a}) <=> abs($wp{$b}) } keys %wp;
  my @permN = sort { abs($wn{$a}) <=> abs($wn{$b}) } keys %wn;

  my @scores; my $hCnt=0; my $sCnt=0; my $hErr=0; my $sErr=0;
  my $maxI=@{$weights};
  foreach my $ds (@{$tset}) {
    my @vals=$ds->asArray($maxI);

    # sum positive and non-positive terms separately, smallest first
    my $sp=0;
    foreach my $j (@permP) {
      if ($j==-1) { $sp-=$m_b; } else {
        $sp+=$vals[$j]*$$weights[$j];
      }
    }
    my $sn=0;
    foreach my $j (@permN) {
      if ($j==-1) { $sn-=$m_b; } else {
        $sn+=$vals[$j]*$$weights[$j];
      }
    }
    my $s=$sp+$sn;
    push @scores,$s;

    if ($ds->label()==-1) {
      $hCnt++;
      $hErr++ if $s>=0;   # ham classified as spam
    } else {
      $sCnt++;
      $sErr++ if $s<0;    # spam classified as ham
    }
  }

  my $hRate = ($hCnt ? $hErr/$hCnt : 0);
  my $sRate = ($sCnt ? $sErr/$sCnt : 0);
  return ($hRate, $sRate, \@scores);
}

sub stdDev {
  # Returns (mean, sample standard deviation) of the numbers passed in.
  # No numbers: (undef, "inf"); a single number: (that number, 0).
  my ($count,$sum,$sumSq) = (0,0,0);
  foreach my $value (@_) {
    $count++;
    $sum   += $value;
    $sumSq += ($value*$value);
  }

  return (undef,"inf")    if $count==0;
  return ($sum/$count,0)  if $count==1;

  # sample variance; clamped at zero because rounding can push it
  # slightly below
  my $variance = ($sumSq/($count-1))-2*$sum/$count*$sum/($count-1)+($sum/$count)*($sum/$count)*$count/($count-1);
  $variance = 0 if $variance<0;

  return ($sum/$count,sqrt($variance));
}

sub createUserPrefs {
  # Writes the file user_prefs<pid> into the current directory.
  #   $weights    - ref. to array of weights, indexed via $test2idx
  #   $m_b        - bias; written (times $factor) as required_hits
  #   $test2idx   - ref. to hash test name => index into @$weights
  #   $bayes_path - value of the bayes_path line (omitted with $smoOnly)
  #   $allTests   - ref. to hash of all known test names; every test that
  #                 gets a score from $weights is DELETED from this hash,
  #                 the remaining ones are written with score 0 unless
  #                 their name starts with "__"
  #   $bayesOnly  - not used here
  #   $smoOnly    - when true, no bayes_path line is written
  #   $factor     - all weights and the bias are multiplied by this
  # Dies when the file cannot be created or written. Score lines are
  # written in sorted order so that the file is reproducible.
  my ($weights,$m_b,$test2idx,$bayes_path,$allTests,$bayesOnly,$smoOnly,$factor) = @_;

  my $prefsName = "user_prefs$$";
  open(my $out,'>',$prefsName) or die "Cannot create $prefsName: $!";

  print {$out} <<USER_PREFS;
##############################################################################
# SpamAssassin user preferences file.  See 'perldoc Mail::SpamAssassin::Conf'
# for details of what can be tweaked.
#
# Created by SA-Train.pl (C) 2005-06 Alexander K. Seewald <alex\@seewald.at>
##############################################################################

# How many hits before a mail is considered spam.
USER_PREFS

  print {$out} "required_hits\t",$m_b*$factor,"\n\n";

  print {$out} <<USER_PREFS;
# switch off auto-learn... learn only from controlled data!
bayes_auto_learn      0
use_auto_whitelist    0
auto_whitelist_factor 0

# Rewrite subject
rewrite_subject   0
# report_safe off (i.e. mail is exactly the same, except for header)
report_safe 0
report_header 1
use_terse_report 1
defang_mime 0

# Whitelist and blacklist addresses are now file-glob-style patterns, so
# "friend\@somewhere.com", "*\@isp.com", or "*.domain.net" will all work.
# whitelist_from  someone\@somewhere.com

# Add your own customised scores for some tests below.  The default scores are
# read from the installed spamassassin rules files, but you can override them
# here.  To see the list of tests and their default scores, go to
# http://spamassassin.org/tests.html .
#
# score SYMBOLIC_TEST_NAME n.nn
USER_PREFS

  # tests that are part of the model get their scaled weight ...
  foreach my $test (sort keys %{$test2idx}) {
    print {$out} "score $test ",$$weights[$$test2idx{$test}]*$factor,"\n";
    delete $$allTests{$test};
  }
  # ... all other known tests are switched off
  foreach my $test (sort keys %{$allTests}) {
    print {$out} "score $test 0\n" unless ($test =~ /^__/);
  }
  print {$out} "\nbayes_path\t",$bayes_path,"\n" unless $smoOnly;

  close($out) or die "Cannot write $prefsName: $!";
}
