#!/usr/bin/perl

use strict;
use warnings;

use Getopt::Long;
use Pod::Usage;
use File::Path;

my ( $help, $man );
my $verbose = 0;
my $input_training_file;
my $input_testing_file;
my $model_dir;
my $feature_dir;
my $result_dir;
my $do_stemming;
my $vocab_file;
my $log_odds_cutoff = 0;
my $feature_choice = "mi";

GetOptions(
	   'help|?' => \$help,
	   man      => \$man,
	   'verbose+' => \$verbose,
	   'input-training-file=s' => \$input_training_file,
	   'input-testing-file=s' => \$input_testing_file,
	   'model-dir=s' => \$model_dir,
	   'feature-dir=s' => \$feature_dir,
	   'result-dir=s' => \$result_dir,
	   'stem' => \$do_stemming,
	   'vocab-file=s' => \$vocab_file,
	   'log-odds-cutoff=s' => \$log_odds_cutoff,
           'feature-type=s' => \$feature_choice,
  ) or pod2usage(2);


pod2usage(1) if ($help);
pod2usage( -exitstatus => 0, -verbose => 2 ) if ($man);
pod2usage("-input-training-file or -input-testing-file required!\n")
    unless($input_training_file or $input_testing_file);
pod2usage("only one of -input-training-file and -input-testing-file allowed!\n")
    if($input_training_file and $input_testing_file);
pod2usage("-model-dir required!\n")
    unless($model_dir);
pod2usage("-feature-dir required!\n")
    unless($feature_dir);
pod2usage("-vocab-file required when testing!\n")
    if($input_testing_file and not $vocab_file);
pod2usage("-result-dir required when testing!\n")
    if($input_testing_file and not $result_dir);

$vocab_file = "$feature_dir/vocab" unless($vocab_file);

# print start time
system("date");

# read data
my ($ref, $text);
if($input_training_file) {
    ($ref, $text) = preprocess($input_training_file);
}
else {
    ($ref, $text) = preprocess($input_testing_file);
}

# stem words
stem($text) if($do_stemming);

# run word counting
my $counts = count_words($text);

# run feature generation
make_mi_features($counts,$ref,$vocab_file,$feature_dir);

if($input_training_file) {
    # train svms
    build_svms($model_dir,"$feature_dir/train.*");
}
else {
    # test svms
    classify_testset($model_dir, "$feature_dir/testset", $ref, $result_dir);
}

# print finish time
system("date");

########################################
######## Sub routines
########################################

sub preprocess {
    my ($infile, $outfile, $keyfile) = @_;

    open(my $infh, $infile)      || die "Could not open $infile\n";

    my @text; my @ref;
    while(<$infh>) {
	chomp;

	#remove leading whitespace
	$_ =~ s/^\s*//;

	#remove whitespace around delimeter
	$_ =~ s/\s*\|\s*/|/g;

	my @line = split(/\|/, $_);
	if($#line != 1) {
	    print STDERR "Found $#line columns in: ",  join(" | ", @line), "\n";
	    print STDERR "Expected only 2 columns (with delimiter '|')\n";
	    next;
	}

	push @ref, $line[0];

	# clean up text (no quotes)
	$line[1] =~ tr/A-Z/a-z/;
	$line[1] =~ y/a-z0-9/ /cs;
	$line[1] =~ s/^\s+//;

	my @words = split(/\s+/, $line[1]);
	push @text, \@words;
    }

    return (\@ref, \@text);
}

## Porter word stemming (code from doc2mat.pl, CLUTO preprocessing script from George Karypis karypis@cs.umn.edu)

sub stem {
    my ($textlines) = @_;

    #==============================================================================
    # Setup the data-structures for the stemmer and initialize it
    #==============================================================================
    my %step2list = ('ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance', 
		  'izer'=>'ize', 'bli'=>'ble', 'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 
		  'ousli'=>'ous', 'ization'=>'ize', 'ation'=>'ate', 'ator'=>'ate', 
		  'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful', 'ousness'=>'ous', 
		  'aliti'=>'al', 'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log');

    my %step3list = ('icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic', 'ical'=>'ic', 
		  'ful'=>'', 'ness'=>'');

    my $c =    "[^aeiou]";          # consonant
    my $v =    "[aeiouy]";          # vowel
    my $C =    "${c}[^aeiouy]*";    # consonant sequence
    my $V =    "${v}[aeiou]*";      # vowel sequence

    my $mgr0 = "^(${C})?${V}${C}";               # [C]VC... is m>0
    my $meq1 = "^(${C})?${V}${C}(${V})?" . '$';  # [C]VC[V] is m=1
    my $mgr1 = "^(${C})?${V}${C}${V}${C}";       # [C]VCVC... is m>1
    my $_v   = "^(${C})?${v}";                   # vowel in stem

    my @stem_params = (\%step2list, \%step3list, $c, $v, $C, $V, $mgr0, $meq1, $mgr1, $_v);

    for my $line (@{$textlines}) {
	for(my $n=0; $n<@{$line}; $n++) {
	    $line->[$n] = stem_word($line->[$n], @stem_params);
	}
    }
}

sub stem_word {
    my ($word, $step2list, $step3list, $c, $v, $C, $V, $mgr0, $meq1, $mgr1, $_v) = @_;

    my ($stem, $suffix, $firstch);
    my $w = $word;
    if (length($w) < 3) { return $w; } # length at least 3
    # now map initial y to Y so that the patterns never treat it as vowel:
    $w =~ /^./; $firstch = $&;
    if ($firstch =~ /^y/) { $w = ucfirst $w; }

    # Step 1a
    if ($w =~ /(ss|i)es$/) { $w=$`.$1; }
    elsif ($w =~ /([^s])s$/) { $w=$`.$1; }
    # Step 1b
    if ($w =~ /eed$/) { if ($` =~ /$mgr0/o) { chop($w); } }
    elsif ($w =~ /(ed|ing)$/) {
	$stem = $`;
	if ($stem =~ /$_v/o) {
	    $w = $stem;
	    if ($w =~ /(at|bl|iz)$/) { $w .= "e"; }
	    elsif ($w =~ /([^aeiouylsz])\1$/) { chop($w); }
	    elsif ($w =~ /^${C}${v}[^aeiouwxy]$/o) { $w .= "e"; }
	}
    }
    # Step 1c
    if ($w =~ /y$/) { $stem = $`; if ($stem =~ /$_v/o) { $w = $stem."i"; } }
    
    # Step 2
    if ($w =~ /(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/) {
	$stem = $`; $suffix = $1;
	if ($stem =~ /$mgr0/o) { $w = $stem . $step2list->{$suffix}; }
    }
    
    # Step 3
    if ($w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/) {
	$stem = $`; $suffix = $1;
	if ($stem =~ /$mgr0/o) { $w = $stem . $step3list->{$suffix}; }
    }

    # Step 4
    if ($w =~ /(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/)
    { $stem = $`; if ($stem =~ /$mgr1/o) { $w = $stem; } }
    elsif ($w =~ /(s|t)(ion)$/)
    { $stem = $` . $1; if ($stem =~ /$mgr1/o) { $w = $stem; } }

    #  Step 5
    if ($w =~ /e$/) {
	$stem = $`;
	if ($stem =~ /$mgr1/o or
	    ($stem =~ /$meq1/o and not $stem =~ /^${C}${v}[^aeiouwxy]$/o))
	{ $w = $stem; }
    }
    if ($w =~ /ll$/ and $w =~ /$mgr1/o) { chop($w); }

    # and turn initial Y back to y
    if ($firstch =~ /^y/) { $w = lcfirst $w; }
    return $w;
}

sub count_words {
    my ($textlines) = @_;

    my @counts;
    for my $line (@{$textlines}) {
	my @line = @{$line};
	my %words;
	my $num_words = @line;

	for(my $n=0;$n<@line;$n++) {
	    $words{$line[$n]} ++;
	}

	$words{'!numwords!'} = $num_words;
	push @counts, \%words;
	#print $outfh "!numwords!:$num_words ";
	#foreach my $word (keys %words) {
	    #print $outfh "$word:$words{$word} ";
	#}
	#print $outfh "\n";
    }
    return \@counts;
}

#
# make_mi_features -- make mutual information features
#
sub make_mi_features {
    my ($counts, $ref, $vocab_file, $outdir) = @_;

    my (%categories);
    print "Get key\n" if ($verbose);
    get_key($ref, \%categories);
    mkpath([$outdir]);

    # collect the word frequency data

    my @all_words;
    my %word_ids;
    my @id_words;
    my @my_lines;
    my @my_line_cats;
    my $vocabfh;
    if($input_testing_file) {
	open($vocabfh, "$vocab_file")
	    or die "Could not open $vocab_file\n"; 
	read_vocab($vocabfh, \@all_words, \%word_ids);
    }
    else {
	if(-f $vocab_file) {
	    warn "Vocab $vocab_file already exists, overwriting!\n";
	}
	open($vocabfh, ">$vocab_file")
	    or die "Could not open $vocab_file\n"; 
    }

    # open output file(s) for features
    my %my_files = create_files(\%categories, $outdir);
    print "Collect frequency\n" if($verbose);
    collect_freq($counts, $vocabfh, \@all_words, \%word_ids, \@id_words, \@my_lines, \%my_files);

    # output our features
    if(not $input_testing_file) {
	print "Output fetures\n" if($verbose);
	output_features(\@my_lines, $ref, \%my_files, \@all_words, $log_odds_cutoff);
    }
}


sub get_key {
    my ($ref, $categories) = @_;
    for my $cat (@{$ref}) {
	if(exists $categories->{$cat}) {
	    $categories->{$cat} += 1;
	}
	else {
	    print "Add category $cat\n" if($verbose);
	    $categories->{$cat} = 1;
	}
    }
}

sub create_files {
    my ($categories, $out_dir) = @_;
    my %my_files;
    my $test_fh;
    my $testkey_fh;
    if($input_testing_file) {
	open($test_fh, ">$out_dir/testset")
	    or die "Could not open $out_dir/testset: $!\n";
	open($testkey_fh, ">$out_dir/key.testset")
	    or die "Could not open $out_dir/key.testset: $!\n";
    }

    # assign a feature output file for each category
    #  if we are testing, it will all be the same file
    foreach my $cat (keys %{$categories}) {
	print "Found $cat $categories->{$cat}\n" if($verbose);
	unless($input_testing_file) {
	    open(my $fh, ">$out_dir/train.$cat")
	        or die "Could not open file $out_dir/train.$cat\n";
	    open(my $keyfh, ">$out_dir/key_train.$cat")
	        or die "Could not open file $out_dir/key_train.$cat\n";
	    $my_files{$cat} = $fh;
	    $my_files{"key.$cat"} = $keyfh;
	}
	else {
	    $my_files{$cat} = $test_fh;
	    $my_files{"key.$cat"} = $testkey_fh;
	}
    }
    return %my_files;
}

sub collect_freq {
    my ($counts, $vocabfh, $all_words, $word_ids, $id_words, $my_lines, $my_files) = @_;

    my $vocab_cnt=1;
    my $my_example_cnt = 0;
    
    for(my $n=0; $n<@{$counts}; $n++) {
	my $count = $counts->[$n];
	my @my_line;
	my $numwords = $count->{'!numwords!'};
	$my_example_cnt++;
	for my $word (keys %{$count}) {
	    my $count =  $count->{$word};
	    my $word_freq = $count / $numwords;
	    if(not $input_testing_file) {
                unless(exists $word_ids->{$word}) {
		    $all_words->[$vocab_cnt] = $word_freq;
		    $id_words->[$vocab_cnt] = $word;
		    $word_ids->{$word} = $vocab_cnt++;
	        }
	        else {
		    $all_words->[$word_ids->{$word}] += $word_freq;
	        }
	    }
	    push @my_line, [$word_ids->{$word},$word_freq] if(exists $word_ids->{$word});
	}

	# if training, store lines to output per category
	if(not $input_testing_file) {
	    push @{$my_lines}, \@my_line;
	}
	# if testing, output features here (no need to separate into categories)
	else {
	    output_feature_line(\@my_line,$ref->[$n],$my_files,$all_words, $log_odds_cutoff);
	}
    }

    if(not $input_testing_file) {
        #compute vocab averages (these average over examples, and frequencies
        #are not weighted by how many words were in a particular example)
        for(my $n=1; $n<@{$all_words};$n++) {
	    warn "Index $n $all_words->[$n]\n" if($verbose>1);
	    $all_words->[$n] = $all_words->[$n] / $my_example_cnt;
	    print $vocabfh "$id_words->[$n] $word_ids->{$id_words->[$n]} $all_words->[$n]\n";
        }
    }
}

sub output_features {
    my ($my_lines, $ref, $my_files, $all_words, $log_odds_cutoff) =@_;

    for(my $n=0; $n<@{$my_lines}; $n++) {
	my @my_line = @{$my_lines->[$n]};
	output_feature_line(\@my_line,$ref->[$n],$my_files,$all_words, $log_odds_cutoff);
    }
}

sub output_feature_line {
    my ($my_line, $my_cat, $my_files, $all_words, $log_odds_cutoff) =@_;

    my @my_line = @{$my_line};
    my $my_file;
    my $my_key_file;
    if(exists $my_files->{$my_cat}) {
	$my_file = $my_files->{$my_cat};
	$my_key_file = $my_files->{"key.$my_cat"};
    } 
    else {
	die "Could not find my cat '$my_cat'\n";
    }
    @my_line = sort {$a->[0] <=> $b->[0]} @my_line;
    print $my_file "0 " if($input_testing_file); #print dummy class for testing
    for(my $m=0; $m<@my_line;$m++) {
	my ($word_id, $word_freq) = @{$my_line[$m]};
	my $word_log_odds = log($word_freq / $all_words->[$word_id]);
	print $my_file "$word_id:$word_log_odds " if($word_log_odds > $log_odds_cutoff);
    }
    print $my_file "\n";
    print $my_key_file "$my_cat\n";
}

sub read_vocab {
    my ($vocab_fh, $all_words, $word_ids) = @_;
    while(<$vocab_fh>) {
        chomp;
        my @line = split;
        my $word = $line[0];
        my $word_id = $line[1];
        my $word_freq = $line[2];
        $all_words->[$word_id] = $word_freq;
        $word_ids->{$word} = $word_id;
    }
    close($vocab_fh);
}


sub build_svms {
    my ($model_dir, $train_glob) = @_;

    mkpath([$model_dir]);
    my @train_files = glob "$train_glob";

    # read in training feature lines
    my @train_cats;
    my @train_names;
    foreach my $train_file (@train_files) {
	open(my $trainfh, $train_file) || die "Could not open file $train_file\n";
	my @lines;
	while(<$trainfh>) {
	    chomp;
	    push @lines, $_;
	}
	close($trainfh);

	my $cat;
	if($train_file =~ /train\.(.*)$/) {
	    $cat = $1;
	}
	else {
	    die "Could not determine training category name from '$train_file'\n";
	}
	$train_cats[@train_cats] = \@lines;
	$train_names[@train_names] = $cat;
    }


    # build SVM pairwise training files
    for(my $n=0; $n < @train_cats; $n++) {
	for(my $m=$#train_cats; $m>$n; $m--) {
	    print "Build SVM for $train_names[$n] vs $train_names[$m]\n";
	    open(my $bldfh, ">$model_dir/bld")
		or die "Could not open $model_dir/bld\n";
	    my @train_lines1 = @{$train_cats[$n]};
	    my @train_lines2 = @{$train_cats[$m]};
	    my $max_cnt = @train_lines1;
	    $max_cnt = @train_lines2 if(@train_lines2 > @train_lines1);
	    $max_cnt = @train_lines1;
	    for(my $l=0;$l<$max_cnt;$l++) {
		my $index = $l;
		while($index > $#train_lines1) {
		    $index -= $#train_lines1;
		}
		print $bldfh "1 $train_lines1[$index]\n";
	    }
	    $max_cnt = @train_lines2;
	    for(my $l=0;$l<$max_cnt;$l++) {
		my $index = $l;
		while($index > $#train_lines2) {
		    $index -= $#train_lines2;
		}
		print $bldfh "-1 $train_lines2[$index]\n";
	    }
	    close($bldfh);
	    
	    #train svms
	    my $command = "svm_learn $model_dir/bld $model_dir/svm.$train_names[$n].$train_names[$m] > tmp";
	    print "$command\n" if($verbose);
	    system($command) == 0 or 
	    	die "failed to execute: \"$command\" with error: $?\n";
	    unlink "$model_dir/bld";
	}
    }
}


sub classify_testset {
    my ($model_dir, $test_file, $test_ref, $output_dir) = @_;
    my $score_dump = 0;
    mkpath([$output_dir]);

    # get all the model files
    my @models = `ls $model_dir`;

    # run svm classification for each svm model 
    my @results;
    my @all_results;
    print "Found models: ", join(" ", @models), "\n" if($verbose>1);
    
    foreach my $model (@models) {
	chomp $model;
	my $pos_model; my $neg_model;
	if($model =~ /svm\.(.*)\.(.*)$/) {
	    $pos_model = $1;
	    $neg_model = $2;
	}
	else {
	    die "Couldn't match model name '$model'\n";
	}

	print "Run model $model\n";
	my $command="svm_classify $test_file $model_dir/$model $output_dir/$model.results > tmp";
	print "$command\n" if($verbose);
	system($command) == 0 or 
	    	die "failed to execute: \"$command\" with error: $?\n";

	open(my $resultfh, "$output_dir/$model.results")
	    or die "Could not open file $output_dir/$model.results\n";
	my $line_cnt=0;
	while(<$resultfh>) {
	    chomp;
	    my $hash_pointer;
	    unless(defined $results[$line_cnt]) {
		my %hash;
		my %all;
		$results[$line_cnt] = \%hash;
		$all_results[$line_cnt] = \%all if($score_dump);
	    }

	    my $winner;
	    my $loser;
	    my $score;
	    if($_ > 0) {
		$winner = $pos_model;
		$loser = $neg_model;
		$score = $_;
	    }
	    else {
		$winner = $neg_model;
		$loser = $pos_model;
		$score = $_ * -1;
	    }
	
	    # record winning model
	    ${$results[$line_cnt]}{$winner} ++;    
	    # record model pair
	    ${$all_results[$line_cnt]}{"$winner.$loser"} = $score if($score_dump);
	    
	    $line_cnt++;
	}
    }

    # read in test key
#     open(my $keyfh, $test_key) || die "Could not open file $test_key\n";
#     my @key;
#     while(<$keyfh>) {
# 	chomp;
# 	my @line = split;
# 	push @key, \@line;
#     }

    open(my $rankfh, ">$output_dir/rank")
	or die "Could not open $output_dir/rank\n";

    my $scorefh;
    if($score_dump) {
	open($scorefh, ">$output_dir/compare-scores")
	    or die "Could not open $output_dir/compare-scores\n";
    }
    
    die "Different number of lines: $#results result lines and $#{$test_ref} key lines\n"
	unless($#results eq $#{$test_ref});
    

    # collect results
    my %confusion_table;
    my $correct_cnt;
    for(my $n=0; $n<@results; $n++) {
	my $max=-1;
	my $max_model='';
	my @rank;
	foreach my $model (keys %{$results[$n]}) {
	    if(${$results[$n]}{$model} > $max) {
		$max = ${$results[$n]}{$model};
		$max_model = $model;
	    }
	    push @rank, [ $model, ${$results[$n]}{$model} ];
	}
	
	@rank = sort {$b->[1] <=> $a->[1]} @rank;
	print $rankfh "$test_ref->[$n] || ";

	# print out the models in rank order
	for(my $m=0; $m<@rank; $m++) {
	    print $rankfh "$rank[$m]->[0] ";
	}
	print $rankfh "\n";

	#print out scores to compare against winning model
	if($score_dump) {
	    my $first_second = "$rank[0]->[0].$rank[1]->[0]";
	    my $first_third = "$rank[0]->[0].$rank[2]->[0]";
	    my $second_third = "$rank[1]->[0].$rank[2]->[0]";
	    
	    print $scorefh "$test_ref->[$n] | $first_second | ",  
	    get_score($all_results[$n], $first_second),
	    " | $first_third | ",  
	    get_score($all_results[$n], $first_third),
	    " | $second_third | ",  
	    get_score($all_results[$n], $second_third), "\n";
	}

	$correct_cnt++ if($max_model eq $test_ref->[$n]);
	$confusion_table{$test_ref->[$n]}{$max_model} += 1;
    }
    
    print "Accuracy: ", $correct_cnt / @results, "\n";

    my %cat_totals;


    ## print out confusion table of results
    print "\n\nCount of confusion occurances, true classes are\n";
    print "    the first column, predictions are the first row\n";
    
    print "        ", join("    ", sort keys %confusion_table), "\n";
    foreach my $truth (sort keys %confusion_table) {
	printf "% 2s: ", $truth;
	foreach my $prediction (sort keys %confusion_table ) {
	    $confusion_table{$truth}{$prediction} = 0
		unless(exists $confusion_table{$truth} and $confusion_table{$truth}{$prediction});
	    printf "% 5d ", $confusion_table{$truth}{$prediction};
	    $cat_totals{$truth} += $confusion_table{$truth}{$prediction};
	}
	print "\n";
    }

    print "\n\nPercentage relative to number of truth examples\n";
    print "        ", join("    ", sort keys %confusion_table), "\n";
    foreach my $truth (sort keys %confusion_table) {
	printf "% 2s: ", $truth;
	foreach my $prediction (sort keys %confusion_table ) {
	    printf "%0.3f ", $confusion_table{$truth}{$prediction} / $cat_totals{$truth};
	}
	print "\n";
    }

    print "\n\nPercentage of overall examples\n";
    print "        ", join("    ", sort keys %confusion_table), "\n";
    foreach my $truth (sort keys %confusion_table) {
	printf "% 2s: ", $truth;
	foreach my $prediction (sort keys %confusion_table ) {
	    printf "%0.3f ", $confusion_table{$truth}{$prediction} / @results;
	}
	print "\n";
    }
}

# return SVM one-vs-one score
sub get_score {
    my ($results, $pair) = @_;
    if(exists ${$results}{$pair}) {
        return ${$results}{$pair};
    }
    else {
        my ($winner, $loser) = split('\.', $pair);
        my $reverse_pair = "$loser.$winner";
        if(exists ${$results}{$reverse_pair}) {
            return (-1 * ${$results}{$reverse_pair});
        }
        else {
            die "Couldn't find score for pair: $pair or $reverse_pair\n";
        }
    }
}

__END__

=head1 NAME

  run-svm - control script for multiclass svm training with feature generation from text

=head1 SYNOPSIS
  run-svm version 0.1
  Copyright 2004-2007 Dustin Hillard and Stephen Purpura

  Contact us at sp559@cs.cornell.edu if you wish to use this script for commercial purposes.
  Modification and use for non-commercial purposes is allowed, but please cite.

  The authors make no warranties about the applicability of this software to any outcome.

  run-svm [options]

  Options:
     -help                brief help message
     -man                 full documentation
    --verbose             more verbose to STDERR
    --input-training-file file that has data to train on
    --input-testing-file  file that has data to test on
    --model-dir           directory with trained svm models to use in testing
    --output-dir          directory we'll put the files in
    --feature             feature options include {mi, tf, tfidf, tfidfL1, tfidfL2, tfidfpivot}

=head1 OPTIONS

=over

=item B<--help>

=item B<-?>

Show this help message.

=item B<--man>

Show the manual page for this script.

=item B<--verbose>

Repeatable option. Report more of what we're doing.

=back

=head1 DESCRIPTION

This control script front-ends SVMlight to perform multiclass classification using text
as features.

=head2 CAVEATS

(1) This program has only been tested under Linux and MacOS

(2) svm_learn and svm_classify must be in the path.  An example for bash:

	export PATH=$PATH:/home/iuser/svm_scripts

=head2 TO DO

=head1 AUTHORS

Dustin Hillard E<lt>hillard@ee.washington.eduE<gt>

Stephen Purpura E<lt>sp559@cs.cornell.eduE<gt>

=cut
