#!/usr/bin/perl -w

# text entropy calculator
# (c)2003 Stepan Roh
#
# outputs CSV file with input file statistics including entropy
#
# usage: ./assign1a.pl file.txt > file.log

# "precise" sum calculator - adds numbers pairwise (tree reduction)
# to limit floating point rounding error compared to a running total
# in: @numbers
# out: $sum of @numbers (0 for an empty list)
sub sum(@);
sub sum(@) {
  my @vals = @_;
  # repeatedly collapse the list by summing adjacent pairs until
  # a single value remains; odd-length lists are padded with 0
  while (@vals > 1) {
    push (@vals, 0) if (@vals % 2 != 0);
    my @halved = ();
    while (@vals) {
      my $a = shift @vals;
      my $b = shift @vals;
      push (@halved, $a + $b);
    }
    @vals = @halved;
  }
  return @vals ? $vals[0] : 0;
}

# probability sanity check: the given probabilities must add up to 1
# (within a small tolerance for floating point rounding)
# in: $name of the test (used in the error message), @data probabilities
# dies when the sum deviates from 1 by more than 5e-6
sub test_prob($@) {
  my $name = shift;
  my $s = sum (@_);
  return if (($s >= 0.999995) && ($s <= 1.000005));
  die "Testing ", $name, " probability failed: sum is $s\n";
}

# base-2 logarithm, via the identity log2(x) = ln(x) / ln(2)
# in: $number (must be > 0, or log() dies)
# out: $log number in base 2
sub log2($) {
  my $x = shift;
  return log($x) / log(2);
}

# power function
# in: $x, $y
# out: $x raised to the power $y
# uses the built-in ** operator instead of exp($y * log($x)), which
# dies for $x == 0 and produces NaN for negative $x; ** handles both
# (negative bases require an integer exponent) and is equally precise
sub pow($$) {
  my ($x, $y) = @_;
  return $x ** $y;
}

# entropy computation over a word sequence
# in: @text (list of words in original order)
# out: reference to a %hash with
#    'text_size' => text size in words
#    'text_size_in_chars' => text size in characters
#    'chars_per_word' => average number of chars per word
#    'words_num' => number of distinct words
#    'freq_1_words_num' => number of words with frequency 1
#    'most_freq_word' => most frequent word
#    'most_freq_word_count' => frequency of the most frequent word
#    'cond_entropy' => conditional (bigram) entropy
#    'cond_perplexity' => conditional perplexity (2^entropy)
# dies on empty input
# fix: $words_num and $cond_entropy were undeclared package globals
# that leaked state between calls; they are now lexical (my)
sub comp_entropy (@) {
  my @text = @_;

  die "Input empty\n" if (!@text);

  print STDERR ".... computing input statistics\n";

  # input text size in words
  my $text_size = @text;
  # input text size in word pairs (bigrams) - we do not use any artificial start or end characters here
  my $text_size_in_pairs = @text - 1;

  # input text size in characters
  my $text_size_in_chars = 0;
  foreach my $word (@text) {
    $text_size_in_chars += length ($word);
  }

  # hash (word -> count)
  my %word_count = ();
  # number of words which occurred only once in input text
  my $freq_1_words_num = 0;
  # most frequent word
  my $most_freq_word = '';
  # number of occurrences of most frequent word
  my $most_freq_word_count = 0;
  # hash (word -> no of occurrences as first word in pair)
  my %pair_first_count = ();
  # hash (word -> no of occurrences as second word in pair)
  my %pair_second_count = ();
  # hash (word[i-1]_word[i] -> count)
  my %pair_count = ();
  # hash (word -> probability of being first word in pair)
  my %pair_first_prob = ();
  # hash (word -> probability of being second word in pair)
  my %pair_second_prob = ();
  # hash (word[i-1]_word[i] -> joint probability)
  # P(a,b) => key 'a_b'
  my %pair_joint_prob = ();
  # hash (word[i-1]_word[i] -> conditional probability)
  # P(b|a) => key 'a_b'
  my %pair_cond_prob = ();

  foreach my $word (@text) {
    $word_count{$word}++;
  }
  # number of distinct words
  my $words_num = keys %word_count;

  # statistics computation (most and least frequented words)
  foreach my $word (keys %word_count) {
    if ($word_count{$word} > $most_freq_word_count) {
      $most_freq_word = $word;
      $most_freq_word_count = $word_count{$word};
    }
    if ($word_count{$word} == 1) {
      $freq_1_words_num++;
    }
  }

  # frequencies computation
  # all pairs are processed and three frequencies computed:
  #   $pair_first_count: freq. of word occurrence as a first word in pair
  #   $pair_second_count: freq. of word occurrence as a second word in pair
  #   $pair_count: freq. of pair occurrence

  for (my $i = 0; $i < @text - 1; $i++) {
    my $w1 = $text[$i];
    my $w2 = $text[$i + 1];
    my $pair = $w1.'_'.$w2;
    $pair_first_count{$w1}++;
    $pair_second_count{$w2}++;
    $pair_count{$pair}++;
  }

  print STDERR ".... computing probabilities\n";

  # probabilities computation
  # each probability is computed as frequency / data size
  # data size is measured in pairs (one less than text size)

  foreach my $word (keys %pair_first_count) {
    $pair_first_prob{$word} = $pair_first_count{$word} / $text_size_in_pairs;
  }

  test_prob ("first words", values %pair_first_prob);

  foreach my $word (keys %pair_second_count) {
    $pair_second_prob{$word} = $pair_second_count{$word} / $text_size_in_pairs;
  }

  test_prob ("second words", values %pair_second_prob);

  foreach my $pair (keys %pair_count) {
    $pair_joint_prob{$pair} = $pair_count{$pair} / $text_size_in_pairs;
  }

  test_prob ("word pairs joint", values %pair_joint_prob);

  # conditional probability computation
  # - computed as p(j|i) = p(i,j) / p(i);
  # - (notice: pair_cond_prob has key in form 'i_j' and not 'j_i' for p(j|i))
  # - NOTE(review): keys are built as w1.'_'.w2, so this split is ambiguous
  #   when a word itself contains '_' - assumes input words are underscore-free

  foreach my $pair (keys %pair_count) {
    my ($w1) = split ('_', $pair);
    $pair_cond_prob{$pair} = $pair_joint_prob{$pair} / $pair_first_prob{$w1};
  }

  print STDERR ".... computing entropy\n";

  # conditional entropy computation
  # - computed as a sum of weighted logs of each pair's conditional probability
  # - pairs not visible in text are skipped - they would result in zero anyway

  my $cond_entropy = 0;
  foreach my $pair (keys %pair_count) {
    $cond_entropy += $pair_joint_prob{$pair} * log2 ($pair_cond_prob{$pair});
  }
  $cond_entropy = -$cond_entropy;

  return {
    'text_size' => $text_size,
    'text_size_in_chars' => $text_size_in_chars,
    'chars_per_word' => $text_size_in_chars / $text_size,
    'words_num' => $words_num,
    'freq_1_words_num' => $freq_1_words_num,
    'most_freq_word' => $most_freq_word,
    'most_freq_word_count' => $most_freq_word_count,
    'cond_entropy' => $cond_entropy,
    'cond_perplexity' => pow (2, $cond_entropy),
  };
}

# some log (aka result) printing

# hash keys of a comp_entropy() result, in CSV column order
# (must stay in sync with the header line printed below)
@data_keys = (
  'text_size', 'text_size_in_chars', 'chars_per_word', 'words_num',
  'freq_1_words_num', 'most_freq_word', 'most_freq_word_count',
  'cond_entropy', 'cond_perplexity',
);
# print the CSV header as a single line
# (the original backslash-newline inside the double-quoted string embedded
# a literal newline, splitting the header row across two output lines)
print "file, mess type, mess %, pass no, words no, chars no, chars per word, ",
      "distinct words, freq 1 words, most freq word, most freq word count, ",
      "entropy, perplexity\n";

# print one CSV result row to stdout
# in: $file name, $mess type, $mess_pc mess percentage, $pass number,
#     %data statistics hash produced by comp_entropy()
# values are emitted in @data_keys order; a value that is itself a comma
# is replaced by the '_comma_' placeholder to keep the CSV parseable
sub print_result ($$$$%) {
  my ($file, $mess, $mess_pc, $pass, %data) = @_;
  my @fields = ($file, $mess, $mess_pc, $pass);
  foreach my $key (@data_keys) {
    my $val = $data{$key};
    $val = '_comma_' if ($val eq ',');
    push (@fields, $val);
  }
  print join (', ', @fields), "\n";
}

# mess setting - messing likelihood in percents
# (each character/word is replaced with this percent chance per pass)
@mess_pcs = ( 10, 5, 1, 0.1, 0.01, 0.001 );
# mess setting - number of passes for each mess
# (each likelihood is repeated this many times to average out randomness)
$PASS_MAX = 10;

# process one file and print its statistics to stdout
# in: $file name
# side effects: prints CSV rows to stdout and progress messages to stderr;
# dies when the file cannot be opened
sub process_file ($) {
  my ($file) = @_;

  print STDERR "Processing file $file\n";

  # three-arg open with a lexical handle; the original two-arg bareword
  # open (IN, $file) was unsafe when $file starts with '<', '>' or '|'
  open (my $in, '<', $file) or die "Unable to open file $file : $!\n";
  my @orig_input = <$in>;
  close ($in);
  # strip trailing newlines from every line at once
  chomp (@orig_input);

  print STDERR "Computing entropy of original input\n";

  # original data computation

  my $orig_data = comp_entropy (@orig_input);
  print_result ($file, '-', '-', '-', %$orig_data);

  # computing used characters and words

  my %chars_count = ();
  foreach my $word (@orig_input) {
    $chars_count{$_}++ foreach split (//, $word);
  }

  my %words_count = ();
  foreach my $word (@orig_input) {
    $words_count{$word}++;
  }

  print STDERR "Computing entropy with messed characters\n";

  # computing with messed data
  # - each mess is done $PASS_MAX times
  # - all words from input are split to characters and there is a
  #   chance for each character to be randomly mapped to another one
  #   from the set of input text characters

  my @used_chars = sort keys %chars_count;
  foreach my $mess_pc (@mess_pcs) {
    print STDERR ".... mess $mess_pc%\n";
    for (my $pass = 1; $pass <= $PASS_MAX; $pass++) {
      print STDERR ".... pass $pass\n";
      my @messed_input = ();
      foreach my $word (@orig_input) {
        my @chars = split (//, $word);
        # $c aliases the array element, so assigning to it
        # replaces the character in @chars in place
        foreach my $c (@chars) {
          if (rand (100) < $mess_pc) {
            $c = $used_chars[rand(@used_chars)];
          }
        }
        my $nword = join ('', @chars);
        push (@messed_input, $nword);
      }
      my $messed_data = comp_entropy (@messed_input);
      print_result ($file, 'char', $mess_pc, $pass, %$messed_data);
    }
  }

  print STDERR "Computing entropy with messed words\n";

  # computing with messed data
  # - each mess is done $PASS_MAX times
  # - all words from input have chance to be randomly mapped to another one
  #   from the set of input text words

  my @used_words = sort keys %words_count;
  foreach my $mess_pc (@mess_pcs) {
    print STDERR ".... mess $mess_pc%\n";
    for (my $pass = 1; $pass <= $PASS_MAX; $pass++) {
      print STDERR ".... pass $pass\n";
      my @messed_input = ();
      foreach my $word (@orig_input) {
        my $nword = $word;
        if (rand (100) < $mess_pc) {
          $nword = $used_words[rand(@used_words)];
        }
        push (@messed_input, $nword);
      }
      my $messed_data = comp_entropy (@messed_input);
      print_result ($file, 'word', $mess_pc, $pass, %$messed_data);
    }
  }

  print STDERR "End processing file $file\n";

}

# seed the random number generator; redundant on Perl >= 5.004
# (where srand is called automatically on first rand), kept for clarity
srand();

# process every file named on the command line, in order
foreach $file (@ARGV) {
  process_file ($file);
}

1;
