#!/usr/local/bin/perl

###################################################################
# This code is to compute information content                     #
# Usage: pvalue.pl motifgroup_file fas_sequence_file              #
#                                                                 #
#                                                                 #
###################################################################
#$file = $ARGV[0];
#@mi   = get_file_data("$file");

#compute_ic(@mi);

sub compute_ic {
my @mi = @_;

$motif_count =0;

#-------Beginning of PWM processing ----------------
foreach $mi (@mi)
  {
    chomp($mi);
    $mi =~ s/\s//g;
    $L = $mi;

    #for motif instances count
    @words = split( /\W+/, $mi );
    $motif_count += @words;
  }

$w = length($L);
@A = ();
@T = ();
@C = ();
@G = ();

for ( my $j = 0 ; $j < $w ; $j++ )
  {

    # Initialize the base counts.
    $a = 0;
    $c = 0;
    $g = 0;
    $t = 0;

    foreach $mi (@mi)
      {
        chomp($mi);
        $L = $mi;
        $sb = substr( $L, $j, 1 );
        while ( $sb =~ /a/ig ) { $a++ }
        while ( $sb =~ /t/ig ) { $t++ }
        while ( $sb =~ /c/ig ) { $c++ }
        while ( $sb =~ /g/ig ) { $g++ }

      }
    push( @A, $a );
    push( @T, $t );
    push( @C, $c );
    push( @G, $g );
  }

$sumA = sumArray(@A);
$sumT = sumArray(@T);
$sumC = sumArray(@C);
$sumG = sumArray(@G);

#print "\nNumber of Motif Instances: $motif_count\n";
#print "Motif Length             : $w\n";
#print "Frequency of A=$sumA T=$sumT  C=$sumC G=$sumG\n\n";
#print "-------------- PWM (Actual Count)  ---------------------\n";
#print "A = @A\n";
#print "T = @T\n";
#print "C = @C\n";
#print "G = @G\n";
#print "--------------------------------------------------------\n";

#----- Begin PWM normalization step 1 (divide elements of each column with the Sum of each column position -------------

@m = ();
my @Anm1 =();
my @Tnm1 =();
my @Cnm1 =();
my @Gnm1 =();
my @sPos =();

#summing up A,T,C,G for all position
for ( my $b = 0 ; $b < $w ; $b++ )
  {
    $sumPos = $A[$b] + $T[$b] + $C[$b] + $G[$b];
    push( @sPos, $sumPos );

    $nm1A = $A[$b] / $sPos[$b];
    $nm1T = $T[$b] / $sPos[$b];
    $nm1C = $C[$b] / $sPos[$b];
    $nm1G = $G[$b] / $sPos[$b];

    push( @Anm1, $nm1A );
    push( @Tnm1, $nm1T );
    push( @Cnm1, $nm1C );
    push( @Gnm1, $nm1G );

    #$sumNm1 = $Anm1[$b]+$Tnm1[$b]+$Cnm1[$b]+$Gnm1[$b];
    #push(@Nm1, $sumNm1);

    #---Finding maximum value from each position ------

    if ( $A[$b] > $T[$b] && $A[$b] > $C[$b] && $A[$b] > $G[$b] )
      {
        push( @m, $Anm1[$b] );
      }

    elsif ( $T[$b] > $C[$b] && $T[$b] > $A[$b] && $T[$b] > $G[$b] )
      {
        push( @m, $Tnm1[$b] );
      }

    elsif ( $C[$b] > $G[$b] && $C[$b] > $A[$b] && $C[$b] > $T[$b] )
      {
        push( @m, $Cnm1[$b] );
      }
    else { push( @m, $Gnm1[$b] ); }

  }

#$countMax = countArray(@m);
$M = sumArray(@m);

#print join("\n", @m), "\n";
my @Anm2 =();
my @Tnm2 =();
my @Cnm2 =();
my @Gnm2 =();

#   print "\nFirst Normalization\n";
#   print "A = "; for my $i (@Anm1) {printf "%2.2f ", $i;} print "\n";
#   print "T = "; for my $i (@Tnm1) {printf "%2.2f ", $i;} print "\n";
#   print "C = "; for my $i (@Cnm1) {printf "%2.2f ", $i;} print "\n";
#   print "G = "; for my $i (@Gnm1) {printf "%2.2f ", $i;} print "\n";

for ( my $fpwm = 0 ; $fpwm < $w ; $fpwm++ )
  {

    $nm2A = $Anm1[$fpwm] / $M;
    $nm2T = $Tnm1[$fpwm] / $M;
    $nm2C = $Cnm1[$fpwm] / $M;
    $nm2G = $Gnm1[$fpwm] / $M;

    push( @Anm2, $nm2A );
    push( @Tnm2, $nm2T );
    push( @Cnm2, $nm2C );
    push( @Gnm2, $nm2G );

  }

#print join("\n", @Anm2), "\n";

#print
#"\n\n-------------- PWM (Normalized) - M value: $M #-----------------------------------------\n";
#print "A = ";
#for my $i (@Anm2) { printf "%2.2f ", $i; }
#print "\n";
#print "T = ";
#for my $i (@Tnm2) { printf "%2.2f ", $i; }
#print "\n";
#print "C = ";
#for my $i (@Cnm2) { printf "%2.2f ", $i; }
#print "\n";
#print "G = ";
#for my $i (@Gnm2) { printf "%2.2f ", $i; }
#print "\n";

#printf
#"------------------------------------------------------------------------------------------\n";

#-----  End of PWM generation -------------------------------------------------------------------------

#----------------- Begin computing Information content for motif instances ---------------------------
#$plogp = ($Anm1[0] * log_base(2,4*$Anm1[0]))+
#($Cnm1[0] * log_base(2,4*$Cnm1[0]))+($Gnm1[0] * log_base(2,4*$Gnm1[0]));
#print "$plogp\n";

my @pLogpA = ();
my @pLogpT = ();
my @pLogpC = ();
my @pLogpG = ();

for ( my $i = 0 ; $i < $w ; $i++ )
  {
    if ( $Anm1[$i] > 0 )
      {
        $plogpA = $Anm1[$i] * log_base( 2, 4 * $Anm1[$i] );
        push( @pLogpA, $plogpA );
      }
    if ( $Tnm1[$i] > 0 )
      {
        $plogpT = $Tnm1[$i] * log_base( 2, 4 * $Tnm1[$i] );
        push( @pLogpT, $plogpT );
      }
    if ( $Cnm1[$i] > 0 )
      {
        $plogpC = $Cnm1[$i] * log_base( 2, 4 * $Cnm1[$i] );
        push( @pLogpC, $plogpC );
      }
    if ( $Gnm1[$i] > 0 )
      {
        $plogpG = $Gnm1[$i] * log_base( 2, 4 * $Gnm1[$i] );
        push( @pLogpG, $plogpG );
      }
  }

#print join("\n", @pLogpA), "\n";
#print join("\n", @pLogpT), "\n";
#print join("\n", @pLogpC), "\n";
#print join("\n", @pLogpG), "\n";

$sumACTGpilogpi =
  ( sumArray(@pLogpA) + sumArray(@pLogpT) + sumArray(@pLogpC) +
      sumArray(@pLogpG) );
     return $sumACTGpilogpi;
#printf "Total Information Content = %.6f\n", $sumACTGpilogpi;

 }

#---------------Subroutines----------------------------
sub zup
  {
    join "\n" => map {
        join " " => map { shift @$_ }
          @_
    } @{ $_[0] };
  }

sub log_base
  {
    my ( $base, $value ) = @_;
    return log($value) / log($base);
  }

sub sumArray
{
  my @arr = @_;
  my $sum = 0;
  my $count = $#arr + 1;
  for(my $i=0;$i<$count;$i++){
     $sum += $arr[$i];
  }
  return $sum;
}

sub sum
  {
    my @params = @_;
    my $Key;
    my $Total = 0;

    foreach $Key (@params)
      {
        $Total += $Key;
      }
    return $Total;
  }

sub countArray
  {
    my @params = @_;
    my $Key;
    my $count;
    foreach $Key (@params) { $count++ }
    return $count;
  }

sub get_file_data
  {

    my ($filename) = @_;

    use strict;
    use warnings;

    # Initialize variables
    my @filedata = ();

    unless ( open( GET_FILE_DATA, $filename ) )
      {
        print STDERR "Cannot open file \"$filename\"\n\n";
        exit;
      }

    @filedata = <GET_FILE_DATA>;

    close GET_FILE_DATA;

    return @filedata;
  }

sub extract_sequence_from_fasta_data
  {

    my (@fasta_file_data) = @_;

    use strict;
    use warnings;

    # Declare and initialize variables
    my $sequence = '';

    foreach my $line (@fasta_file_data)
      {

        # discard blank line
        if ( $line =~ /^\s*$/ )
          {
            next;

            # discard comment line
          }
        elsif ( $line =~ /^\s*#/ )
          {
            next;

            # discard fasta header line
          }
        elsif ( $line =~ /^>/ )
          {
            next;

            # keep line, add to sequence string
          }
        else
          {
            $sequence .= $line;
          }
      }

    # remove non-sequence data (in this case, whitespace) from $sequence string
    $sequence =~ s/\s//g;

    return $sequence;
  }

sub enum_lmers
  {
    my ( $sequence, $lmer_size ) = @_;
    use strict;
    use warnings;

    my @lmer_array = ();
    my $lmer;
    my $enum_size;

    $enum_size = length($sequence) - $lmer_size + 1;

    #print "$enum_size\n";
    for ( my $pos = 0 ; $pos < $enum_size ; $pos++ )
      {
        $lmer = substr( $sequence, $pos, $lmer_size );

        #print "$lmer\n";
        push( @lmer_array, $lmer );

      }
    return @lmer_array;
  }


