#!/usr/bin/perl
# Paul Smith paul@paladinscientific.com paul@smithops.net
# Program to calculate charge vs. pH for a protein input as a FASTA format sequence
# Usage : ./titrate.pl FASTA.file
# output is to STDOUT as unsorted pairs of values
# pKa values are program constants are input below.
# the following table is from http://isoelectric.ovh.org/files/isoelectric-point-theory.html
# the user can adust pKa values as needed


#Amino acid  	NH2  	COOH  	C  	D  	E  	H  	K  	R  	Y
#EMBOSS 	8.6 	3.6 	8.5 	3.9 	4.1 	6.5 	10.8 	12.5 	10.1
#DTASelect 	8.0 	3.1 	8.5 	4.4 	4.4 	6.5 	10.0 	12.0 	10.0
#Solomon 	9.6 	2.4 	8.3 	3.9 	4.3 	6.0 	10.5 	12.5 	10.1
#Sillero 	8.2 	3.2 	9.0 	4.0 	4.5 	6.4 	10.4 	12.0 	10.0
#Rodwell 	8.0 	3.1 	8.33 	3.68 	4.25 	6.0 	11.5 	11.5 	10.07
#Patrickios 	11.2 	4.2 	- 	4.2 	4.2 	- 	11.2 	11.2 	-
#Wikipedia 	8.2 	3.65 	8.18 	3.9 	4.07 	6.04 	10.54 	12.48 	10.46


use strict;
use warnings;
use constant MIN_PH => 1;
use constant MAX_PH => 12;
use constant GRID => 0.05;
use constant D_PKA => 4.1;
use constant E_PKA => 4.1;
use constant K_PKA => 10.5;
use constant R_PKA => 12.0;
use constant C_PKA => 8.5;
use constant Y_PKA => 10.1;
use constant AMINO_PKA => 8.5;
use constant CARBOXY_PKA => 3.8;

sub parce_fasta {
    #reads in a named file, ignores lines beginning with ">"
    #and counts the number of each type of amino acid
    my @args = @_;
    my $filename = $args[0];
    my %data_hash = (
		     A => 0,
		     C => 0,
		     D => 0,
		     E => 0,
		     F => 0,
		     G => 0,
		     H => 0,
		     I => 0,
		     K => 0,
		     L => 0,
		     M => 0,
		     N => 0,
		     P => 0,
		     Q => 0,
		     R => 0,
		     S => 0,
		     T => 0,
		     V => 0,
		     W => 0,
		     Y => 0,
		     UNK => 0
		     );
    open (DATA, $filename );
    while (<DATA>){
	my $line = $_;
	#kill all whitespace
	$line =~ s/\s+//g;
	#ignore ">"
	if ($line !~ /^>/){
	    #convert to uppercase
	    $line = uc($line);
	    my @list = split //, $line;
	    for (my $i = 0 ; $i < scalar (@list); $i++){
		my $key = $list[$i];
		if (exists $data_hash{$key}){
		    $data_hash{$key}++;
		} else {
		    $data_hash{UNK}++;
		}
	    }
	}
    
    }
    close (DATA);
    return \%data_hash;
}

sub dump_hash {
     my @args = @_;
     my %hash =%{$args[0]};
     while (my ($key,$value) = each(%hash)){
	print $key."  ".$value."\n";
    }
 }
sub titrate {
    my @args = @_;
    my $hash_ref = $args[0];
    my %aa_hash = %$hash_ref;
    my $start_ph = MIN_PH;
    my $final_ph = MAX_PH;
    my $ph_grid = GRID;
    my %output_hash = ( );
    #go through the grid, titrate each residue
    for (my $ph = $start_ph; $ph <= $final_ph; $ph = $ph + $ph_grid){
	my $charge_hash_ref = &henderson_hasselbach($ph);
	my $net_charge = &net_charge($charge_hash_ref,$hash_ref);
	$output_hash{$ph} = $net_charge;
    }
    &dump_hash(\%output_hash);
}

sub henderson_hasselbach {
    #returns partial charges on all titratable groups
   my @args = @_;
   my $ph = $args[0];
   my $d_power = 10 ** ($ph - D_PKA);
   my $d_charge = - ($d_power)/(1 + $d_power);
   my $e_power = 10 ** ($ph - E_PKA);
   my $e_charge = - ($e_power)/(1 + $e_power);
   my $k_power = 10 ** ($ph - K_PKA);
   #NB the acid species is charged for bases
   my $k_charge = 1 - ($k_power)/(1 + $k_power);
   my $r_power = 10 ** ($ph - R_PKA);
   my $r_charge = 1 - ($r_power)/(1 + $r_power);
   my $c_power = 10 ** ($ph - C_PKA);
   my $c_charge = - ($c_power)/(1 + $c_power);
   my $y_power = 10 ** ($ph - Y_PKA);
   my $y_charge = - ($y_power)/(1 + $y_power);
   my $nt_power = 10 ** ($ph - AMINO_PKA);
   my $nt_charge = 1 - ($nt_power)/(1 + $nt_power);
   my $ct_power = 10 ** ($ph - CARBOXY_PKA);
   my $ct_charge = - ($ct_power)/(1 + $ct_power);
   my %charge_hash = (
		      D => $d_charge,
		      E => $e_charge,
		      K => $k_charge,
		      R => $r_charge,
		      C => $c_charge,
		      Y => $y_charge,
		      NT => $nt_charge,
		      CT => $ct_charge,
		     );
   return \%charge_hash;
}
sub net_charge {
    #does basic arithmetic (bit cumbersome?)
    # takes partial charges, multiplies by number of aa's
    # then adds them up.
   my @args = @_;
   my $charge_ref = $args[0];
   my $aa_count_ref = $args[1];
   my %aa_charge = %$charge_ref;
   my %aa_count = %$aa_count_ref;
   my $charge_d = $aa_charge{D} * $aa_count{D};
   my $charge_e = $aa_charge{E} * $aa_count{E};
   my $charge_k = $aa_charge{K} * $aa_count{K};
   my $charge_r = $aa_charge{R} * $aa_count{R};
   my $charge_c = $aa_charge{C} * $aa_count{C};
   my $charge_y = $aa_charge{Y} * $aa_count{Y};
   my $charge_nt = $aa_charge{D} * 1;
   my $charge_ct = $aa_charge{D} * 1;
   my $net_charge = ($charge_d + $charge_e + 
		     $charge_k + $charge_r + 
		     $charge_c + $charge_y +
		     $charge_nt + $charge_ct );
   
   return $net_charge;
}
 
sub main {
    my @args = @ARGV;
    my $filename = $args[0];
    my $hash_ref = &parce_fasta($filename);
    my $partial_charges_ref = &titrate($hash_ref);
}

main();

exit;


