Here's the attachment for real this time.
#!/usr/bin/perl -w
# freqdiff - print frequency difference between two inputs
#
# Copyright (C) 2002 Daniel Quinlan
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of either the Artistic License or the GNU General
# Public License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.

use strict;

use Getopt::Std;

# getopts() stores each switch in the package variable $opt_<letter>,
# so these must be package variables rather than lexicals.
our ($opt_a, $opt_d, $opt_h, $opt_p, $opt_r);
getopts("adhpr");

# Program name without its directory part, used in messages.
my $prog = $0;
$prog =~ s@.*/@@;

# usage($status)
#
# Print the help text and exit with $status.  The text goes to STDOUT
# when $status is 0 (help was requested) and to STDERR otherwise.
sub usage {
    my $status = shift;

    my $out = $status ? \*STDERR : \*STDOUT;
    print $out <<EOF;
usage: $prog [options] [first input] [second input]
       $prog [options] [first input] [second input] [scores file]

  -a    show all elements (only used for -d)
  -d    print delta of frequencies (second - first)
  -h    print this help
  -p    print percentage of frequencies ((first / (first + second)) * 100.0)
  -r    use relative frequencies (account for size of inputs, useful for -p)

default is -d if input line counts are within 1% of each other and -p otherwise
EOF
    exit($status);
}

usage(0) if $opt_h;
usage(1) unless $#ARGV > 0;

# Shared with read_argv(): @line holds the line count of each input,
# $type the format code (1, 2 or 3) of the most recently read line.
my @line;
my $type;

my %one = read_argv(0);
my %two = read_argv(1);

# Choose the default mode when neither -d nor -p was given: delta if the
# line counts are within 1% of each other, percentage otherwise.  An
# empty second input would make the ratio divide by zero, so it only
# counts as "within 1%" when the first input is empty as well.
if (!$opt_d && !$opt_p) {
    my $diff = abs($line[0] - $line[1]);
    if ($diff == 0 || ($line[1] && $diff / $line[1] < 0.01)) {
        $opt_d = 1;
    }
}

# Line-number ("type 3") inputs always show every element.  $type stays
# undefined when both inputs are empty.
$opt_a = 1 if (defined($type) && $type == 3);

# Optional third argument: a scores file with "score NAME VALUE" lines.
my %score;
if ($#ARGV > 1) {
    open(my $fh, '<', $ARGV[2])
        or die "$prog: open failed: $ARGV[2]: $!\n";
    while (my $text = <$fh>) {
        chomp($text);
        $text =~ s/#.*//;               # strip comments
        my @field = split(' ', $text);
        if ($#field >= 2 && $field[0] eq "score") {
            $score{$field[1]} = $field[2];
        }
    }
    close($fh);
}

# Union of the elements seen in either input.
my @all = (keys %one);
foreach my $elem (keys %two) {
    if (!defined($one{$elem})) {
        push(@all, $elem);
    }
}

# Compute the delta or the percentage for every element.
my %out;
foreach my $elem (@all) {
    my $one = exists($one{$elem}) ? $one{$elem} : 0;
    my $two = exists($two{$elem}) ? $two{$elem} : 0;

    if ($opt_d) {
        $out{$elem} = $two - $one;
    }
    else {
        # Both counts can be zero (e.g. "0 name" in "uniq -c" format);
        # report 0 rather than dividing by zero.
        my $total = $one + $two;
        $out{$elem} = $total ? $one / $total * 100.0 : 0;
    }
}

# Print highest value first, ties broken by element name.
foreach my $elem (sort { $out{$b} <=> $out{$a} || $a cmp $b } keys %out) {
    my $name = $elem;

    # With a scores file, prefix each name with its score (default 1.0).
    if (%score) {
        if (exists($score{$elem})) {
            $name = "$score{$elem}\t$name";
        }
        else {
            $name = "1.0\t$name";
        }
    }

    if ($opt_d) {
        # Zero deltas are hidden unless -a is in effect.
        print "$out{$elem}\t$name\n" if ($out{$elem} || $opt_a);
    }
    else {
        printf "%.2f\t%s\n", $out{$elem}, $name;
    }
}

# read_argv($input)
#
# Read the file named by $ARGV[$input] and return a hash (as a flat
# key/value list) mapping each element to its frequency.  Each line is
# classified as one of three formats:
#
#   1  "sort | uniq -c" output: "<count> <element>"
#   2  "mass-check" output: the trailing comma-separated list holds the
#      elements, each occurrence counting once
#   3  anything else: the whole line is the element and its position
#      determines the frequency (first line = highest)
#
# Side effects: stores the number of lines read in $line[$input] and
# leaves the format code in the file-level $type.  Because $type is not
# reset between calls, a line whose format differs from the previous
# line read -- including the last line of an earlier input -- is fatal.
# With -r, frequencies are divided by the number of lines read.
#
# Dies if the file cannot be opened or the format is inconsistent.
sub read_argv {
    my ($input) = @_;

    my %freq;
    my $last = 0;
    my $count = 0;

    # Three-argument open so the file name is never parsed as a mode.
    open(my $fh, '<', $ARGV[$input])
        or die "$prog: open failed: $ARGV[$input]: $!\n";
    while (my $text = <$fh>) {
        $count++;
        $last = $type;

        # "sort | uniq -c" format
        if ($text =~ /^\s*(\d+)\s+(.*)/) {
            $type = 1;
            $freq{$2} = $1;
        }
        # "mass-check" format
        elsif ($text =~ /^[Y.]\s+-?\d+\s+\S+\s+(.*)/) {
            $type = 2;
            foreach my $elem (split(/,/, $1)) {
                $freq{$elem}++;
            }
        }
        # line number is frequency
        else {
            $type = 3;
            chomp($text);
            $freq{$text} = $count;
        }

        if ($last && $last != $type) {
            die "$prog: inconsistent format in $ARGV[$input]\n";
        }
    }
    close($fh);

    foreach my $key (keys %freq) {
        # Turn a line number into a rank-based frequency: the first
        # line gets the highest value, the last line gets 1.
        if ($type == 3) {
            $freq{$key} = $count - $freq{$key} + 1;
        }
        if ($opt_r) {
            $freq{$key} = $freq{$key} / $count;
        }
    }

    $line[$input] = $count;
    return %freq;
}