Here's the attachment for real this time.

#!/usr/bin/perl -w

# freqdiff - print frequency difference between two inputs
#
# Copyright (C) 2002  Daniel Quinlan
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of either the Artistic License or the GNU General
# Public License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.

use vars qw($opt_a $opt_d $opt_h $opt_p $opt_r);
use Getopt::Std;
getopts("adhpr");

# Program name for messages: $0 with any leading directory path removed.
(my $prog = $0) =~ s{.*/}{};

# usage($status) - print the help text and terminate the program
#
# A true $status sends the text to STDERR, a false one to STDOUT.
# $status is also used as the process exit code; this sub never returns.
sub usage {
    my ($status) = @_;

    my $out = \*STDOUT;
    $out = \*STDERR if $status;

    print {$out} <<EOF;
usage: $prog [options] [first input] [second input]
       $prog [options] [first input] [second input] [scores file]

 -a    show all elements (only used for -d)
 -d    print delta of frequencies (second - first)
 -h    print this help
 -p    print percentage of frequencies ((first / (first + second)) * 100.0)
 -r    use relative frequencies (account for size of inputs, useful for -p)

default is -d if input line counts are within 1% of each other and -p otherwise
EOF
    exit($status);
}

# Main program: read both inputs, combine their frequency tables and print
# one line per element, highest value first.

usage(0) if $opt_h;
# Two input files are mandatory; a third argument (scores file) is optional.
usage(1) unless @ARGV >= 2;

# read_argv() fills these in as side effects: @line receives the number of
# lines read from each input and $type the detected input format (1, 2 or 3).
my @line;
my $type;
my %one = read_argv(0);
my %two = read_argv(1);

# Without an explicit -p, default to delta mode when the two inputs have
# line counts within 1% of each other.  An empty second input cannot be
# used as a divisor, so it only counts as "close" when the first input is
# empty as well.
if (!$opt_p) {
    my $counts_are_close = $line[1]
        ? (abs($line[0] - $line[1]) / $line[1] < 0.01)
        : ($line[0] == 0);
    $opt_d = 1 if $counts_are_close;
}

# Format 3 inputs always show every element.  $type is still undefined
# when both inputs were empty.
$opt_a = 1 if (defined($type) && $type == 3);

# Optional scores file: lines of the form "score NAME VALUE"; everything
# from a "#" to the end of the line is ignored.
my %score;
if (@ARGV > 2) {
    # Three-argument open so the file name is never interpreted as a mode
    # or a command.
    open(my $scores, '<', $ARGV[2])
        or die "$prog: open failed: $ARGV[2]: $!\n";
    while (my $entry = <$scores>) {
        chomp($entry);
        $entry =~ s/#.*//;
        my @field = split(' ', $entry);
        if (@field >= 3 && $field[0] eq "score") {
            $score{$field[1]} = $field[2];
        }
    }
    close($scores);
}

# Every element that occurs in either input, each listed once.
my %seen;
my @all = grep { !$seen{$_}++ } (keys %one, keys %two);

# Compute the value to report for each element; an element missing from an
# input counts as 0 there.
my %out;
foreach my $elem (@all) {
    my $first  = exists($one{$elem}) ? $one{$elem} : 0;
    my $second = exists($two{$elem}) ? $two{$elem} : 0;

    if ($opt_d) {
        $out{$elem} = $second - $first;
    }
    else {
        # The percentage is undefined when both counts are zero (possible
        # with explicit "0 name" lines); report 0 rather than dividing by
        # zero.
        my $total = $first + $second;
        $out{$elem} = $total ? $first / $total * 100.0 : 0;
    }
}

# Largest value first; ties are broken by element name.
foreach my $elem (sort { $out{$b} <=> $out{$a} || $a cmp $b } keys %out) {
    my $name = $elem;
    if (%score) {
        # When a scores file was loaded, prefix each name with its score;
        # elements without an entry are shown with "1.0".
        my $weight = exists($score{$elem}) ? $score{$elem} : "1.0";
        $name = "$weight\t$name";
    }
    if ($opt_d) {
        # Zero deltas are hidden unless -a is in effect.
        print "$out{$elem}\t$name\n" if ($out{$elem} || $opt_a);
    }
    else {
        printf "%.2f\t%s\n", $out{$elem}, $name;
    }
}

# read_argv($input) - load one input file into a frequency table
#
# $input is an index into @ARGV.  A file name of "-" reads standard input.
# Each line is matched against three formats, tried in this order:
#
#   1. "sort | uniq -c" output: a count followed by the element text
#   2. "mass-check" output: "Y" or ".", an integer, one more field, then a
#      comma-separated list of elements; each listed element is counted once
#   3. anything else: the whole line is the element and its value comes
#      from its position (the first line gets the highest value, the last
#      line gets 1; a repeated line keeps its last position)
#
# All lines must use the same format or the program dies.  $type is not
# reset between calls, so a format that differs from the previously read
# input is rejected as well.
#
# With -r ($opt_r) every value is divided by the number of lines read.
#
# Side effects: stores the number of lines read in $line[$input] and leaves
# the detected format in $type.
#
# Returns the table as a flat key/value list.
sub read_argv {
    my ($input) = @_;
    my $path = $ARGV[$input];
    my %freq;
    my $last = 0;

    # Three-argument open with a lexical handle: the file name is taken
    # literally (no mode characters, pipes or whitespace stripping).  The
    # "-" convention for standard input is kept explicitly.
    my $in;
    if ($path eq '-') {
        open($in, '<&', \*STDIN)
            or die "$prog: open failed: $path: $!\n";
    }
    else {
        open($in, '<', $path)
            or die "$prog: open failed: $path: $!\n";
    }

    my $count = 0;
    while (my $record = <$in>) {
        $count++;
        $last = $type;
        # "sort | uniq -c" format
        if ($record =~ /^\s*(\d+)\s+(.*)/) {
            $type = 1;
            $freq{$2} = $1;
        }
        # "mass-check" format
        elsif ($record =~ /^[Y.]\s+-?\d+\s+\S+\s+(.*)/) {
            $type = 2;
            foreach my $elem (split(/,/, $1)) {
                $freq{$elem}++;
            }
        }
        # line number is frequency
        else {
            $type = 3;
            chomp($record);
            $freq{$record} = $count;
        }
        if ($last && $last != $type) {
            die "$prog: inconsistent format in $path\n";
        }
    }
    close($in);

    foreach my $key (keys %freq) {
        if ($type == 3) {
            # Turn the stored line number into a rank: first line highest.
            $freq{$key} = $count - $freq{$key} + 1;
        }
        if ($opt_r) {
            $freq{$key} = $freq{$key} / $count;
        }
    }
    $line[$input] = $count;
    return %freq;
}

Reply via email to