#!/usr/sbin/perl
if (!@ARGV) {
print STDERR "usage: $0 alignment_file
[threshold%]...\n";
print_sets();
exit 0;
}
my $FILE = shift
@ARGV;
my @THRESHOLD;
if (@ARGV) {
@THRESHOLD = @ARGV;
} else {
@THRESHOLD = (90, 80, 70, 60,
50);
}
my @ID;
my @ALIGNMENT;
read_alignment($FILE);
$n=@ID;
for ($i=0; $i<$n; $i++){
printf "%-15s %s\n", $ID[$i], join("", @{$ALIGNMENT{$ID[$i]}});
}
sub read_alignment {
my ($file) = @_;
my ($id, $line, %alignment);
local (*TMP);
open(TMP, "< $file") or die
"can't open file '$file'\n";
while ($line = <TMP>) {
next
if $line =~ /^CLUSTAL/;
if ($line =~ /^([^ ]+)\s+([-a-zA-Z*.]+) *$/) {
if (! $alignment{$1}) {
# new sequence identifier
push (@ID, $1);
}
#strip spaces,tabs,newlines: extend alignment array
$line = $2;
$line =~ tr/ \t\n//d;
push (@{$ALIGNMENT{$1}}, split("", $line));
}
}
close TMP;
}
If the program was working properly the subroutine read_alignment should be parsing the input file line by line and holding in the array @ID the identifier of the sequences only if the identifier has not been seen before (sequence identifiers are in the first column of the input file). However, if I print the elements of @ID, this will contain the same sequence identifier repeated as many times as it appeared in the input file. The statement "if (! $alignment{$1})" should have been taking care of this but, at least in my machine, it is not. The attached file contain an input file, so you can check what I am saying.Any help will be welcomed,
Pedro Reche
-- *************************************************************************** PEDRO a. RECHE gallardo, pHD TL: 617 632 3824 Scientist, Mol.Immnunol.Foundation, FX: 617 632 3351 Dana-Farber Cancer Institute, EM: [EMAIL PROTECTED] Harvard Medical School, URL: http://www.reche.org 44 Binney Street, D610C, Boston, MA 02115 ***************************************************************************
CLUSTAL W(1.60) multiple sequence alignment
YPK1 SQLSWKRLLMKGYIPPYKPAVS-----NSMDTSNFDEEFTR---EKPIDSVVDEYLSESV YPK2 KDISWKKLLLKGYIPPYKPIVK-----SEIDTANFDQEFTK---EKPIDSVVDEYLSASI KPCA_HUMAN RRIDWEKLENREIQPPFKPKVC------GKGAENFDKFFTR---GQPVLTPPDQLVIANI KPCZ_HUMAN RSIDWDLLEKKQALPPFQPQIT-----DDYGLDNFDTQFTS---EPVQLTPDDEDAIKRI KAPA KEVVWEKLLSRNIETPYEPPIQ----QGQGDTSQFDKYPE----EDINYGVQGEDPYADL KAPC NEVIWEKLLARYIETPYEPPIQ----QGQGDTSQFDRYPE----EEFNYGIQGEDPYMDL KAPB SEVVWERLLAKDIETPYEPPIT----SGIGDTSLFDQYPE----EQLDYGIQGDDPYAEY KS6_HUMAN RHINWEELLARKVEPPFKPLLQ-----SEEDVSQFDSKFTR---QTPVDSP-DDSTLSES KPC1 RNINFDDILNLRVKPPYIPEIK-----SPEDTSYFEQEFTS---APPTLTPLPSVLTTSQ KRAC_BOVIN ASIVWQDVYEKKLSPPFKPQVT-----SETDTRYFDEEFTA---QMITITPPDQDDSMEG SCH9 ADIDWEALKQKKIPPPFKPHLV-----SETDTSNFDPEFTT---ASTSYMNKHQPMMTAT KGP1_DROME LGFDWDGLASQLLIPPFVRPIA-----HPTDVRYFDRFPC------DLNEPPDELSGWDA ARK2_RAT KGIDWQYVYLRKYPPPLIPPRGEVNAADAFDIGSFDEEDTKG--IKLLDCDQDLYKNFPL DBFB AEINFETLRTS--SPPFIPQLD-----DETDAGYFDDFTNEEDMAKYADVFKRQNKLSAM DBF2 ADINFSTLRSM--IPPFTPQLD-----SETDAGYFDDFTSEADMAKYADVFKRQDKLTAM * *. YPK1 ------QKQF YPK2 ------QKQF KPCA_HUMAN D-----QSDF KPCZ_HUMAN D-----QSEF KAPA ------FRDF KAPC ------MKEF KAPB ------FQDF KS6_HUMAN A-----NQVF KPC1 ------QEEF KRAC_BOVIN VDS-ERRPHF SCH9 PLSPAMQAKF KGP1_DROME --------DF ARK2_RAT MISERWQQEV DBFB VDDSAVDSKL DBF2 VDDSAVSSKL