Hi, I have a question regarding the following script.
#!/usr/sbin/perl
if (!@ARGV) {
print STDERR "usage: $0 alignment_file [threshold%]...\n";
print_sets();
exit 0;
}
my $FILE = shift @ARGV;
my @THRESHOLD;
if (@ARGV) {
@THRESHOLD = @ARGV;
} else {
@THRESHOLD = (90, 80, 70, 60, 50);
}
my @ID;
my @ALIGNMENT;
read_alignment($FILE);
$n=@ID;
for ($i=0; $i<$n; $i++){
printf "%-15s %s\n", $ID[$i], join("", @{$ALIGNMENT{$ID[$i]}});
}
for ($i=0; $i<$n; $i++){
print "$ID[$i]\n";
}
sub read_alignment {
my ($file) = @_;
my ($id, $line, %alignment);
local (*TMP);
open(TMP, "< $file") or die "can't open file '$file'\n";
while ($line = <TMP>) {
next if $line =~ /^CLUSTAL/;
if ($line =~ /^([^ ]+)\s+([-a-zA-Z*.]+) *$/) {
if (! $alignment{$1}) {
# new sequence identifier
push (@ID, $1);
}
#strip spaces,tabs,newlines: extend alignment array
$line = $2;
$line =~ tr/ \t\n//d;
push (@{$ALIGNMENT{$1}}, split("", $line));
}
}
close TMP;
}
Heres is the question.
This script should take an alignment with sequences spread in two or
more blocks, and print them in one single block. Se below
ClustalX
YPK1
SQLSWKRLLMKGYIPPYKPAVS-----NSMDTSNFDEEFTR---EKPIDSVVDEYLSESV
YPK2
KDISWKKLLLKGYIPPYKPIVK-----SEIDTANFDQEFTK---EKPIDSVVDEYLSASI
KPCA_HUMAN
RRIDWEKLENREIQPPFKPKVC------GKGAENFDKFFTR---GQPVLTPPDQLVIANI
KPCZ_HUMAN
RSIDWDLLEKKQALPPFQPQIT-----DDYGLDNFDTQFTS---EPVQLTPDDEDAIKRI
KAPA
KEVVWEKLLSRNIETPYEPPIQ----QGQGDTSQFDKYPE----EDINYGVQGEDPYADL
KAPC
NEVIWEKLLARYIETPYEPPIQ----QGQGDTSQFDRYPE----EEFNYGIQGEDPYMDL
KAPB
SEVVWERLLAKDIETPYEPPIT----SGIGDTSLFDQYPE----EQLDYGIQGDDPYAEY
KS6_HUMAN
RHINWEELLARKVEPPFKPLLQ-----SEEDVSQFDSKFTR---QTPVDSP-DDSTLSES
KPC1
RNINFDDILNLRVKPPYIPEIK-----SPEDTSYFEQEFTS---APPTLTPLPSVLTTSQ
KRAC_BOVIN
ASIVWQDVYEKKLSPPFKPQVT-----SETDTRYFDEEFTA---QMITITPPDQDDSMEG
SCH9
ADIDWEALKQKKIPPPFKPHLV-----SETDTSNFDPEFTT---ASTSYMNKHQPMMTAT
KGP1_DROME
LGFDWDGLASQLLIPPFVRPIA-----HPTDVRYFDRFPC------DLNEPPDELSGWDA
ARK2_RAT
KGIDWQYVYLRKYPPPLIPPRGEVNAADAFDIGSFDEEDTKG--IKLLDCDQDLYKNFPL
DBFB
AEINFETLRTS--SPPFIPQLD-----DETDAGYFDDFTNEEDMAKYADVFKRQNKLSAM
DBF2
ADINFSTLRSM--IPPFTPQLD-----SETDAGYFDDFTSEADMAKYADVFKRQDKLTAM
* *.
YPK1 ------QKQF
YPK2 ------QKQF
KPCA_HUMAN D-----QSDF
KPCZ_HUMAN D-----QSEF
KAPA ------FRDF
KAPC ------MKEF
KAPB ------FQDF
KS6_HUMAN A-----NQVF
KPC1 ------QEEF
KRAC_BOVIN VDS-ERRPHF
SCH9 PLSPAMQAKF
KGP1_DROME --------DF
ARK2_RAT MISERWQQEV
DBFB VDDSAVDSKL
DBF2 VDDSAVSSKL
This should be the output.
YPK1
SQLSWKRLLMKGYIPPYKPAVS-----NSMDTSNFDEEFTR---EKPIDSVVDEYLSESV------QKQF
YPK2
KDISWKKLLLKGYIPPYKPIVK-----SEIDTANFDQEFTK---EKPIDSVVDEYLSASI------QKQF
KPCA_HUMAN
RRIDWEKLENREIQPPFKPKVC------GKGAENFDKFFTR---GQPVLTPPDQLVIANID-----QSDF
KPCZ_HUMAN
RSIDWDLLEKKQALPPFQPQIT-----DDYGLDNFDTQFTS---EPVQLTPDDEDAIKRID-----QSEF
KAPA
KEVVWEKLLSRNIETPYEPPIQ----QGQGDTSQFDKYPE----EDINYGVQGEDPYADL------FRDF
KAPC
NEVIWEKLLARYIETPYEPPIQ----QGQGDTSQFDRYPE----EEFNYGIQGEDPYMDL------MKEF
KAPB
SEVVWERLLAKDIETPYEPPIT----SGIGDTSLFDQYPE----EQLDYGIQGDDPYAEY------FQDF
KS6_HUMAN
RHINWEELLARKVEPPFKPLLQ-----SEEDVSQFDSKFTR---QTPVDSP-DDSTLSESA-----NQVF
KPC1
RNINFDDILNLRVKPPYIPEIK-----SPEDTSYFEQEFTS---APPTLTPLPSVLTTSQ------QEEF
KRAC_BOVIN
ASIVWQDVYEKKLSPPFKPQVT-----SETDTRYFDEEFTA---QMITITPPDQDDSMEGVDS-ERRPHF
SCH9
ADIDWEALKQKKIPPPFKPHLV-----SETDTSNFDPEFTT---ASTSYMNKHQPMMTATPLSPAMQAKF
KGP1_DROME
LGFDWDGLASQLLIPPFVRPIA-----HPTDVRYFDRFPC------DLNEPPDELSGWDA--------DF
ARK2_RAT
KGIDWQYVYLRKYPPPLIPPRGEVNAADAFDIGSFDEEDTKG--IKLLDCDQDLYKNFPLMISERWQQEV
DBFB
AEINFETLRTS--SPPFIPQLD-----DETDAGYFDDFTNEEDMAKYADVFKRQNKLSAMVDDSAVDSKL
DBF2
ADINFSTLRSM--IPPFTPQLD-----SETDAGYFDDFTSEADMAKYADVFKRQDKLTAMVDDSAVSSKL
However, the output is the following:
YPK1
SQLSWKRLLMKGYIPPYKPAVS-----NSMDTSNFDEEFTR---EKPIDSVVDEYLSESV------QKQF
YPK2
KDISWKKLLLKGYIPPYKPIVK-----SEIDTANFDQEFTK---EKPIDSVVDEYLSASI------QKQF
KPCA_HUMAN
RRIDWEKLENREIQPPFKPKVC------GKGAENFDKFFTR---GQPVLTPPDQLVIANID-----QSDF
KPCZ_HUMAN
RSIDWDLLEKKQALPPFQPQIT-----DDYGLDNFDTQFTS---EPVQLTPDDEDAIKRID-----QSEF
KAPA
KEVVWEKLLSRNIETPYEPPIQ----QGQGDTSQFDKYPE----EDINYGVQGEDPYADL------FRDF
KAPC
NEVIWEKLLARYIETPYEPPIQ----QGQGDTSQFDRYPE----EEFNYGIQGEDPYMDL------MKEF
KAPB
SEVVWERLLAKDIETPYEPPIT----SGIGDTSLFDQYPE----EQLDYGIQGDDPYAEY------FQDF
KS6_HUMAN
RHINWEELLARKVEPPFKPLLQ-----SEEDVSQFDSKFTR---QTPVDSP-DDSTLSESA-----NQVF
KPC1
RNINFDDILNLRVKPPYIPEIK-----SPEDTSYFEQEFTS---APPTLTPLPSVLTTSQ------QEEF
KRAC_BOVIN
ASIVWQDVYEKKLSPPFKPQVT-----SETDTRYFDEEFTA---QMITITPPDQDDSMEGVDS-ERRPHF
SCH9
ADIDWEALKQKKIPPPFKPHLV-----SETDTSNFDPEFTT---ASTSYMNKHQPMMTATPLSPAMQAKF
KGP1_DROME
LGFDWDGLASQLLIPPFVRPIA-----HPTDVRYFDRFPC------DLNEPPDELSGWDA--------DF
ARK2_RAT
KGIDWQYVYLRKYPPPLIPPRGEVNAADAFDIGSFDEEDTKG--IKLLDCDQDLYKNFPLMISERWQQEV
DBFB
AEINFETLRTS--SPPFIPQLD-----DETDAGYFDDFTNEEDMAKYADVFKRQNKLSAMVDDSAVDSKL
DBF2
ADINFSTLRSM--IPPFTPQLD-----SETDAGYFDDFTSEADMAKYADVFKRQDKLTAMVDDSAVSSKL
YPK1
SQLSWKRLLMKGYIPPYKPAVS-----NSMDTSNFDEEFTR---EKPIDSVVDEYLSESV------QKQF
YPK2
KDISWKKLLLKGYIPPYKPIVK-----SEIDTANFDQEFTK---EKPIDSVVDEYLSASI------QKQF
KPCA_HUMAN
RRIDWEKLENREIQPPFKPKVC------GKGAENFDKFFTR---GQPVLTPPDQLVIANID-----QSDF
KPCZ_HUMAN
RSIDWDLLEKKQALPPFQPQIT-----DDYGLDNFDTQFTS---EPVQLTPDDEDAIKRID-----QSEF
KAPA
KEVVWEKLLSRNIETPYEPPIQ----QGQGDTSQFDKYPE----EDINYGVQGEDPYADL------FRDF
KAPC
NEVIWEKLLARYIETPYEPPIQ----QGQGDTSQFDRYPE----EEFNYGIQGEDPYMDL------MKEF
KAPB
SEVVWERLLAKDIETPYEPPIT----SGIGDTSLFDQYPE----EQLDYGIQGDDPYAEY------FQDF
KS6_HUMAN
RHINWEELLARKVEPPFKPLLQ-----SEEDVSQFDSKFTR---QTPVDSP-DDSTLSESA-----NQVF
KPC1
RNINFDDILNLRVKPPYIPEIK-----SPEDTSYFEQEFTS---APPTLTPLPSVLTTSQ------QEEF
KRAC_BOVIN
ASIVWQDVYEKKLSPPFKPQVT-----SETDTRYFDEEFTA---QMITITPPDQDDSMEGVDS-ERRPHF
SCH9
ADIDWEALKQKKIPPPFKPHLV-----SETDTSNFDPEFTT---ASTSYMNKHQPMMTATPLSPAMQAKF
KGP1_DROME
LGFDWDGLASQLLIPPFVRPIA-----HPTDVRYFDRFPC------DLNEPPDELSGWDA--------DF
ARK2_RAT
KGIDWQYVYLRKYPPPLIPPRGEVNAADAFDIGSFDEEDTKG--IKLLDCDQDLYKNFPLMISERWQQEV
DBFB
AEINFETLRTS--SPPFIPQLD-----DETDAGYFDDFTNEEDMAKYADVFKRQNKLSAMVDDSAVDSKL
DBF2
ADINFSTLRSM--IPPFTPQLD-----SETDAGYFDDFTSEADMAKYADVFKRQDKLTAMVDDSAVSSKL
I have tried to solve the problem but I have not found an elegant
solution, and any suggestion will be welcomed
Cheers
Pedro Reche
--
***************************************************************************
PEDRO a. RECHE gallardo, pHD TL: 617 632 3824
Scientist, Mol.Immnunol.Foundation, FX: 617 632 3351
Dana-Farber Cancer Institute, EM: [EMAIL PROTECTED]
Harvard Medical School, URL: http://www.reche.org
44 Binney Street, D610C,
Boston, MA 02115
***************************************************************************