Hi, I have a question regarding the following script.


#!/usr/sbin/perl
if (!@ARGV) {
    print STDERR "usage: $0 alignment_file [threshold%]...\n";
    print_sets();
    exit 0;
}
my $FILE       = shift @ARGV;
my @THRESHOLD;
if (@ARGV) {
    @THRESHOLD = @ARGV;
} else {
    @THRESHOLD = (90, 80, 70, 60, 50);
}
my @ID;
my @ALIGNMENT;

read_alignment($FILE);
$n=@ID;
for ($i=0; $i<$n; $i++){
printf "%-15s %s\n", $ID[$i], join("", @{$ALIGNMENT{$ID[$i]}});
}
for ($i=0; $i<$n; $i++){
print "$ID[$i]\n";
}

sub read_alignment {
    my ($file) = @_;
    my ($id, $line, %alignment);
    local (*TMP);

    open(TMP, "< $file") or die "can't open file '$file'\n";
    while ($line = <TMP>) {
        next    if $line =~ /^CLUSTAL/;

        if ($line =~ /^([^      ]+)\s+([-a-zA-Z*.]+) *$/) {

       if (! $alignment{$1}) {

               # new sequence identifier
                push (@ID, $1);

          }


            #strip spaces,tabs,newlines: extend alignment array
            $line = $2;
            $line =~ tr/ \t\n//d;
            push (@{$ALIGNMENT{$1}}, split("", $line));

        }

    }

close TMP;
}


Heres is the question.
This script should   take an alignment with sequences spread in two or
more blocks, and print them in one single block. Se below

ClustalX

YPK1
SQLSWKRLLMKGYIPPYKPAVS-----NSMDTSNFDEEFTR---EKPIDSVVDEYLSESV
YPK2
KDISWKKLLLKGYIPPYKPIVK-----SEIDTANFDQEFTK---EKPIDSVVDEYLSASI
KPCA_HUMAN
RRIDWEKLENREIQPPFKPKVC------GKGAENFDKFFTR---GQPVLTPPDQLVIANI
KPCZ_HUMAN
RSIDWDLLEKKQALPPFQPQIT-----DDYGLDNFDTQFTS---EPVQLTPDDEDAIKRI
KAPA
KEVVWEKLLSRNIETPYEPPIQ----QGQGDTSQFDKYPE----EDINYGVQGEDPYADL
KAPC
NEVIWEKLLARYIETPYEPPIQ----QGQGDTSQFDRYPE----EEFNYGIQGEDPYMDL
KAPB
SEVVWERLLAKDIETPYEPPIT----SGIGDTSLFDQYPE----EQLDYGIQGDDPYAEY
KS6_HUMAN
RHINWEELLARKVEPPFKPLLQ-----SEEDVSQFDSKFTR---QTPVDSP-DDSTLSES
KPC1
RNINFDDILNLRVKPPYIPEIK-----SPEDTSYFEQEFTS---APPTLTPLPSVLTTSQ
KRAC_BOVIN
ASIVWQDVYEKKLSPPFKPQVT-----SETDTRYFDEEFTA---QMITITPPDQDDSMEG
SCH9
ADIDWEALKQKKIPPPFKPHLV-----SETDTSNFDPEFTT---ASTSYMNKHQPMMTAT
KGP1_DROME
LGFDWDGLASQLLIPPFVRPIA-----HPTDVRYFDRFPC------DLNEPPDELSGWDA
ARK2_RAT
KGIDWQYVYLRKYPPPLIPPRGEVNAADAFDIGSFDEEDTKG--IKLLDCDQDLYKNFPL
DBFB
AEINFETLRTS--SPPFIPQLD-----DETDAGYFDDFTNEEDMAKYADVFKRQNKLSAM
DBF2
ADINFSTLRSM--IPPFTPQLD-----SETDAGYFDDFTSEADMAKYADVFKRQDKLTAM
                               *                  *.

YPK1            ------QKQF
YPK2            ------QKQF
KPCA_HUMAN      D-----QSDF
KPCZ_HUMAN      D-----QSEF
KAPA            ------FRDF
KAPC            ------MKEF
KAPB            ------FQDF
KS6_HUMAN       A-----NQVF
KPC1            ------QEEF
KRAC_BOVIN      VDS-ERRPHF
SCH9            PLSPAMQAKF
KGP1_DROME      --------DF
ARK2_RAT        MISERWQQEV
DBFB            VDDSAVDSKL
DBF2            VDDSAVSSKL

This  should be the output.

YPK1
SQLSWKRLLMKGYIPPYKPAVS-----NSMDTSNFDEEFTR---EKPIDSVVDEYLSESV------QKQF
YPK2
KDISWKKLLLKGYIPPYKPIVK-----SEIDTANFDQEFTK---EKPIDSVVDEYLSASI------QKQF
KPCA_HUMAN
RRIDWEKLENREIQPPFKPKVC------GKGAENFDKFFTR---GQPVLTPPDQLVIANID-----QSDF
KPCZ_HUMAN
RSIDWDLLEKKQALPPFQPQIT-----DDYGLDNFDTQFTS---EPVQLTPDDEDAIKRID-----QSEF
KAPA
KEVVWEKLLSRNIETPYEPPIQ----QGQGDTSQFDKYPE----EDINYGVQGEDPYADL------FRDF
KAPC
NEVIWEKLLARYIETPYEPPIQ----QGQGDTSQFDRYPE----EEFNYGIQGEDPYMDL------MKEF
KAPB
SEVVWERLLAKDIETPYEPPIT----SGIGDTSLFDQYPE----EQLDYGIQGDDPYAEY------FQDF
KS6_HUMAN
RHINWEELLARKVEPPFKPLLQ-----SEEDVSQFDSKFTR---QTPVDSP-DDSTLSESA-----NQVF
KPC1
RNINFDDILNLRVKPPYIPEIK-----SPEDTSYFEQEFTS---APPTLTPLPSVLTTSQ------QEEF
KRAC_BOVIN
ASIVWQDVYEKKLSPPFKPQVT-----SETDTRYFDEEFTA---QMITITPPDQDDSMEGVDS-ERRPHF
SCH9
ADIDWEALKQKKIPPPFKPHLV-----SETDTSNFDPEFTT---ASTSYMNKHQPMMTATPLSPAMQAKF
KGP1_DROME
LGFDWDGLASQLLIPPFVRPIA-----HPTDVRYFDRFPC------DLNEPPDELSGWDA--------DF
ARK2_RAT
KGIDWQYVYLRKYPPPLIPPRGEVNAADAFDIGSFDEEDTKG--IKLLDCDQDLYKNFPLMISERWQQEV
DBFB
AEINFETLRTS--SPPFIPQLD-----DETDAGYFDDFTNEEDMAKYADVFKRQNKLSAMVDDSAVDSKL
DBF2
ADINFSTLRSM--IPPFTPQLD-----SETDAGYFDDFTSEADMAKYADVFKRQDKLTAMVDDSAVSSKL


However, the output is the following:


YPK1
SQLSWKRLLMKGYIPPYKPAVS-----NSMDTSNFDEEFTR---EKPIDSVVDEYLSESV------QKQF
YPK2
KDISWKKLLLKGYIPPYKPIVK-----SEIDTANFDQEFTK---EKPIDSVVDEYLSASI------QKQF
KPCA_HUMAN
RRIDWEKLENREIQPPFKPKVC------GKGAENFDKFFTR---GQPVLTPPDQLVIANID-----QSDF
KPCZ_HUMAN
RSIDWDLLEKKQALPPFQPQIT-----DDYGLDNFDTQFTS---EPVQLTPDDEDAIKRID-----QSEF
KAPA
KEVVWEKLLSRNIETPYEPPIQ----QGQGDTSQFDKYPE----EDINYGVQGEDPYADL------FRDF
KAPC
NEVIWEKLLARYIETPYEPPIQ----QGQGDTSQFDRYPE----EEFNYGIQGEDPYMDL------MKEF
KAPB
SEVVWERLLAKDIETPYEPPIT----SGIGDTSLFDQYPE----EQLDYGIQGDDPYAEY------FQDF
KS6_HUMAN
RHINWEELLARKVEPPFKPLLQ-----SEEDVSQFDSKFTR---QTPVDSP-DDSTLSESA-----NQVF
KPC1
RNINFDDILNLRVKPPYIPEIK-----SPEDTSYFEQEFTS---APPTLTPLPSVLTTSQ------QEEF
KRAC_BOVIN
ASIVWQDVYEKKLSPPFKPQVT-----SETDTRYFDEEFTA---QMITITPPDQDDSMEGVDS-ERRPHF
SCH9
ADIDWEALKQKKIPPPFKPHLV-----SETDTSNFDPEFTT---ASTSYMNKHQPMMTATPLSPAMQAKF
KGP1_DROME
LGFDWDGLASQLLIPPFVRPIA-----HPTDVRYFDRFPC------DLNEPPDELSGWDA--------DF
ARK2_RAT
KGIDWQYVYLRKYPPPLIPPRGEVNAADAFDIGSFDEEDTKG--IKLLDCDQDLYKNFPLMISERWQQEV
DBFB
AEINFETLRTS--SPPFIPQLD-----DETDAGYFDDFTNEEDMAKYADVFKRQNKLSAMVDDSAVDSKL
DBF2
ADINFSTLRSM--IPPFTPQLD-----SETDAGYFDDFTSEADMAKYADVFKRQDKLTAMVDDSAVSSKL
YPK1
SQLSWKRLLMKGYIPPYKPAVS-----NSMDTSNFDEEFTR---EKPIDSVVDEYLSESV------QKQF
YPK2
KDISWKKLLLKGYIPPYKPIVK-----SEIDTANFDQEFTK---EKPIDSVVDEYLSASI------QKQF
KPCA_HUMAN
RRIDWEKLENREIQPPFKPKVC------GKGAENFDKFFTR---GQPVLTPPDQLVIANID-----QSDF
KPCZ_HUMAN
RSIDWDLLEKKQALPPFQPQIT-----DDYGLDNFDTQFTS---EPVQLTPDDEDAIKRID-----QSEF
KAPA
KEVVWEKLLSRNIETPYEPPIQ----QGQGDTSQFDKYPE----EDINYGVQGEDPYADL------FRDF
KAPC
NEVIWEKLLARYIETPYEPPIQ----QGQGDTSQFDRYPE----EEFNYGIQGEDPYMDL------MKEF
KAPB
SEVVWERLLAKDIETPYEPPIT----SGIGDTSLFDQYPE----EQLDYGIQGDDPYAEY------FQDF
KS6_HUMAN
RHINWEELLARKVEPPFKPLLQ-----SEEDVSQFDSKFTR---QTPVDSP-DDSTLSESA-----NQVF
KPC1
RNINFDDILNLRVKPPYIPEIK-----SPEDTSYFEQEFTS---APPTLTPLPSVLTTSQ------QEEF
KRAC_BOVIN
ASIVWQDVYEKKLSPPFKPQVT-----SETDTRYFDEEFTA---QMITITPPDQDDSMEGVDS-ERRPHF
SCH9
ADIDWEALKQKKIPPPFKPHLV-----SETDTSNFDPEFTT---ASTSYMNKHQPMMTATPLSPAMQAKF
KGP1_DROME
LGFDWDGLASQLLIPPFVRPIA-----HPTDVRYFDRFPC------DLNEPPDELSGWDA--------DF
ARK2_RAT
KGIDWQYVYLRKYPPPLIPPRGEVNAADAFDIGSFDEEDTKG--IKLLDCDQDLYKNFPLMISERWQQEV
DBFB
AEINFETLRTS--SPPFIPQLD-----DETDAGYFDDFTNEEDMAKYADVFKRQNKLSAMVDDSAVDSKL
DBF2
ADINFSTLRSM--IPPFTPQLD-----SETDAGYFDDFTSEADMAKYADVFKRQDKLTAMVDDSAVSSKL



I have tried to solve the problem but I have not found an elegant
solution, and any suggestion will be welcomed
Cheers




Pedro Reche

--
***************************************************************************
PEDRO a. RECHE gallardo, pHD            TL: 617 632 3824
Scientist, Mol.Immnunol.Foundation,     FX: 617 632 3351
Dana-Farber Cancer Institute,           EM: [EMAIL PROTECTED]
Harvard Medical School,                 URL: http://www.reche.org
44 Binney Street, D610C,
Boston, MA 02115
***************************************************************************


Reply via email to