Hi again, I have corrected myself a bit. I think the script is now giving me what I want; having said that, I guess it is not the best way (even if there is more than one way). Again, any pointers are welcome. Many thanks, Nat #!/usr/bin/perl use strict; use warnings; my @seq; @seq=qw{^TGGCAGTGGAGG ^TGTCTGGCAGTG ^TG....GCAGTG TCTGTCTG TCTGGCAG GCAGTGGA TGTCTGGC ^TGTCTGGC ^..TCTGGCAGTG ^TGTCTGGCAGTG ^TGCATGGC};
# Count, for every pattern in @seq (declared earlier in the script), how many
# sequence lines of the input file match it, then append a
# "pattern<TAB>count" report sorted by descending count.
my $in_path  = '/Users/nac/Desktop/example.fastq';
my $out_path = '//Users/nac/Desktop/example.fastq_class_COUNTED2.txt';

# Three-argument open with lexical handles. The report is still opened in
# append mode, as before, so repeated runs accumulate in the same file.
open(my $in_fh,  '<',  $in_path)  or die "can't open in:$!";
open(my $out_fh, '>>', $out_path) or die "can't open out: $!";

# Compile each pattern once instead of on every input line. The pattern text
# is kept next to its compiled form because it is the key of the report.
my @compiled = map { [ $_, qr/$_/ ] } @seq;

my %final_hash;
while (my $seq_line = <$in_fh>) {
    # Keep only lines that START with a base letter. The previous test,
    # /^A|T|G|C/, parsed as (^A)|(T)|(G)|(C) and therefore accepted any line
    # containing T, G or C anywhere.
    # NOTE(review): a quality line could presumably also begin with one of
    # these letters; if the input is strict 4-line FASTQ, selecting the
    # second line of each record would be more robust -- confirm the format.
    next unless $seq_line =~ /^[ATGC]/;

    # Echo of each sequence line to STDOUT, retained from the original script.
    print $seq_line;

    for my $entry (@compiled) {
        my ($pattern_text, $pattern_re) = @{$entry};

        # No /g here: in scalar context /g leaves pos() set on $seq_line, so
        # the next pattern would start searching from the end of the previous
        # match and miss anchored or earlier occurrences.
        $final_hash{$pattern_text}++ if $seq_line =~ $pattern_re;
    }
}
close($in_fh) or die "can't close in: $!";

# Highest count first; ties are broken by pattern text so the output order is
# reproducible between runs.
for my $key (sort { $final_hash{$b} <=> $final_hash{$a} or $a cmp $b }
             keys %final_hash) {
    print {$out_fh} $key, "\t", $final_hash{$key}, "\n";
}

# Buffered write errors only surface at close, so check it.
close($out_fh) or die "can't close out: $!";
> HI, > I need your wisdom on this parsing script. I have a fastq file,this > contains info for reads ( from nextGen), 1 line starts with a @, second > contain the sequence info from which I want to count pattern, third line > with a sign, fourth with info about the sequence quality ( see attached > working example). > > I have created an array containing my patterns, > @sequences=qw{^TGGCAGTGGAGG ^TGTCTGGCAGTG ^TG....GCAGTG TCTGTCTG TCTGGCAG > GCAGTGGA TGTCTGGC ^TGTCTGGC ^..TCTGGCAGTG ^TGTCTGGCAGTG ^TGCATGGC}. Some > patterns have to be at the beginning of the sequence, some not. > I try to use the grep function to loop through a list in order to test if > the sequence match the elements from the list. then i use a hashe to count. > In the end I create an output file which contain the first line of the > fastq as keys and not at all the elements from my @sequences (class counted > attached). > I would appreciate any pointers on this, > many thanks > Nat > > > #!/usr/bin/perl > use strict; > use warnings; > > > my @sequences; > @sequences=qw{^TGGCAGTGGAGG ^TGTCTGGCAGTG ^TG....GCAGTG TCTGTCTG TCTGGCAG > GCAGTGGA TGTCTGGC ^TGTCTGGC ^..TCTGGCAGTG ^TGTCTGGCAGTG ^TGCATGGC}; > my %final_hash; > > while (<IN>) { > if (/^\@/){ > my $seq=<IN>; get the sequences > chomp; > if (grep {$seq } @sequences){# I want to test if $seq contain anything > that will match with any of the element from @sequences, this is where it > goes wrong I think. 
> if (!exists $final_hash{$_}) { > $final_hash{$_}=1; > } else { > $final_hash {$_}++; > } > } > } > } > for my $key (sort {$final_hash {$b} <=> $final_hash {$a}}keys > %final_hash){ > my $value=$final_hash{$key}; > print OUT $key,"\t",$value, "\n"; > } -- The Wellcome Trust Sanger Institute is operated by Genome Research Limited, a charity registered in England with number 1021457 and a company registered in England with number 2742969, whose registered office is 215 Euston Road, London, NW1 2BE.
@MF8V4:4:156 TGGCAGTGGAGGAAGTCTCTTTAAGAAAATAGTTTAAACAATTTGTTAAAAATTTTCCGTCTTATTTCATTTCTGTAACA GTTGATATCTGGCTGTCCA + :<:<<>;<=====>>==<:893583368-9<;:;1::35::;;6;;7;;99+7<<-;<::136999499::4:9189;;< 1::9;8851.0+...+2/2 @MF8V4:4:162 CTCTGGCAGTGGAGGAAGTCTCTTTAAGAAAATAGTTTAAACAATTTGTTAAAAAATTTTCCGTCTTATTTCATTTCTGT AACAGTTGATACTTCGTAGGCTGTCCA + 88888988885.21*205168888.833555+888.6.68-12+23*04444444&446,662/-4355606766136.4 3(+,,-*--+-(+&,+---),10++.1 @MF8V4:4:164 CTGGCAGTGGAGGAAGTCTCTTTAAGAAAATAGTTTAAACAATTTGTTAAAAAATTTTCCGTCTTATTTCATTTCTAGTT ACATATTGATATCTGTCTGTCA + 8889888887886848865565-545888,88868188056155+14+..000&233(6-+(,,,1,1-..11+22.+-+ ((+,(,&-..,,..,,131-0+ @MF8V4:4:170 TGGCAGTGGAGGAAGTCTCTTTAAGAAAATAGTTTAAACAATTTGTTAAAAATTTTCCGTTCTTAATTTCATTTCTGTAA CAGTTGATATCTGGCTGTCA + >>>>;;21,22;=;;<<9998/391034*3;9<<5<=;>>=<;265+-//0),<<,27,,(,5/2.76/58990998785 ::::<<;97==10.,85,,( @MF8V4:4:171 CTCTAGGCTGAGATATGAGGAAGTACTCTTTAAGAAAATAGTTTAAACAATTAT + 864/.+*++---,0,1,155,.,10/-+--',,,,,,&,/.,1(11+11.1211 @MF8V4:4:191 ATTCAAGATCTCTGCGAGAGTGGAAGATCTTCTTAAAGAAATAGTTTAACATTGTTAAATTTCAATTTCATTATTCTTTC TTGATTTGTGATACATTGATA + 668884886556/-+-,((+,0-0),(+,-*,,(0/),,,&(/--/&--/011-1(11+,0'-+'--&,,(/,1,0-/&( ,,(((,&+++/0/,+(-0,,, @MF8V4:4:197 CAAGATCTCTGGCAGTGAGGAAGTCCTTATAAGTAAGATTAAGATTTGTAAAACATTTTTGTAAAATTTTCCTATCATTT CTACTGAATTATATGTCTGCCA + 88958898888888,,)3823/351*0,,((+,+,((-/+-*+---'+0000&,,(,,,&(,(,,&(//&0,,(,--,-( cbi4a[nac]17: which perl /software/bin/perl cbi4a[nac]18: more test_33pbremoved.fatsq @MF8V4:4:156 TGGCAGTGGAGGAAGTCTCTTTAAGAAAATAGTTTAAACAATTTGTTAAAAATTTTCCGTCTTATTTCATTTCTGTAACA GTTGATATCTGGCTGTCCA + :<:<<>;<=====>>==<:893583368-9<;:;1::35::;;6;;7;;99+7<<-;<::136999499::4:9189;;< 1::9;8851.0+...+2/2 @MF8V4:4:162 CTCTGGCAGTGGAGGAAGTCTCTTTAAGAAAATAGTTTAAACAATTTGTTAAAAAATTTTCCGTCTTATTTCATTTCTGT AACAGTTGATACTTCGTAGGCTGTCCA + 88888988885.21*205168888.833555+888.6.68-12+23*04444444&446,662/-4355606766136.4 3(+,,-*--+-(+&,+---),10++.1 @MF8V4:4:164 
CTGGCAGTGGAGGAAGTCTCTTTAAGAAAATAGTTTAAACAATTTGTTAAAAAATTTTCCGTCTTATTTCATTTCTAGTT ACATATTGATATCTGTCTGTCA + 8889888887886848865565-545888,88868188056155+14+..000&233(6-+(,,,1,1-..11+22.+-+ ((+,(,&-..,,..,,131-0+ @MF8V4:4:170 TGGCAGTGGAGGAAGTCTCTTTAAGAAAATAGTTTAAACAATTTGTTAAAAATTTTCCGTTCTTAATTTCATTTCTGTAA CAGTTGATATCTGGCTGTCA + >>>>;;21,22;=;;<<9998/391034*3;9<<5<=;>>=<;265+-//0),<<,27,,(,5/2.76/58990998785 ::::<<;97==10.,85,,( @MF8V4:4:171 CTCTAGGCTGAGATATGAGGAAGTACTCTTTAAGAAAATAGTTTAAACAATTAT + 864/.+*++---,0,1,155,.,10/-+--',,,,,,&,/.,1(11+11.1211 @MF8V4:4:191 ATTCAAGATCTCTGCGAGAGTGGAAGATCTTCTTAAAGAAATAGTTTAACATTGTTAAATTTCAATTTCATTATTCTTTC TTGATTTGTGATACATTGATA + 668884886556/-+-,((+,0-0),(+,-*,,(0/),,,&(/--/&--/011-1(11+,0'-+'--&,,(/,1,0-/&( ,,(((,&+++/0/,+(-0,,, @MF8V4:4:197 CAAGATCTCTGGCAGTGAGGAAGTCCTTATAAGTAAGATTAAGATTTGTAAAACATTTTTGTAAAATTTTCCTATCATTT CTACTGAATTATATGTCTGCCA + 88958898888888,,)3823/351*0,,((+,+,((-/+-*+---'+0000&,,(,,,&(,(,,&(//&0,,(,--,-( (,(,((1+1(1,,22((,,(&( @MF8V4:4:199 TGGCAGTGGAGGAAGTCTCTTTAAGAAAATAGTTTAAACAATTTGTTAAAAAATTTTCCGTCTTATTTCATTTCTAGTAA CATGTTGATATCTGGTCGTCCA + ======<;8988968996344,499;<<1<<<;;+11*,73783:9788889'999+9744.6*,(2-5552,30.0/+4 3265564561/++-(--+-+*/ @MF8V4:4:210 CAAGATCTCTGGCAGTGAGGAAGTCTCTTTAGGAAAATAGTTTAAACATTTGTTAAATTTCAATTTCATTATTTCTTGTT ACATTTCATATACGTTGATA + 8914399:::::98710,3,5/0444110&,(&,..&14444/66-21+0&-/./0),-&--+-0)/--.(,,&,-((,/ --++-&+(-(,(,(,,,((, @MF8V4:4:211 CAAGATCTCTGGCAGTGGAGGAAGTCTCTTTAAGAAAATAGTTTAAACAATTTGTTAAAAATTTTCCATCTTATTTCATT TCTGTAACAGTTGATATCTGGCTGTCCA + :;99<<<<<<<<<<<::69866;<<<<<<<;<<<<<<176588(//'+6,22*.5+0122&-55(6/50144567499:: 156054676073977675//)+00.0+3 @MF8V4:4:214 TCAAGATCTCTGGCAGTGGAGGAAGTCTCTTTAAGAAAATAGTTTAAACAATTTGTTAAAAATTTTCCGTCTTATTTCAT TTCTGGTAACATGTTGATATCTGGCTGTCA + 778746268888;898886889838666,,,&,4,+,,&,11-1/44/44544/4464444(665(500++10001-144 1-14,,1-/221113544400.0*+--(,( @MF8V4:4:215 
CAAGATCTCTGGCAGTGGAGGAAGTCTCCTTAAGAAAATAGTTTCAACAATTTGTTAAAAATTTTCCATCATTATTTCAT TTCTGTTAATCAGTTGAGTATCTGCTGTCGA + ;;576;::;;;77::9:6;:::::8735(+(,22369+877770463843771-4-....&+--'+/22+,(),22-432 4.441+&1--23,63---343/.1,3,.0+0 @MF8V4:4:558 TGGCAGTGAAGAAGTCTCTTTAGAAAATAGTTTAAACATTGTTAAATTCAATTTCATTATTCTTATTACATTCTATACGT TGATA + >>>>::0,)(,(/1122355',(,77*89999065,1,(((/*//'+0//-/0(/+00.0/00&,(-//,(((++-/-,/ */-,- @MF8V4:4:580 TGGCAGCGGAGGAAGCTCTTAGAAAGAAATAGTTTAAACAGTTTTATAAATTTTCCGATCTTATTTCATTCTGTAACAGT TGATATCTGTCGTCCAC + >;;;=,,(776276,((31*--..&,-3,262.2122-20+---&+++0+.22)2+-0/--+/--(/0,,,2-2286628 776400(++(-+((*-- @MF8V4:4:602 TGGCAGTGGAGGAAGTCTCTTTAAGAAAATAGTTTAAACAATTTGTTAAAAATTTTCCGTCTTATTTCATTTCTGTAACA TGTTGATATCTGGCTGTCCA + >>>>>>>=:=>===9=;3+,.&,6669=08<<3;6;;79;7;;69::;<<<,<<</<:65080209518997:<195;91 /2;;;::40.++(+..+316 @MF8V4:4:605 TGGCAGCGGAGGAAGCCTCTTGAAGAAAATAGTTTAAACAGTTTTTTATAAGTTTTCCGTCTTATTTCACTTCTGTAACA TGTTGATATCTGGCGTTCA + <=====<<:<:<<<<9;9<:;7;;;<<</<8817*33+05345778'57748977*4-110304281699:89908:;:5 25:7/0,712..(((,&,( @MF8V4:4:607 TGGCAGTGGTGGAAGTCTCTTGAAGAAAATAGTTTAAACAATTTATTCAACATTTTCTGTCTTATTTCATTTCTGTAACA GTTGATATCTGGCT + >>>=>>>>>>>>>>>>>>>=<<=;;;<<3=;9=>.6835===<7;8;54//5568-99997998990878;3:;;;<<<= ===<===>;===9< @MF8V4:4:620 TGGCAGTGGAGGAAGTCTCCTTAAGAAAATAGTTTCAACTAATTTAGTTAAAAATTTTCCATCAATTATTTCATTTCTGA TAATCAGTTGTATATCTGACTGTCCGA + 9<::=============:<98-1--99;/:=:::6/8;657775055265222'555*6412,6267643*.+(,&(+-, ,(1221-1--00-/0.+,(----/&-/
GCAGTGGA 9 TCTGGCAG 8 ^TGGCAGTGGAGG 7 TCTGTCTG 2
-- To unsubscribe, e-mail: beginners-unsubscr...@perl.org For additional commands, e-mail: beginners-h...@perl.org http://learn.perl.org/