Created
November 29, 2012 16:55
-
-
Save radaniba/4170359 to your computer and use it in GitHub Desktop.
perl script to parse blast output
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Variables: | |
| # $NQuery - the number of query sequences | |
| # $QueryHeader{$i} - the header line for query $i | |
| # $QueryLength{$i} - the length of the query | |
| # $Database{$i} - the database searched | |
| # $DbSequences{$i} - the number of sequences in the database | |
| # $DbLength{$i} - the number of residues in the database | |
| # $Lambda{$i} - lambda factor | |
| # $Kterm{$i} - K term | |
| # $Information{$i} - expected information content of the alignment | |
| # | |
| # $NHit{$i} - the number of hits for query $i | |
| # $TargetHeader{$i,$j} - the header line of the hit $j with query $i | |
| # | |
| # $NHsp{$i,$j} - the number of HSPs for hit $j or query $i | |
| # $HspScore{$i,$j,$k} - the score of HSP $k for target $j of query $i | |
| # $HspLength{$i,$j,$k} - the length of the HSP | |
| # $HspIdent{$i,$j,$k} - the number of identities in the HSP | |
| # $HspPositive{$i,$j,$k} - the number of positives of the HSP | |
| # $HspQuerySeq{$i,$j,$k} - the query sequence of the HSP | |
| # $HspTargetSeq{$i,$j,$k} - the target sequence of the HSP | |
| # $HspQueryBegin{$i,$j,$k} - beginning of the query sequence of the HSP | |
| # $HspTargetBegin{$i,$j,$k} - beginning of the target sequence of the HSP | |
| # $HspQueryEnd{$i,$j,$k} - end of the query sequence of the HSP | |
| # $HspTargetEnd{$i,$j,$k} - end of the target sequence of the HSP | |
| # | |
| # Output formats | |
| # | |
| # verbose: | |
| # >query hits: nhit lambda: lambda K: K len: qlength dblen: dblength | |
| # :target nhsp: nhsp len: tlength lambda: lambda K: K | |
| # score qbeg qend tbeg tend length | |
| # | |
| # compact: | |
| # >query nhit lambda K qlength dblength | |
| # :target nhsp tlength lambda K | |
| # score qbeg qend tbeg tend length | |
| # | |
| # hsp: | |
| # query target score qbegin qend tbegin tend length identities positive lambda K pvalue | |
| # | |
| # cdrom: | |
| # query target score qbegin tbegin length lambda K pvalue | |
| # | |
| # min: | |
| # query target score qbegin qend tbegin tend length pvalue | |
| # | |
| # hspseq: | |
| # query target score qbegin qend tbegin tend length identities positive lambda K query-seq target-seq pvalue | |
| # | |
| # | |
| #!/usr/bin/perl | |
| # perl script to parse blast output | |
| # | |
| # Variables: | |
| # $NQuery - the number of query sequences | |
| # $QueryHeader{$i} - the header line for query $i | |
| # $QueryLength{$i} - the length of the query | |
| # $Database{$i} - the database searched | |
| # $DbSequences{$i} - the number of sequences in the database | |
| # $DbLength{$i} - the number of residues in the database | |
| # $Lambda{$i} - lambda factor | |
| # $Kterm{$i} - K term | |
| # $Information{$i} - expected information content of the alignment | |
| # | |
| # $NHit{$i} - the number of hits for query $i | |
| # $TargetHeader{$i,$j} - the header line of the hit $j with query $i | |
| # | |
| # $NHsp{$i,$j} - the number of HSPs for hit $j or query $i | |
| # $HspScore{$i,$j,$k} - the score of HSP $k for target $j of query $i | |
| # $HspLength{$i,$j,$k} - the length of the HSP | |
| # $HspIdent{$i,$j,$k} - the number of identities in the HSP | |
| # $HspPositive{$i,$j,$k} - the number of positives of the HSP | |
| # $HspQuerySeq{$i,$j,$k} - the query sequence of the HSP | |
| # $HspTargetSeq{$i,$j,$k} - the target sequence of the HSP | |
| # $HspQueryBegin{$i,$j,$k} - beginning of the query sequence of the HSP | |
| # $HspTargetBegin{$i,$j,$k} - beginning of the target sequence of the HSP | |
| # $HspQueryEnd{$i,$j,$k} - end of the query sequence of the HSP | |
| # $HspTargetEnd{$i,$j,$k} - end of the target sequence of the HSP | |
| # | |
| # Output formats | |
| # | |
| # verbose: | |
| # >query hits: nhit lambda: lambda K: K len: qlength dblen: dblength | |
| # :target nhsp: nhsp len: tlength lambda: lambda K: K | |
| # score qbeg qend tbeg tend length | |
| # | |
| # compact: | |
| # >query nhit lambda K qlength dblength | |
| # :target nhsp tlength lambda K | |
| # score qbeg qend tbeg tend length | |
| # | |
| # hsp: | |
| # query target score qbegin qend tbegin tend length identities positive lambda K pvalue | |
| # | |
| # cdrom: | |
| # query target score qbegin tbegin length lambda K pvalue | |
| # | |
| # min: | |
| # query target score qbegin qend tbegin tend length pvalue | |
| # | |
| # hspseq: | |
| # query target score qbegin qend tbegin tend length identities positive lambda K query-seq target-seq pvalue | |
| # | |
| # | |
| ($file,$mode) = @ARGV; | |
| $file = '-' unless $file; | |
| $mode = '' unless $mode; | |
| $|=1; | |
| do ReadBlastOutput($file); | |
| exit; | |
| sub PrintTarget { | |
| # print all the hsps corresponding to last target | |
| my($i)=0; | |
| ($qlocus) = split(' ',$QueryHeader{$i}); | |
| if ($mode eq 'verbose') { | |
| print "\n#>$QueryHeader{$i}\n"; | |
| printf(">%s hits: %d lambda: %6.3f K: %6.3f len: %d dblen: %d\n", | |
| $qlocus,$NHit{$i},$Lambda{$i},$Kterm{$i}, | |
| $QueryLength{$i},$DbLength{$i}); | |
| } elsif($mode eq 'compact') { | |
| printf(">%s %d %5.3f %5.3f %d %d\n", | |
| $qlocus,$NHit{$i},$Lambda{$i},$Kterm{$i}, | |
| $QueryLength{$i},$DbLength{$i}); | |
| } | |
| $lambda = $Lambda{$i}; | |
| $kterm = $Kterm{$i}; | |
| for ($j=1; $j<=$NHit{$i}; $j++) { #SRIDHAR - to get just one hit | |
| #for ($j=1; $j<=1; $j++) { | |
| #printf "$i $j $qlocus %d\n",$NHsp{$i, $j}; #SRIDHAR | |
| ($tlocus) = split(' ',$TargetHeader{$i,$j}); | |
| # if (int($tlocus) > int($qlocus)) { next; } | |
| if ($mode eq 'verbose') { | |
| print "#:$TargetHeader{$i,$j}\n"; | |
| printf(":%s hsps: %d len: %d lambda: %5.3f K: %5.3f H: %5.3f\n", | |
| $tlocus, | |
| $NHsp{$i,$j}, | |
| $TargetLength{$i,$j}, | |
| $TargetLambda{$i,$j}, | |
| $TargetK{$i,$j}, | |
| $TargetH{$i,$j} ); | |
| } elsif($mode eq 'compact') { | |
| printf(":%s %d %d %5.3f %5.3f\n", | |
| $tlocus, | |
| $NHsp{$i,$j}, | |
| $TargetLength{$i,$j}, | |
| $TargetLambda{$i,$j}, | |
| $TargetK{$i,$j} ); | |
| } | |
| if ($TargetLambda{$i,$j}>0.0) { | |
| $lambda = $TargetLambda{$i,$j}; | |
| $kterm = $TargetK{$i,$j}; | |
| } | |
| for ($k=1; $k<=$NHsp{$i,$j}; $k++) { | |
| next if ($HspPval{$i,$j,$k}> 1e-10); | |
| if($mode eq 'cdrom') { | |
| printf("%s %s %d %d %d %d %5.3f %5.3f %g\n", | |
| $qlocus,$tlocus, | |
| $HspScore{$i,$j,$k}, | |
| $HspQueryBegin{$i,$j,$k}, | |
| $HspTargetBegin{$i,$j,$k}, | |
| $HspLength{$i,$j,$k}, | |
| $lambda,$kterm,$HspPval{$i,$j,$k}); | |
| } | |
| elsif($mode eq 'min') { | |
| ($tloc,$junk) = split(":",$tlocus); | |
| printf("%s\t%9s\t%d\t%d\t%d\t%d\t%d %g\n", | |
| substr($qlocus,3,6),$tloc, | |
| $HspScore{$i,$j,$k}, | |
| $HspQueryBegin{$i,$j,$k}, | |
| $HspQueryEnd{$i,$j,$k}, | |
| $HspTargetBegin{$i,$j,$k}, | |
| $HspTargetEnd{$i,$j,$k}, | |
| $HspLength{$i,$j,$k},$HspPval{$i,$j,$k}); | |
| } | |
| elsif ($mode eq 'verbose') { | |
| printf("score: %d qbeg: %d qend: %d tbeg: %d tend: %d len: %d id: %d pos: %d %g frame: %d\n", | |
| $HspScore{$i,$j,$k}, | |
| $HspQueryBegin{$i,$j,$k}, | |
| $HspQueryEnd{$i,$j,$k}, | |
| $HspTargetBegin{$i,$j,$k}, | |
| $HspTargetEnd{$i,$j,$k}, | |
| $HspLength{$i,$j,$k}, | |
| $HspIdent{$i,$j,$k}, | |
| $HspPositive{$i,$j,$k}, | |
| $HspFrame{$i,$j,$k}, | |
| $HspPval{$i,$j,$k} ); | |
| } elsif($mode eq 'compact') { | |
| printf("%d %d %d %d %d %d %g\n", | |
| $HspScore{$i,$j,$k}, | |
| $HspQueryBegin{$i,$j,$k}, | |
| $HspQueryEnd{$i,$j,$k}, | |
| $HspTargetBegin{$i,$j,$k}, | |
| $HspTargetEnd{$i,$j,$k}, | |
| $HspLength{$i,$j,$k},$HspPval{$i,$j,$k} ); | |
| } elsif($mode eq 'hspseq') { | |
| printf("%s %s %d %d %d %d %d %d %d %d %5.3f %5.3f %s %s %d %g\n", | |
| $qlocus,$tlocus, | |
| $HspScore{$i,$j,$k}, | |
| $HspQueryBegin{$i,$j,$k}, | |
| $HspQueryEnd{$i,$j,$k}, | |
| $HspTargetBegin{$i,$j,$k}, | |
| $HspTargetEnd{$i,$j,$k}, | |
| $HspLength{$i,$j,$k}, | |
| $HspIdent{$i,$j,$k}, | |
| $HspPositive{$i,$j,$k}, | |
| $HspFrame{$i,$j,$k}, | |
| $lambda,$kterm, | |
| $HspQuerySeq{$i,$j,$k}, | |
| $HspTargetSeq{$i,$j,$k},$HspPval{$i,$j,$k}); | |
| } elsif ($mode eq 'nolocus') { | |
| printf("%d %d %d %d %d %d %d %d %d %5.3f %5.3f %g\n", | |
| $HspScore{$i,$j,$k}, | |
| $HspQueryBegin{$i,$j,$k}, | |
| $HspQueryEnd{$i,$j,$k}, | |
| $HspTargetBegin{$i,$j,$k}, | |
| $HspTargetEnd{$i,$j,$k}, | |
| $HspLength{$i,$j,$k}, | |
| $HspIdent{$i,$j,$k}, | |
| $HspPositive{$i,$j,$k}, | |
| $HspFrame{$i,$j,$k}, | |
| $lambda,$kterm,$HspPval{$i,$j,$k}); | |
| } elsif ($mode eq 'bare') { | |
| printf("%d %d %d %d %d %d %g\n", | |
| $HspScore{$i,$j,$k}, | |
| $HspQueryBegin{$i,$j,$k}, | |
| $HspQueryEnd{$i,$j,$k}, | |
| $HspTargetBegin{$i,$j,$k}, | |
| $HspTargetEnd{$i,$j,$k}, | |
| $HspLength{$i,$j,$k},$HspPval{$i,$j,$k}); | |
| } elsif ($mode eq 'hsp2') { | |
| printf("%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%5.3f\t%5.3f\t%g\n", | |
| $qlocus,$tlocus, | |
| $TargetHeader{$i,$j}, | |
| $HspScore{$i,$j,$k}, | |
| $HspQueryBegin{$i,$j,$k}, | |
| $HspQueryEnd{$i,$j,$k}, | |
| $QueryLength{$i}, | |
| $HspTargetBegin{$i,$j,$k}, | |
| $HspTargetEnd{$i,$j,$k}, | |
| $TargetLength{$i,$j}, | |
| $HspLength{$i,$j,$k}, | |
| $HspIdent{$i,$j,$k}, | |
| $HspPositive{$i,$j,$k}, | |
| $HspFrame{$i,$j,$k}, | |
| $lambda,$kterm,$HspPval{$i,$j,$k}); | |
| } | |
| else { | |
| printf "%s %s %d %d %d %d %d %d %d %d %d %d %d %g %s\n", | |
| $qlocus,$tlocus, | |
| $HspScore{$i,$j,$k}, | |
| $HspQueryBegin{$i,$j,$k}, | |
| $HspQueryEnd{$i,$j,$k}, | |
| $HspTargetBegin{$i,$j,$k}, | |
| $HspTargetEnd{$i,$j,$k}, | |
| $HspLength{$i,$j,$k}, | |
| $HspIdent{$i,$j,$k}, | |
| $HspPositive{$i,$j,$k}, | |
| $HspFrame{$i,$j,$k},$QueryLength{$i}, | |
| $TargetLength{$i,$j},$HspPval{$i,$j,$k},$database | |
| # ,$TargetHeader{$i,$j} | |
| ; | |
| } | |
| #last; # SRIDHAR ALL HSPs for a target when commented else only the best one. | |
| } | |
| } | |
| } | |
| sub PrintOutputAndClear { | |
| do PrintOutput(); | |
| &clearAll; | |
| } | |
| sub clearAll{ | |
| undef $NQuery; | |
| $NQuery = 0; | |
| undef %nhsp; | |
| undef %QueryHeader; | |
| undef %QueryLength; | |
| undef %Database; | |
| undef %DbSequences; | |
| undef %DbLength; | |
| undef %Lambda; | |
| undef %Kterm; | |
| undef %Information; | |
| undef %NHit; | |
| undef %TargetHeader; | |
| undef %NHsp; | |
| undef %HspScore; | |
| undef %HspLength; | |
| undef %HspIdent; | |
| undef %HspPositive; | |
| undef %HspQuerySeq; | |
| undef %HspTargetSeq; | |
| undef %HspQueryBegin; | |
| undef %HspTargetBegin; | |
| undef %HspQueryEnd; | |
| undef %HspTargetEnd; | |
| } | |
| sub clearTarget{ | |
| undef $NQuery; | |
| $NQuery = 0; | |
| undef %NHit; | |
| undef %TargetHeader; | |
| undef %NHsp; | |
| undef %HspScore; | |
| undef %HspLength; | |
| undef %HspIdent; | |
| undef %HspPositive; | |
| undef %HspQuerySeq; | |
| undef %HspTargetSeq; | |
| undef %HspQueryBegin; | |
| undef %HspTargetBegin; | |
| undef %HspQueryEnd; | |
| undef %HspTargetEnd; | |
| } | |
| sub ReadBlastOutput { | |
| local($file) =@_; | |
| if (!$file || $file eq "") { | |
| open(input,"-"); | |
| } else { | |
| open(input,$file) || die("Unable to open BLAST input ".$file); | |
| } | |
| $NQuery = 0;$database=""; | |
| while (<input>) { | |
| chop; | |
| s/,/ /g; | |
| @t = split(' '); | |
| ($t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10) = @t; | |
| if (!$t1) { next; } | |
| if (/^Database:\s+(\S+)/) {$database=$1;} | |
| if ($t1 eq "Query:") { | |
| if (!$HspQueryBegin{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} ) { | |
| $HspQueryBegin{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = @t[1]; | |
| } | |
| $HspQuerySeq{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} .= @t[2]; | |
| $HspQueryEnd{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = @t[3]; | |
| $_=<input>; last if (eof(input)); | |
| $_=<input>; last if (eof(input)); chop; @t=split(' '); | |
| if (@t[0] ne "Sbjct:") { | |
| print stderr "Parse error $_\n"; | |
| next; | |
| exit(200); | |
| } | |
| if (!$HspTargetBegin{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} ) { | |
| $HspTargetBegin{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = @t[1]; | |
| } | |
| $HspTargetSeq{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} .= @t[2]; | |
| $HspTargetEnd{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = @t[3]; | |
| next; | |
| } | |
| if ($t1 eq "Score") { | |
| $nhsp{$NQuery}++; | |
| $NHsp{$NQuery,$NHit{$NQuery}}++; | |
| $score = @t[2]; | |
| $score =~ s/\*//g; | |
| $HspScore{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = $score; | |
| $HspPval{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = @t[$#t]; | |
| # printf "$#t, Pval=%g\n", @t[$#t]; | |
| # Identities = 164/302 (54%), Positives = 230/302 (76%) | |
| $_=<input>; last if (eof(input)); chop; s/\// /g; | |
| @t=split(' '); | |
| $HspLength{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = @t[3]; | |
| $HspIdent{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = @t[2]; | |
| $HspPositive{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = @t[7]; | |
| $HspFrame{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = @t[$#t]; | |
| $HspQueryBegin{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = 0; | |
| $HspTargetBegin{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = 0; | |
| $HspQueryEnd{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = 0; | |
| $HspTargetEnd{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = 0; | |
| next; | |
| } | |
| if ($t1 eq "Plus" || $t1 eq "Minus") { | |
| next; | |
| } | |
| if ((substr($_,0,1) eq ">") && (substr($_,1,5) ne "Frame")) { | |
| &PrintTarget; | |
| &clearTarget; | |
| $NHit{$NQuery}++; | |
| $NHsp{$NQuery,$NHit{$NQuery}} = 0; | |
| $TargetHeader{$NQuery,$NHit{$NQuery}} = substr($_,1); | |
| $_=<input>; last if (eof(input)); chop; @t=split(' '); | |
| while(@t[0] ne "Length") { | |
| $TargetHeader{$NQuery,$NHit{$NQuery}} .= substr($_,8); | |
| $_=<input>; last if (eof(input)); chop; @t=split(' '); | |
| } | |
| $TargetLength{$NQuery,$NHit{$NQuery}} = @t[2]; | |
| $TargetLambda{$NQuery,$NHit{$NQuery}} = @t[3]; | |
| $TargetK{$NQuery,$NHit{$NQuery}} = @t[4]; | |
| $TargetH{$NQuery,$NHit{$NQuery}} = @t[5]; | |
| next; | |
| } | |
| if ($t1 eq "Database" && $t2 eq "=") { | |
| $DatabaseFile = $t3; | |
| next; | |
| } | |
| if ($t1 eq "Sequence" && $t2 eq "file") { | |
| $SequenceFile = $t4; | |
| next; | |
| } | |
| if ($t1 eq "DatabaseFile") { | |
| $DatabaseFile = $t2; | |
| next; | |
| } | |
| if ($t1 eq "SequenceFile") { | |
| $SequenceFile = $t2; | |
| next; | |
| } | |
| if ($t1 eq "Query=") { | |
| # do PrintOutputAndClear(); | |
| &PrintTarget; | |
| &clearAll; | |
| $NQuery=0; | |
| $QueryHeader{$NQuery} = substr($_,7); | |
| $_=<input>; last if (eof(input)); chop; | |
| while(substr($_,7,1) ne " ") { | |
| $QueryHeader{$NQuery} .= substr($_,7); | |
| $_=<input>; last if (eof(input)); chop; | |
| } | |
| s/\(/ /; | |
| @t = split(' '); | |
| $QueryLength{$NQuery} = @t[0]; | |
| $NHit{$NQuery} = 0; | |
| while (<input>){ | |
| chomp; # lose the echofilter output | |
| last if(!$_); | |
| } | |
| next; | |
| } | |
| if ($t1 eq "Lambda" && $t2 eq "=") { | |
| $Lambda{$NQuery} = @t[2]; | |
| next; | |
| } | |
| if ($t1 eq "K" && $t2 eq "=") { | |
| $Kterm{$NQuery} = @t[2]; | |
| $Information{$NQuery} = @t[5]; | |
| next; | |
| } | |
| if ($t1 eq "W" && $t2 eq "=") { | |
| $Wparam{$NQuery} = @t[2]; | |
| $Tparam{$NQuery} = @t[5]; | |
| $Xparam{$NQuery} = @t[10]; | |
| next; | |
| } | |
| if ($t1 eq "M" && $t2 eq "=") { | |
| $Matrix{$NQuery} = @t[2]; | |
| next; | |
| } | |
| if ($t1 eq "H" && $t2 eq "=") { | |
| $Hparam{$NQuery} = @t[2]; | |
| $Vparam{$NQuery} = @t[5]; | |
| $Bparam{$NQuery} = @t[10]; | |
| next; | |
| } | |
| if ($t1 eq "Database:") { | |
| $Database{$NQuery} = substr($_,11); | |
| $_=<input>; last if (eof(input)); | |
| s/,//; | |
| ($nq,$junk,$len) = split(' '); | |
| if ($nq eq '#') { next; } | |
| $DbSequences{$NQuery} = $nq; | |
| $DbLength{$NQuery} = $len; | |
| next; | |
| } | |
| if ($t1 eq '#') { | |
| if ($t3 eq 'residues' && $t5 eq 'query:') { | |
| $t6 =~ s/,//g; | |
| $QueryLength{$NQuery} = $t6; | |
| } | |
| if ($t3 eq 'residues' && $t5 eq 'database:') { | |
| $t6 =~ s/,//g; | |
| $DbLength{$NQuery} = $t6; | |
| } | |
| } | |
| } | |
| &PrintTarget; | |
| &clearAll; | |
| # do PrintOutputAndClear(); | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment