Skip to content

Instantly share code, notes, and snippets.

@radaniba
Created November 29, 2012 16:55
Show Gist options
  • Save radaniba/4170359 to your computer and use it in GitHub Desktop.
Save radaniba/4170359 to your computer and use it in GitHub Desktop.
perl script to parse blast output
# Variables:
# $NQuery - the number of query sequences
# $QueryHeader{$i} - the header line for query $i
# $QueryLength{$i} - the length of the query
# $Database{$i} - the database searched
# $DbSequences{$i} - the number of sequences in the database
# $DbLength{$i} - the number of residues in the database
# $Lambda{$i} - lambda factor
# $Kterm{$i} - K term
# $Information{$i} - expected information content of the alignment
#
# $NHit{$i} - the number of hits for query $i
# $TargetHeader{$i,$j} - the header line of the hit $j with query $i
#
# $NHsp{$i,$j} - the number of HSPs for hit $j or query $i
# $HspScore{$i,$j,$k} - the score of HSP $k for target $j of query $i
# $HspLength{$i,$j,$k} - the length of the HSP
# $HspIdent{$i,$j,$k} - the number of identities in the HSP
# $HspPositive{$i,$j,$k} - the number of positives of the HSP
# $HspQuerySeq{$i,$j,$k} - the query sequence of the HSP
# $HspTargetSeq{$i,$j,$k} - the target sequence of the HSP
# $HspQueryBegin{$i,$j,$k} - beginning of the query sequence of the HSP
# $HspTargetBegin{$i,$j,$k} - beginning of the target sequence of the HSP
# $HspQueryEnd{$i,$j,$k} - end of the query sequence of the HSP
# $HspTargetEnd{$i,$j,$k} - end of the target sequence of the HSP
#
# Output formats
#
# verbose:
# >query hits: nhit lambda: lambda K: K len: qlength dblen: dblength
# :target nhsp: nhsp len: tlength lambda: lambda K: K
# score qbeg qend tbeg tend length
#
# compact:
# >query nhit lambda K qlength dblength
# :target nhsp tlength lambda K
# score qbeg qend tbeg tend length
#
# hsp:
# query target score qbegin qend tbegin tend length identities positive lambda K pvalue
#
# cdrom:
# query target score qbegin tbegin length lambda K pvalue
#
# min:
# query target score qbegin qend tbegin tend length pvalue
#
# hspseq:
# query target score qbegin qend tbegin tend length identities positive lambda K query-seq target-seq pvalue
#
#
#!/usr/bin/perl
# perl script to parse blast output
#
# Variables:
# $NQuery - the number of query sequences
# $QueryHeader{$i} - the header line for query $i
# $QueryLength{$i} - the length of the query
# $Database{$i} - the database searched
# $DbSequences{$i} - the number of sequences in the database
# $DbLength{$i} - the number of residues in the database
# $Lambda{$i} - lambda factor
# $Kterm{$i} - K term
# $Information{$i} - expected information content of the alignment
#
# $NHit{$i} - the number of hits for query $i
# $TargetHeader{$i,$j} - the header line of the hit $j with query $i
#
# $NHsp{$i,$j} - the number of HSPs for hit $j or query $i
# $HspScore{$i,$j,$k} - the score of HSP $k for target $j of query $i
# $HspLength{$i,$j,$k} - the length of the HSP
# $HspIdent{$i,$j,$k} - the number of identities in the HSP
# $HspPositive{$i,$j,$k} - the number of positives of the HSP
# $HspQuerySeq{$i,$j,$k} - the query sequence of the HSP
# $HspTargetSeq{$i,$j,$k} - the target sequence of the HSP
# $HspQueryBegin{$i,$j,$k} - beginning of the query sequence of the HSP
# $HspTargetBegin{$i,$j,$k} - beginning of the target sequence of the HSP
# $HspQueryEnd{$i,$j,$k} - end of the query sequence of the HSP
# $HspTargetEnd{$i,$j,$k} - end of the target sequence of the HSP
#
# Output formats
#
# verbose:
# >query hits: nhit lambda: lambda K: K len: qlength dblen: dblength
# :target nhsp: nhsp len: tlength lambda: lambda K: K
# score qbeg qend tbeg tend length
#
# compact:
# >query nhit lambda K qlength dblength
# :target nhsp tlength lambda K
# score qbeg qend tbeg tend length
#
# hsp:
# query target score qbegin qend tbegin tend length identities positive lambda K pvalue
#
# cdrom:
# query target score qbegin tbegin length lambda K pvalue
#
# min:
# query target score qbegin qend tbegin tend length pvalue
#
# hspseq:
# query target score qbegin qend tbegin tend length identities positive lambda K query-seq target-seq pvalue
#
#
($file,$mode) = @ARGV;
$file = '-' unless $file;
$mode = '' unless $mode;
$|=1;
do ReadBlastOutput($file);
exit;
sub PrintTarget {
# print all the hsps corresponding to last target
my($i)=0;
($qlocus) = split(' ',$QueryHeader{$i});
if ($mode eq 'verbose') {
print "\n#>$QueryHeader{$i}\n";
printf(">%s hits: %d lambda: %6.3f K: %6.3f len: %d dblen: %d\n",
$qlocus,$NHit{$i},$Lambda{$i},$Kterm{$i},
$QueryLength{$i},$DbLength{$i});
} elsif($mode eq 'compact') {
printf(">%s %d %5.3f %5.3f %d %d\n",
$qlocus,$NHit{$i},$Lambda{$i},$Kterm{$i},
$QueryLength{$i},$DbLength{$i});
}
$lambda = $Lambda{$i};
$kterm = $Kterm{$i};
for ($j=1; $j<=$NHit{$i}; $j++) { #SRIDHAR - to get just one hit
#for ($j=1; $j<=1; $j++) {
#printf "$i $j $qlocus %d\n",$NHsp{$i, $j}; #SRIDHAR
($tlocus) = split(' ',$TargetHeader{$i,$j});
# if (int($tlocus) > int($qlocus)) { next; }
if ($mode eq 'verbose') {
print "#:$TargetHeader{$i,$j}\n";
printf(":%s hsps: %d len: %d lambda: %5.3f K: %5.3f H: %5.3f\n",
$tlocus,
$NHsp{$i,$j},
$TargetLength{$i,$j},
$TargetLambda{$i,$j},
$TargetK{$i,$j},
$TargetH{$i,$j} );
} elsif($mode eq 'compact') {
printf(":%s %d %d %5.3f %5.3f\n",
$tlocus,
$NHsp{$i,$j},
$TargetLength{$i,$j},
$TargetLambda{$i,$j},
$TargetK{$i,$j} );
}
if ($TargetLambda{$i,$j}>0.0) {
$lambda = $TargetLambda{$i,$j};
$kterm = $TargetK{$i,$j};
}
for ($k=1; $k<=$NHsp{$i,$j}; $k++) {
next if ($HspPval{$i,$j,$k}> 1e-10);
if($mode eq 'cdrom') {
printf("%s %s %d %d %d %d %5.3f %5.3f %g\n",
$qlocus,$tlocus,
$HspScore{$i,$j,$k},
$HspQueryBegin{$i,$j,$k},
$HspTargetBegin{$i,$j,$k},
$HspLength{$i,$j,$k},
$lambda,$kterm,$HspPval{$i,$j,$k});
}
elsif($mode eq 'min') {
($tloc,$junk) = split(":",$tlocus);
printf("%s\t%9s\t%d\t%d\t%d\t%d\t%d %g\n",
substr($qlocus,3,6),$tloc,
$HspScore{$i,$j,$k},
$HspQueryBegin{$i,$j,$k},
$HspQueryEnd{$i,$j,$k},
$HspTargetBegin{$i,$j,$k},
$HspTargetEnd{$i,$j,$k},
$HspLength{$i,$j,$k},$HspPval{$i,$j,$k});
}
elsif ($mode eq 'verbose') {
printf("score: %d qbeg: %d qend: %d tbeg: %d tend: %d len: %d id: %d pos: %d %g frame: %d\n",
$HspScore{$i,$j,$k},
$HspQueryBegin{$i,$j,$k},
$HspQueryEnd{$i,$j,$k},
$HspTargetBegin{$i,$j,$k},
$HspTargetEnd{$i,$j,$k},
$HspLength{$i,$j,$k},
$HspIdent{$i,$j,$k},
$HspPositive{$i,$j,$k},
$HspFrame{$i,$j,$k},
$HspPval{$i,$j,$k} );
} elsif($mode eq 'compact') {
printf("%d %d %d %d %d %d %g\n",
$HspScore{$i,$j,$k},
$HspQueryBegin{$i,$j,$k},
$HspQueryEnd{$i,$j,$k},
$HspTargetBegin{$i,$j,$k},
$HspTargetEnd{$i,$j,$k},
$HspLength{$i,$j,$k},$HspPval{$i,$j,$k} );
} elsif($mode eq 'hspseq') {
printf("%s %s %d %d %d %d %d %d %d %d %5.3f %5.3f %s %s %d %g\n",
$qlocus,$tlocus,
$HspScore{$i,$j,$k},
$HspQueryBegin{$i,$j,$k},
$HspQueryEnd{$i,$j,$k},
$HspTargetBegin{$i,$j,$k},
$HspTargetEnd{$i,$j,$k},
$HspLength{$i,$j,$k},
$HspIdent{$i,$j,$k},
$HspPositive{$i,$j,$k},
$HspFrame{$i,$j,$k},
$lambda,$kterm,
$HspQuerySeq{$i,$j,$k},
$HspTargetSeq{$i,$j,$k},$HspPval{$i,$j,$k});
} elsif ($mode eq 'nolocus') {
printf("%d %d %d %d %d %d %d %d %d %5.3f %5.3f %g\n",
$HspScore{$i,$j,$k},
$HspQueryBegin{$i,$j,$k},
$HspQueryEnd{$i,$j,$k},
$HspTargetBegin{$i,$j,$k},
$HspTargetEnd{$i,$j,$k},
$HspLength{$i,$j,$k},
$HspIdent{$i,$j,$k},
$HspPositive{$i,$j,$k},
$HspFrame{$i,$j,$k},
$lambda,$kterm,$HspPval{$i,$j,$k});
} elsif ($mode eq 'bare') {
printf("%d %d %d %d %d %d %g\n",
$HspScore{$i,$j,$k},
$HspQueryBegin{$i,$j,$k},
$HspQueryEnd{$i,$j,$k},
$HspTargetBegin{$i,$j,$k},
$HspTargetEnd{$i,$j,$k},
$HspLength{$i,$j,$k},$HspPval{$i,$j,$k});
} elsif ($mode eq 'hsp2') {
printf("%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%5.3f\t%5.3f\t%g\n",
$qlocus,$tlocus,
$TargetHeader{$i,$j},
$HspScore{$i,$j,$k},
$HspQueryBegin{$i,$j,$k},
$HspQueryEnd{$i,$j,$k},
$QueryLength{$i},
$HspTargetBegin{$i,$j,$k},
$HspTargetEnd{$i,$j,$k},
$TargetLength{$i,$j},
$HspLength{$i,$j,$k},
$HspIdent{$i,$j,$k},
$HspPositive{$i,$j,$k},
$HspFrame{$i,$j,$k},
$lambda,$kterm,$HspPval{$i,$j,$k});
}
else {
printf "%s %s %d %d %d %d %d %d %d %d %d %d %d %g %s\n",
$qlocus,$tlocus,
$HspScore{$i,$j,$k},
$HspQueryBegin{$i,$j,$k},
$HspQueryEnd{$i,$j,$k},
$HspTargetBegin{$i,$j,$k},
$HspTargetEnd{$i,$j,$k},
$HspLength{$i,$j,$k},
$HspIdent{$i,$j,$k},
$HspPositive{$i,$j,$k},
$HspFrame{$i,$j,$k},$QueryLength{$i},
$TargetLength{$i,$j},$HspPval{$i,$j,$k},$database
# ,$TargetHeader{$i,$j}
;
}
#last; # SRIDHAR ALL HSPs for a target when commented else only the best one.
}
}
}
sub PrintOutputAndClear {
do PrintOutput();
&clearAll;
}
sub clearAll{
undef $NQuery;
$NQuery = 0;
undef %nhsp;
undef %QueryHeader;
undef %QueryLength;
undef %Database;
undef %DbSequences;
undef %DbLength;
undef %Lambda;
undef %Kterm;
undef %Information;
undef %NHit;
undef %TargetHeader;
undef %NHsp;
undef %HspScore;
undef %HspLength;
undef %HspIdent;
undef %HspPositive;
undef %HspQuerySeq;
undef %HspTargetSeq;
undef %HspQueryBegin;
undef %HspTargetBegin;
undef %HspQueryEnd;
undef %HspTargetEnd;
}
sub clearTarget{
undef $NQuery;
$NQuery = 0;
undef %NHit;
undef %TargetHeader;
undef %NHsp;
undef %HspScore;
undef %HspLength;
undef %HspIdent;
undef %HspPositive;
undef %HspQuerySeq;
undef %HspTargetSeq;
undef %HspQueryBegin;
undef %HspTargetBegin;
undef %HspQueryEnd;
undef %HspTargetEnd;
}
sub ReadBlastOutput {
local($file) =@_;
if (!$file || $file eq "") {
open(input,"-");
} else {
open(input,$file) || die("Unable to open BLAST input ".$file);
}
$NQuery = 0;$database="";
while (<input>) {
chop;
s/,/ /g;
@t = split(' ');
($t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10) = @t;
if (!$t1) { next; }
if (/^Database:\s+(\S+)/) {$database=$1;}
if ($t1 eq "Query:") {
if (!$HspQueryBegin{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} ) {
$HspQueryBegin{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = @t[1];
}
$HspQuerySeq{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} .= @t[2];
$HspQueryEnd{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = @t[3];
$_=<input>; last if (eof(input));
$_=<input>; last if (eof(input)); chop; @t=split(' ');
if (@t[0] ne "Sbjct:") {
print stderr "Parse error $_\n";
next;
exit(200);
}
if (!$HspTargetBegin{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} ) {
$HspTargetBegin{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = @t[1];
}
$HspTargetSeq{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} .= @t[2];
$HspTargetEnd{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = @t[3];
next;
}
if ($t1 eq "Score") {
$nhsp{$NQuery}++;
$NHsp{$NQuery,$NHit{$NQuery}}++;
$score = @t[2];
$score =~ s/\*//g;
$HspScore{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = $score;
$HspPval{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = @t[$#t];
# printf "$#t, Pval=%g\n", @t[$#t];
# Identities = 164/302 (54%), Positives = 230/302 (76%)
$_=<input>; last if (eof(input)); chop; s/\// /g;
@t=split(' ');
$HspLength{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = @t[3];
$HspIdent{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = @t[2];
$HspPositive{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = @t[7];
$HspFrame{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = @t[$#t];
$HspQueryBegin{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = 0;
$HspTargetBegin{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = 0;
$HspQueryEnd{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = 0;
$HspTargetEnd{$NQuery,$NHit{$NQuery},$NHsp{$NQuery,$NHit{$NQuery}}} = 0;
next;
}
if ($t1 eq "Plus" || $t1 eq "Minus") {
next;
}
if ((substr($_,0,1) eq ">") && (substr($_,1,5) ne "Frame")) {
&PrintTarget;
&clearTarget;
$NHit{$NQuery}++;
$NHsp{$NQuery,$NHit{$NQuery}} = 0;
$TargetHeader{$NQuery,$NHit{$NQuery}} = substr($_,1);
$_=<input>; last if (eof(input)); chop; @t=split(' ');
while(@t[0] ne "Length") {
$TargetHeader{$NQuery,$NHit{$NQuery}} .= substr($_,8);
$_=<input>; last if (eof(input)); chop; @t=split(' ');
}
$TargetLength{$NQuery,$NHit{$NQuery}} = @t[2];
$TargetLambda{$NQuery,$NHit{$NQuery}} = @t[3];
$TargetK{$NQuery,$NHit{$NQuery}} = @t[4];
$TargetH{$NQuery,$NHit{$NQuery}} = @t[5];
next;
}
if ($t1 eq "Database" && $t2 eq "=") {
$DatabaseFile = $t3;
next;
}
if ($t1 eq "Sequence" && $t2 eq "file") {
$SequenceFile = $t4;
next;
}
if ($t1 eq "DatabaseFile") {
$DatabaseFile = $t2;
next;
}
if ($t1 eq "SequenceFile") {
$SequenceFile = $t2;
next;
}
if ($t1 eq "Query=") {
# do PrintOutputAndClear();
&PrintTarget;
&clearAll;
$NQuery=0;
$QueryHeader{$NQuery} = substr($_,7);
$_=<input>; last if (eof(input)); chop;
while(substr($_,7,1) ne " ") {
$QueryHeader{$NQuery} .= substr($_,7);
$_=<input>; last if (eof(input)); chop;
}
s/\(/ /;
@t = split(' ');
$QueryLength{$NQuery} = @t[0];
$NHit{$NQuery} = 0;
while (<input>){
chomp; # lose the echofilter output
last if(!$_);
}
next;
}
if ($t1 eq "Lambda" && $t2 eq "=") {
$Lambda{$NQuery} = @t[2];
next;
}
if ($t1 eq "K" && $t2 eq "=") {
$Kterm{$NQuery} = @t[2];
$Information{$NQuery} = @t[5];
next;
}
if ($t1 eq "W" && $t2 eq "=") {
$Wparam{$NQuery} = @t[2];
$Tparam{$NQuery} = @t[5];
$Xparam{$NQuery} = @t[10];
next;
}
if ($t1 eq "M" && $t2 eq "=") {
$Matrix{$NQuery} = @t[2];
next;
}
if ($t1 eq "H" && $t2 eq "=") {
$Hparam{$NQuery} = @t[2];
$Vparam{$NQuery} = @t[5];
$Bparam{$NQuery} = @t[10];
next;
}
if ($t1 eq "Database:") {
$Database{$NQuery} = substr($_,11);
$_=<input>; last if (eof(input));
s/,//;
($nq,$junk,$len) = split(' ');
if ($nq eq '#') { next; }
$DbSequences{$NQuery} = $nq;
$DbLength{$NQuery} = $len;
next;
}
if ($t1 eq '#') {
if ($t3 eq 'residues' && $t5 eq 'query:') {
$t6 =~ s/,//g;
$QueryLength{$NQuery} = $t6;
}
if ($t3 eq 'residues' && $t5 eq 'database:') {
$t6 =~ s/,//g;
$DbLength{$NQuery} = $t6;
}
}
}
&PrintTarget;
&clearAll;
# do PrintOutputAndClear();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment