Created
November 27, 2012 21:25
-
-
Save mfcovington/4157131 to your computer and use it in GitHub Desktop.
Perl script to filter BLAST results to separate queries that hit multiple distinct regions from those that hit a single region
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
# filter_blast_rm_chimeras.pl | |
# Mike Covington | |
# created: 2012-11-26 | |
# | |
# Description: filter BLAST results to separate queries that hit | |
# multiple distinct regions from those that hit a single region | |
# | |
use strict; | |
use warnings; | |
use autodie; | |
use feature 'say'; | |
use List::Util qw(min max); | |
# this is a somewhat arbitrary limit for | |
# the ratio of: ( hit size + introns/gaps ) / query size | |
my $range_ratio = 5; | |
die "\tUsage: $0 BLAST_results_file\n" unless @ARGV == 1; | |
my $blast_file = $ARGV[0]; | |
open my $blast_fh, "<", $blast_file; | |
# build hash of blast results (NOTE: expects BLAST format __?__!) | |
my %blast_hits; | |
while (<$blast_fh>) { | |
chomp; | |
my @delim = split /\t/; | |
push @{ $blast_hits{ $delim[0] }{ $delim[1] } }, { | |
'qry_start' => $delim[6], | |
'qry_stop' => $delim[7], | |
'hit_start' => $delim[8], | |
'hit_stop' => $delim[9], | |
'full_line' => $_, | |
}; | |
} | |
open my $out_single_fh, ">", "$blast_file.single_region"; | |
open my $out_multi_fh, ">", "$blast_file.multi_region"; | |
for my $transcript ( sort { substr( $a, 3 ) <=> substr( $b, 3 ) } keys %blast_hits ) { | |
my ( $chr, @other_chrs ) = sort keys $blast_hits{$transcript}; | |
# deal with queries that hit multiple chromosomes | |
if ( scalar keys $blast_hits{$transcript} > 1 ) { | |
for my $multi_chr ( $chr, @other_chrs ) { | |
say $out_multi_fh $blast_hits{$transcript}{$multi_chr}[$_]->{'full_line'} | |
for 0 .. $#{ $blast_hits{$transcript}{$multi_chr} }; | |
} | |
next; | |
} | |
# deal with queries with multi hits in distant regions on single chromosome | |
if ( scalar @{ $blast_hits{$transcript}{$chr} } > 1 ) { | |
my @hit_starts_stops; | |
push @hit_starts_stops, | |
$blast_hits{$transcript}{$chr}[$_]->{'hit_start'}, | |
$blast_hits{$transcript}{$chr}[$_]->{'hit_stop'} | |
for 0 .. $#{ $blast_hits{$transcript}{$chr} }; | |
my $hit_range = max(@hit_starts_stops) - min(@hit_starts_stops); | |
my @qry_starts_stops; | |
push @qry_starts_stops, | |
$blast_hits{$transcript}{$chr}[$_]->{'qry_start'}, | |
$blast_hits{$transcript}{$chr}[$_]->{'qry_stop'} | |
for 0 .. $#{ $blast_hits{$transcript}{$chr} }; | |
my $qry_range = max(@qry_starts_stops) - min(@qry_starts_stops); | |
if ( $hit_range / $qry_range > $range_ratio ) { | |
say $out_multi_fh $blast_hits{$transcript}{$chr}[$_]->{'full_line'} | |
for 0 .. $#{ $blast_hits{$transcript}{$chr} }; | |
next; | |
} | |
} | |
# write queries with single-region hits that passed filters | |
say $out_single_fh $blast_hits{$transcript}{$chr}[$_]->{'full_line'} | |
for 0 .. $#{ $blast_hits{$transcript}{$chr} }; | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment