Skip to content

Instantly share code, notes, and snippets.

@mfcovington
Created November 27, 2012 21:25
Show Gist options
  • Save mfcovington/4157131 to your computer and use it in GitHub Desktop.
Save mfcovington/4157131 to your computer and use it in GitHub Desktop.
Perl script to filter BLAST results to separate queries that hit multiple distinct regions from those that hit a single region
#!/usr/bin/env perl
# filter_blast_rm_chimeras.pl
# Mike Covington
# created: 2012-11-26
#
# Description: filter BLAST results to separate queries that hit
# multiple distinct regions from those that hit a single region
#
use strict;
use warnings;
use autodie;
use feature 'say';
use List::Util qw(min max);
# this is a somewhat arbitrary limit for
# the ratio of: ( hit size + introns/gaps ) / query size
my $range_ratio = 5;
die "\tUsage: $0 BLAST_results_file\n" unless @ARGV == 1;
my $blast_file = $ARGV[0];
open my $blast_fh, "<", $blast_file;
# build hash of blast results (NOTE: expects BLAST format __?__!)
my %blast_hits;
while (<$blast_fh>) {
chomp;
my @delim = split /\t/;
push @{ $blast_hits{ $delim[0] }{ $delim[1] } }, {
'qry_start' => $delim[6],
'qry_stop' => $delim[7],
'hit_start' => $delim[8],
'hit_stop' => $delim[9],
'full_line' => $_,
};
}
open my $out_single_fh, ">", "$blast_file.single_region";
open my $out_multi_fh, ">", "$blast_file.multi_region";
for my $transcript ( sort { substr( $a, 3 ) <=> substr( $b, 3 ) } keys %blast_hits ) {
my ( $chr, @other_chrs ) = sort keys $blast_hits{$transcript};
# deal with queries that hit multiple chromosomes
if ( scalar keys $blast_hits{$transcript} > 1 ) {
for my $multi_chr ( $chr, @other_chrs ) {
say $out_multi_fh $blast_hits{$transcript}{$multi_chr}[$_]->{'full_line'}
for 0 .. $#{ $blast_hits{$transcript}{$multi_chr} };
}
next;
}
# deal with queries with multi hits in distant regions on single chromosome
if ( scalar @{ $blast_hits{$transcript}{$chr} } > 1 ) {
my @hit_starts_stops;
push @hit_starts_stops,
$blast_hits{$transcript}{$chr}[$_]->{'hit_start'},
$blast_hits{$transcript}{$chr}[$_]->{'hit_stop'}
for 0 .. $#{ $blast_hits{$transcript}{$chr} };
my $hit_range = max(@hit_starts_stops) - min(@hit_starts_stops);
my @qry_starts_stops;
push @qry_starts_stops,
$blast_hits{$transcript}{$chr}[$_]->{'qry_start'},
$blast_hits{$transcript}{$chr}[$_]->{'qry_stop'}
for 0 .. $#{ $blast_hits{$transcript}{$chr} };
my $qry_range = max(@qry_starts_stops) - min(@qry_starts_stops);
if ( $hit_range / $qry_range > $range_ratio ) {
say $out_multi_fh $blast_hits{$transcript}{$chr}[$_]->{'full_line'}
for 0 .. $#{ $blast_hits{$transcript}{$chr} };
next;
}
}
# write queries with single-region hits that passed filters
say $out_single_fh $blast_hits{$transcript}{$chr}[$_]->{'full_line'}
for 0 .. $#{ $blast_hits{$transcript}{$chr} };
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment