Created
January 16, 2014 16:51
-
-
Save avrilcoghlan/8458546 to your computer and use it in GitHub Desktop.
AvrilHMMUtils.pm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package HelminthGenomeAnalysis::AvrilHMMUtils; | |
use strict; | |
use warnings; | |
use Bio::Seq; | |
use Bio::SeqIO; | |
use Moose; | |
use Math::Round; # HAS THE nearest() FUNCTION | |
use Carp::Assert; # HAS THE assert() FUNCTION | |
use Scalar::Util qw(looks_like_number); | |
use base 'Exporter'; | |
our @EXPORT_OK = qw( count_hmmpfam_queries count_signif_matches read_cegma_fams ); | |
#------------------------------------------------------------------# | |
# SUBROUTINE SYNOPSIS: count_hmmpfam_queries: SUBROUTINE TO COUNT THE NUMBER OF QUERIES FOR WHICH THERE IS OUTPUT IN A hmmpfam OUTPUT FILE | |
sub count_hmmpfam_queries | |
{ | |
my $file = $_[0]; # hmmpfam OUTPUT FILE | |
my $num_queries = 0; # NUMBER OF QUERIES FOR WHICH THERE IS OUTPUT IN $file | |
my $line; # | |
my @temp; # | |
my $query; # QUERY NAME | |
my %SEEN = (); # HASH TABLE OF QUERIES WE HAVE SEEN ALREADY | |
open(FILE,"$file") || die "ERROR: count_hmmpfam_queries: cannot open $file\n"; | |
while(<FILE>) | |
{ | |
$line = $_; | |
my $substr = substr($line,0,15); | |
if (substr($line,0,15) eq 'Query sequence:') | |
{ | |
chomp $line; | |
@temp = split(/\s+/,$line); | |
$query = $temp[2]; | |
# THROW AN EXCEPTION IF WE HAVE SEEN THIS QUERY ALREADY: | |
throw Error::Simple("ERRORCODE=1: count_hmmpfam_queries: seen query $query already in $file") if ($SEEN{$query}); # TESTED FOR | |
$SEEN{$query} = 1; | |
$num_queries++; | |
} | |
} | |
close(FILE); | |
return($num_queries); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE SYNOPSIS: count_signif_matches: SUBROUTINE TO COUNT THE NUMBER OF QUERIES THAT HAVE SIGNIFICANT hmmpfam HITS (WITH EVALUE <= evalue_cutoff), AND NUMBER OF HMMs HIT WITH EVALUE <= evalue_cutoff: | |
sub count_signif_matches | |
{ | |
my $file = $_[0]; # hmmpfam OUTPUT FILE | |
my $evalue_cutoff = $_[1]; # THE EVALUE CUTOFF | |
my $CEGMA = $_[2]; # HASH TABLE OF THE 248 SINGLE CEGMA GENE FAMILIES | |
my $num_signif_matches = 0; # NUMBER OF QUERIES WITH SIGNIFICANT HITS | |
my $line; # | |
my @temp; # | |
my $seen_header = 0; # SAYS WHETHER WE HAVE SEEN THE HEADER LINE THAT APPEARS BEFORE RESULTS ARE REPORTED | |
my $lines_after_header = 0; # NUMBER OF LINES AFTER THE HEADER LINE | |
my $has_signif_match = 0; # SAYS WHETHER A PARTICULAR QUERY HAS A SIGNIFICANT MATCH | |
my $evalue; # EVALUE FOR MATCH | |
my $num_hmms_hit = 0; # NUMBER OF HMMS THAT ARE HIT WITH EVALUE <= evalue | |
my %SEEN_HMM = (); # HASH TABLE TO RECORD WHICH HMMs WE HAVE SEEN | |
my $hmm; # HMM NAME | |
open(FILE,"$file") || die "ERROR: count_signif_matches: cannot open $file\n"; | |
while(<FILE>) | |
{ | |
$line = $_; | |
chomp $line; | |
if ($line eq 'Model Description Score E-value N ') | |
{ | |
$seen_header = 1; | |
$lines_after_header = 0; | |
$has_signif_match = 0; | |
} | |
if ($seen_header == 1 && ($line eq '' || $line =~ /no hits above thresholds/)) | |
{ | |
$seen_header = 0; | |
if ($has_signif_match == 1) { $num_signif_matches++;} | |
} | |
if ($seen_header == 1) | |
{ | |
$lines_after_header++; | |
if ($lines_after_header >= 3) | |
{ | |
# eg. KOG0659 -120.6 1.6e-09 1 | |
@temp = split(/\s+/,$line); | |
$hmm = $temp[0]; | |
$evalue = $temp[2]; | |
# IF THIS IS ONE OF THE 248 SINGLE GENE CEGMA FAMILIES: | |
if ($CEGMA->{$hmm}) | |
{ | |
assert(looks_like_number($evalue)); # THIS SHOULD NEVER HAPPEN, PROGRAM WILL DIE IF IT DOES. | |
if ($evalue <= $evalue_cutoff) | |
{ | |
# RECORD THAT THE QUERY HAS A SIGNIFICANT MATCH: | |
$has_signif_match = 1; | |
# RECORD THAT THERE IS A SIGNIFICANT MATCH TO THIS HMM: | |
$SEEN_HMM{$hmm} = 1; | |
print "$hmm $evalue\n"; | |
} | |
} | |
} | |
} | |
} | |
close(FILE); | |
$num_hmms_hit = keys %SEEN_HMM; | |
return($num_signif_matches,$num_hmms_hit); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE SYNOPSIS: read_cegma_fams: READ IN THE FILE OF 248 SINGLE GENE CEGMA FAMILIES, AND RETURN THEM AS A HASH: | |
sub read_cegma_fams | |
{ | |
my $cegma_dir = $_[0]; # THE DIRECTORY WHERE CEGMA WAS INSTALLED | |
my $file; # FILE OF 248 CEGMA FAMILIES | |
my $line; # | |
my @temp; # | |
my $family; # CEGMA FAMILY | |
my %CEGMA = (); # HASH TABLE OF FAMILIES | |
my $num_families = 0; # NUMBER OF FAMILIES | |
$file = $cegma_dir."/data/completeness_cutoff.tbl"; | |
open(FILE,"$file") || die "ERROR: read_cegma_fams: cannot open $file\n"; | |
while(<FILE>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\s+/,$line); | |
$family = $temp[0]; # eg. KOG1355 | |
assert(!(defined($CEGMA{$family}))); | |
$CEGMA{$family} = 1; | |
$num_families++; | |
} | |
close(FILE); | |
assert($num_families == 248); | |
return(\%CEGMA); | |
} | |
#------------------------------------------------------------------# | |
1; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment