Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created January 16, 2014 16:51
Show Gist options
  • Save avrilcoghlan/8458546 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/8458546 to your computer and use it in GitHub Desktop.
AvrilHMMUtils.pm
package HelminthGenomeAnalysis::AvrilHMMUtils;
use strict;
use warnings;
use Bio::Seq;
use Bio::SeqIO;
use Moose;
use Math::Round; # HAS THE nearest() FUNCTION
use Carp::Assert; # HAS THE assert() FUNCTION
use Scalar::Util qw(looks_like_number);
use base 'Exporter';
our @EXPORT_OK = qw( count_hmmpfam_queries count_signif_matches read_cegma_fams );
#------------------------------------------------------------------#
# SUBROUTINE SYNOPSIS: count_hmmpfam_queries: SUBROUTINE TO COUNT THE NUMBER OF QUERIES FOR WHICH THERE IS OUTPUT IN A hmmpfam OUTPUT FILE
sub count_hmmpfam_queries
{
my $file = $_[0]; # hmmpfam OUTPUT FILE
my $num_queries = 0; # NUMBER OF QUERIES FOR WHICH THERE IS OUTPUT IN $file
my $line; #
my @temp; #
my $query; # QUERY NAME
my %SEEN = (); # HASH TABLE OF QUERIES WE HAVE SEEN ALREADY
open(FILE,"$file") || die "ERROR: count_hmmpfam_queries: cannot open $file\n";
while(<FILE>)
{
$line = $_;
my $substr = substr($line,0,15);
if (substr($line,0,15) eq 'Query sequence:')
{
chomp $line;
@temp = split(/\s+/,$line);
$query = $temp[2];
# THROW AN EXCEPTION IF WE HAVE SEEN THIS QUERY ALREADY:
throw Error::Simple("ERRORCODE=1: count_hmmpfam_queries: seen query $query already in $file") if ($SEEN{$query}); # TESTED FOR
$SEEN{$query} = 1;
$num_queries++;
}
}
close(FILE);
return($num_queries);
}
#------------------------------------------------------------------#
# SUBROUTINE SYNOPSIS: count_signif_matches: SUBROUTINE TO COUNT THE NUMBER OF QUERIES THAT HAVE SIGNIFICANT hmmpfam HITS (WITH EVALUE <= evalue_cutoff), AND NUMBER OF HMMs HIT WITH EVALUE <= evalue_cutoff:
sub count_signif_matches
{
my $file = $_[0]; # hmmpfam OUTPUT FILE
my $evalue_cutoff = $_[1]; # THE EVALUE CUTOFF
my $CEGMA = $_[2]; # HASH TABLE OF THE 248 SINGLE CEGMA GENE FAMILIES
my $num_signif_matches = 0; # NUMBER OF QUERIES WITH SIGNIFICANT HITS
my $line; #
my @temp; #
my $seen_header = 0; # SAYS WHETHER WE HAVE SEEN THE HEADER LINE THAT APPEARS BEFORE RESULTS ARE REPORTED
my $lines_after_header = 0; # NUMBER OF LINES AFTER THE HEADER LINE
my $has_signif_match = 0; # SAYS WHETHER A PARTICULAR QUERY HAS A SIGNIFICANT MATCH
my $evalue; # EVALUE FOR MATCH
my $num_hmms_hit = 0; # NUMBER OF HMMS THAT ARE HIT WITH EVALUE <= evalue
my %SEEN_HMM = (); # HASH TABLE TO RECORD WHICH HMMs WE HAVE SEEN
my $hmm; # HMM NAME
open(FILE,"$file") || die "ERROR: count_signif_matches: cannot open $file\n";
while(<FILE>)
{
$line = $_;
chomp $line;
if ($line eq 'Model Description Score E-value N ')
{
$seen_header = 1;
$lines_after_header = 0;
$has_signif_match = 0;
}
if ($seen_header == 1 && ($line eq '' || $line =~ /no hits above thresholds/))
{
$seen_header = 0;
if ($has_signif_match == 1) { $num_signif_matches++;}
}
if ($seen_header == 1)
{
$lines_after_header++;
if ($lines_after_header >= 3)
{
# eg. KOG0659 -120.6 1.6e-09 1
@temp = split(/\s+/,$line);
$hmm = $temp[0];
$evalue = $temp[2];
# IF THIS IS ONE OF THE 248 SINGLE GENE CEGMA FAMILIES:
if ($CEGMA->{$hmm})
{
assert(looks_like_number($evalue)); # THIS SHOULD NEVER HAPPEN, PROGRAM WILL DIE IF IT DOES.
if ($evalue <= $evalue_cutoff)
{
# RECORD THAT THE QUERY HAS A SIGNIFICANT MATCH:
$has_signif_match = 1;
# RECORD THAT THERE IS A SIGNIFICANT MATCH TO THIS HMM:
$SEEN_HMM{$hmm} = 1;
print "$hmm $evalue\n";
}
}
}
}
}
close(FILE);
$num_hmms_hit = keys %SEEN_HMM;
return($num_signif_matches,$num_hmms_hit);
}
#------------------------------------------------------------------#
# SUBROUTINE SYNOPSIS: read_cegma_fams: READ IN THE FILE OF 248 SINGLE GENE CEGMA FAMILIES, AND RETURN THEM AS A HASH:
sub read_cegma_fams
{
my $cegma_dir = $_[0]; # THE DIRECTORY WHERE CEGMA WAS INSTALLED
my $file; # FILE OF 248 CEGMA FAMILIES
my $line; #
my @temp; #
my $family; # CEGMA FAMILY
my %CEGMA = (); # HASH TABLE OF FAMILIES
my $num_families = 0; # NUMBER OF FAMILIES
$file = $cegma_dir."/data/completeness_cutoff.tbl";
open(FILE,"$file") || die "ERROR: read_cegma_fams: cannot open $file\n";
while(<FILE>)
{
$line = $_;
chomp $line;
@temp = split(/\s+/,$line);
$family = $temp[0]; # eg. KOG1355
assert(!(defined($CEGMA{$family})));
$CEGMA{$family} = 1;
$num_families++;
}
close(FILE);
assert($num_families == 248);
return(\%CEGMA);
}
#------------------------------------------------------------------#
1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment