Created
January 16, 2014 16:49
-
-
Save avrilcoghlan/8458496 to your computer and use it in GitHub Desktop.
Perl script that takes a hmmpfam output file, and counts the number of queries that have hits with evalue <= evalue_cutoff, and number of HMMs hit with evalue <= evalue_cutoff.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
=head1 NAME | |
parse_hmmpfam_output.pl | |
=head1 SYNOPSIS | |
parse_hmmpfam_output.pl hmmpfam protein_fasta evalue_cutoff total_num_hmms cegma_dir | |
where hmmpfam is the hmmpfam output file, | |
protein_fasta is the fasta file of proteins for your species, | |
evalue_cutoff is the evalue cutoff to use, | |
total_num_hmms is the number of HMMs searched against (usually 458 for CEGMA KOGs), xxx | |
cegma_dir is the directory where CEGMA was installed. | |
=head1 DESCRIPTION | |
This script takes a hmmpfam output file, and counts the number of queries that have hits with evalue <= evalue_cutoff, | |
and number of HMMs hit with evalue <= evalue_cutoff. | |
=head1 VERSION | |
Perl script last edited 24-Jul-2013. | |
=head1 CONTACT | |
[email protected] (Avril Coghlan) | |
=cut | |
# | |
# Perl script parse_hmmpfam_output.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 24-Jul-13. | |
# Last edited 24-Jul-2013. | |
# SCRIPT SYNOPSIS: parse_hmmpfam_output.pl: given a hmmpfam output file, counts the number of queries with hits of evalue <= evalue_cutoff, and number of HMMs hit with evalue <= evalue_cutoff. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
use strict; | |
use warnings; | |
# xxx | |
BEGIN { | |
unshift (@INC, '/nfs/users/nfs_a/alc/Documents/git/helminth_scripts/modules'); | |
} | |
use HelminthGenomeAnalysis::AvrilHMMUtils; | |
use HelminthGenomeAnalysis::AvrilFastaUtils; | |
use Carp::Assert; # HAS THE assert() FUNCTION | |
use Math::Round; | |
my $num_args = $#ARGV + 1; | |
if ($num_args != 5) | |
{ | |
print "Usage of parse_hmmpfam_output.pl\n\n"; | |
print "perl parse_hmmpfam_output.pl <hmmpfam> <protein_fasta> <evalue_cutoff> <total_num_hmms> <cegma_dir>\n"; | |
print "where <hmmpfam> is the hmmpfam output file,\n"; | |
print " <protein_fasta> is the fasta file of proteins for your species,\n"; | |
print " <evalue_cutoff> is the e-value cutoff to use,\n"; | |
print " <total_number_hmms> is the total number of HMMs searched against (usually 458 for CEGMA KOGs),\n"; # xxx don't need? | |
print " <cegma_dir> is the directory where CEGMA was installed\n"; | |
print "For example, >perl parse_hmmpfam_output.pl ANCCEY_cegma.output ANCCEY.protein.fa 1e-05 458\n"; | |
print "/nfs/users/nfs_m/mz3/bin/cegma_v2/\n"; | |
exit; | |
} | |
# FIND THE PATH TO THE INPUT HMMPFAM FILE: | |
my $hmmpfam = $ARGV[0]; | |
# FIND THE FASTA FILE OF PROTEINS FOR YOUR SPECIES: | |
my $protein_fasta = $ARGV[1]; | |
# FIND THE EVALUE CUTOFF TO USE: | |
my $evalue_cutoff = $ARGV[2]; | |
# FIND THE TOTAL NUMBER OF HMMs SEARCHED AGAINST: | |
my $total_num_hmms = $ARGV[3]; | |
# FIND THE DIRECTORY WHERE CEGMA WAS INSTALLED: | |
my $cegma_dir = $ARGV[4]; | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
&run_main_program($hmmpfam,$protein_fasta,$evalue_cutoff,$total_num_hmms,$cegma_dir); | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
sub run_main_program | |
{ | |
my $hmmpfam = $_[0]; # THE INPUT HMMPFAM FILE | |
my $protein_fasta = $_[1]; # THE FASTA FILE OF PROTEIN SEQUENCES | |
my $evalue_cutoff = $_[2]; # THE EVALUE CUTOFF TO USE | |
my $total_num_hmms = $_[3]; # TOTAL NUMBER OF HMMs SEARCHED AGAINST xxx need? | |
my $cegma_dir = $_[4]; # DIRECTORY WHERE CEGMA WAS INSTALLED | |
my $protein_fasta_obj; # OBJECT FOR THE INPUT FASTA FILE OF PROTEIN SEQUENCES | |
my $num_hmmpfam_queries; # NUMBER OF QUERIES SEEN IN $hmmpfam | |
my $num_protein_queries; # NUMBER OF PROTEIN SEQUENCES IN $protein_fasta | |
my $num_signif_matches; # NUMBER OF QUERIES WITH SIGNIFICANT hmmpfam HITS | |
my $num_hmms_hit; # NUMBER OF HMMs THAT ARE HIT WITH SIGNIFICANT MATCHES | |
my $pc_hmms_hit; # PERCENT OF $total_num_hmms THAT ARE HIT WITH SIGNIFICANT MATCHES | |
my $CEGMA; # HASH TABLE OF SINGLE GENE CEGMA FAMILIES | |
my $num_conserved_hmms; # NUMBER OF SINGLE GENE CEGMA FAMILIES | |
# READ IN THE FILE OF 248 SINGLE GENE CEGMA FAMILIES: | |
$CEGMA = HelminthGenomeAnalysis::AvrilHMMUtils::read_cegma_fams($cegma_dir); | |
$num_conserved_hmms = keys (%{$CEGMA}); | |
assert($num_conserved_hmms == 248); | |
# READ IN THE SEQUENCES IN THE INPUT FASTA FILE OF PROTEIN SEQUENCES: | |
$protein_fasta_obj = HelminthGenomeAnalysis::AvrilFastaUtils->new(fasta_file => $protein_fasta); | |
# COUNT THE NUMBER OF SEQUENCES IN THE INPUT FASTA FILE OF PROTEIN SEQUENCES: | |
$num_protein_queries = HelminthGenomeAnalysis::AvrilFastaUtils::count_number_seqs($protein_fasta_obj); | |
# COUNT THE NUMBER OF QUERIES IN THE INPUT HMMPFAM FILE: | |
$num_hmmpfam_queries = HelminthGenomeAnalysis::AvrilHMMUtils::count_hmmpfam_queries($hmmpfam,$evalue_cutoff); | |
# CHECK $num_protein_queries IS EQUAL TO $num_hmmpfam_queries: | |
if ($num_protein_queries != $num_hmmpfam_queries) | |
{ | |
print STDERR "ERROR: run_main_program: have $num_protein_queries in $protein_fasta, but $num_hmmpfam_queries queries in $hmmpfam, so hmmpfam may not have run to completion!\n"; | |
} | |
# COUNT THE NUMBER OF QUERIES THAT HAVE SIGNIFICANT MATCHES IN THE $hmmpfam FILE, AND NUMBER OF HMMs THAT ARE HIT WITH SIGNIFICANT MATCHES: | |
($num_signif_matches,$num_hmms_hit) = HelminthGenomeAnalysis::AvrilHMMUtils::count_signif_matches($hmmpfam,$evalue_cutoff,$CEGMA); | |
$pc_hmms_hit = $num_hmms_hit*100/$num_conserved_hmms; | |
$pc_hmms_hit = nearest(0.1, $pc_hmms_hit); # round to 0.1 precision | |
print "$hmmpfam: $num_signif_matches query sequences have significant matches (with e-value <= $evalue_cutoff), and $num_hmms_hit ($pc_hmms_hit %) HMMs were hit with e-value <= $evalue_cutoff\n"; | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment