Created
March 1, 2013 14:54
-
-
Save avrilcoghlan/5065161 to your computer and use it in GitHub Desktop.
Perl script that finds TreeFam proteins that were added to a particular family, but actually have a stronger hmmer match to a different family.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script treefam_QC10.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 4-Feb-09. | |
# | |
# This perl script finds TreeFam proteins were added to a particular | |
# family, but actually have a stronger hmmer match to a different family. | |
# | |
# The command-line format is: | |
# % perl <treefam_QC10.pl> <release> | |
# where <release> is the release of the TreeFam database to use. | |
# | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 1) | |
{ | |
print "Usage of treefam_QC10.pl\n\n"; | |
print "perl treefam_QC10.pl <release>\n"; | |
print "where <release> is the release of the TreeFam database to use.\n"; | |
print "For example, >perl -w treefam_QC10.pl 7\n"; | |
exit; | |
} | |
#------------------------------------------------------------------# | |
# DECLARE MYSQL USERNAME AND HOST: | |
use DBI; | |
# FIND WHICH RELEASE OF THE TREEFAM DATABASE TO USE: | |
$release = $ARGV[0]; | |
#------------------------------------------------------------------# | |
$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308"; | |
$dbh = DBI->connect("$database", 'anonymous', '') || return; | |
# READ IN THE HMMER MATCHES BETWEEN TRANSCRIPTS AND HMMS FOR FAMILIES: | |
($HMM,$HMMEVAL,$HMMEVAL2,$HMMSCORE,$HMMSCORE2) = &read_hmmer_matches_to_families($dbh); | |
# GET A LIST OF ALL TRANSCRIPTS THAT WERE PUT INTO FAMILIES, AND CHECK | |
# THAT IF A TRANSCRIPT WAS PUT INTO A FAMILY, THAT IT DOESN'T HAVE | |
# A STRONGER HMMER MATCH TO ANOTHER FAMILY: | |
&read_transcripts_in_families($dbh,$HMM,$HMMEVAL,$HMMEVAL2,$HMMSCORE,$HMMSCORE2); | |
# NOW DISCONNECT FROM THE DATABASE: | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# READ IN THE HMMER MATCHES BETWEEN TRANSCRIPTS AND HMMS FOR FAMILIES: | |
sub read_hmmer_matches_to_families | |
{ | |
my $dbh = $_[0]; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $AC; | |
my $ID; | |
my $SCORE; | |
my $EVALUE; | |
my %HMM = (); | |
my %HMMEVAL = (); | |
my %HMMSCORE = (); | |
my $hmmeval; | |
my $hmmscore; | |
my %HMMEVAL2 = (); | |
my %HMMSCORE2 = (); | |
$table_w = 'hmmer_matches'; | |
$st = "SELECT AC, ID, SCORE, EVALUE from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$AC = $array[0]; | |
$ID = $array[1]; | |
$SCORE = $array[2]; | |
$EVALUE = $array[3]; | |
# RECORD THE BEST HMMER MATCH FOR THIS TRANSCRIPT: | |
if (!($HMM{$ID})) | |
{ | |
$HMM{$ID} = $AC; | |
if ($EVALUE == 0) { $EVALUE = "zero";} | |
if ($SCORE == 0) { $SCORE = "zero";} | |
$HMMEVAL{$ID} = $EVALUE; | |
$HMMSCORE{$ID} = $SCORE; | |
$HMMEVAL2{$ID."_".$AC} = $EVALUE; | |
$HMMSCORE2{$ID."_".$AC} = $SCORE; | |
} | |
else | |
{ | |
if (!($HMMEVAL{$ID})) { print STDERR "ERROR: do not know hmmeval for $ID HMM $HMM{$ID}\n"; exit;} | |
$hmmeval = $HMMEVAL{$ID}; | |
if (!($HMMSCORE{$ID})) { print STDERR "ERROR: do not know hmmscore for $ID HMM $HMM{$ID}\n"; exit;} | |
$hmmscore = $HMMSCORE{$ID}; | |
if ($hmmeval eq 'zero') { $hmmeval = 0;} | |
if ($hmmscore eq 'zero') { $hmmscore = 0;} | |
if ($EVALUE < $hmmeval || ($EVALUE == $hmmeval && $SCORE > $hmmscore)) | |
{ | |
$HMM{$ID} = $AC; | |
if ($EVALUE == 0) { $EVALUE = "zero";} | |
if ($SCORE == 0) { $SCORE = "zero";} | |
$HMMEVAL{$ID} = $EVALUE; | |
$HMMSCORE{$ID} = $SCORE; | |
$HMMEVAL2{$ID."_".$AC} = $EVALUE; | |
$HMMSCORE2{$ID."_".$AC} = $SCORE; | |
} | |
else | |
{ | |
if ($EVALUE == 0) { $EVALUE = "zero";} | |
if ($SCORE == 0) { $SCORE = "zero";} | |
$HMMEVAL2{$ID."_".$AC} = $EVALUE; | |
$HMMSCORE2{$ID."_".$AC} = $SCORE; | |
} | |
} | |
} | |
} | |
print STDERR "Read in HMM matches for transcripts\n"; | |
print "Read in HMM matches for transcripts\n"; | |
return(\%HMM,\%HMMEVAL,\%HMMEVAL2,\%HMMSCORE,\%HMMSCORE2); | |
} | |
#------------------------------------------------------------------# | |
# READ THE TRANSCRIPTS THAT WERE PUT INTO FAMILIES: | |
sub read_transcripts_in_families | |
{ | |
my $dbh = $_[0]; | |
my $HMM = $_[1]; | |
my $HMMEVAL = $_[2]; | |
my $HMMEVAL2 = $_[3]; | |
my $HMMSCORE = $_[4]; | |
my $HMMSCORE2 = $_[5]; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $FLAG; | |
my $AC; | |
my $AVeval; | |
my $ACscore; | |
my $ID; | |
my $family; | |
my $familyeval; | |
my $familyscore; | |
$table_w = 'fam_genes'; | |
$st = "SELECT AC, FLAG, ID from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$AC = $array[0]; | |
$FLAG = $array[1]; | |
$ID = $array[2]; | |
# THIS TABLE 'fam_genes' HAS 'FULL'/'SEED' FAMILIES: | |
if ($FLAG eq 'FULL') # IF THIS IS A 'FULL' TREEFAM FAMILY (NOT A 'SEED' FAMILY). | |
{ | |
# CHECK THAT THIS IS THE FAMILY THAT $ID HAS THE STRONGEST | |
# HMMER MATCH TO: | |
if ($HMM->{$ID}) | |
{ | |
$family = $HMM->{$ID}; | |
if (!($HMMEVAL->{$ID})) { print STDERR "ERROR: do not know evalue for $ID\n"; exit;} | |
if (!($HMMSCORE->{$ID})) { print STDERR "ERROR: do not know score for $ID\n"; exit;} | |
$familyeval = $HMMEVAL->{$ID}; | |
$familyscore = $HMMSCORE->{$ID}; | |
if ($familyeval eq 'zero') { $familyeval = 0;} | |
if ($familyscore eq 'zero') { $familyscore = 0;} | |
if (!($HMMEVAL2->{$ID."_".$AC})) { print STDERR "ERROR: do not know evalue for $ID $AC\n"; exit;} | |
if (!($HMMSCORE2->{$ID."_".$AC})) { print STDERR "ERROR: do not know score for $ID $AC\n"; exit;} | |
$ACeval = $HMMEVAL2->{$ID."_".$AC}; | |
$ACscore = $HMMSCORE2->{$ID."_".$AC}; | |
if ($ACeval eq 'zero') { $ACeval = 0;} | |
if ($ACscore eq 'zero') { $ACscore = 0;} | |
if ($family ne $AC) | |
{ | |
print "WARNING: $ID was added to family $AC (eval $ACeval, score $ACscore) but has stronger match to $family (eval $familyeval, score $familyscore)\n"; | |
} | |
} | |
else | |
{ | |
print "WARNING: $ID was added to family $AC but has no hmmer match to any family\n"; | |
} | |
} | |
} | |
} | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment