Created
March 1, 2013 14:39
-
-
Save avrilcoghlan/5065052 to your computer and use it in GitHub Desktop.
Perl script that finds TreeFam proteins that have a strong match to a family in the TreeFam mysql hmmer_matches table, but where the gene was not added to the 'fam_genes' table for the family or to any family.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script treefam_QC1.pl | |
# Written by Avril Coghlan (a.coghlan:ucc.ie) | |
# 9-Dec-08. | |
# Edited 9-Jan-09. | |
# | |
# This perl script finds TreeFam proteins that have a strong match to a family | |
# in the treefam mysql hmmer_matches table, but where the gene was not added to the | |
# 'fam_genes' table for the family or to any family. | |
# | |
# The command-line format is: | |
# % perl <treefam_QC1.pl> <release> | |
# where <release> is the release of the TreeFam database to use. | |
# | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 1) | |
{ | |
print "Usage of treefam_QC1.pl\n\n"; | |
print "perl treefam_QC1.pl <release>\n"; | |
print "where <release> is the release of the TreeFam database to use.\n"; | |
print "For example, >perl -w treefam_QC1.pl 7\n"; | |
exit; | |
} | |
#------------------------------------------------------------------# | |
# DECLARE MYSQL USERNAME AND HOST: | |
use DBI; | |
# FIND WHICH RELEASE OF THE TREEFAM DATABASE TO USE: | |
$release = $ARGV[0]; | |
#------------------------------------------------------------------# | |
# READ IN THE PROTEINS CORRESPONDING TO EACH GENE: | |
$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308"; | |
$dbh = DBI->connect("$database", 'anonymous', '') || return; | |
$GENE = &read_proteins_for_genes($dbh); | |
# GET A LIST OF ALL GENES THAT WERE PUT INTO FAMILIES: | |
$FAMILY = &read_genes_in_families($dbh,$GENE); | |
# READ IN THE HMMER MATCHES BETWEEN GENES AND HMMS FOR FAMILIES, AND | |
# CHECK THAT GENES WITH STRONG HMMER MATCHES TO A FAMILY APPEAR IN THAT | |
# FAMILY IN THE 'fam_genes' TABLE: | |
&read_hmmer_matches_to_families($dbh,$FAMILY,$GENE); | |
# NOW DISCONNECT FROM THE DATABASE: | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# READ IN THE PROTEINS CORRESPONDING TO EACH GENE: | |
sub read_proteins_for_genes | |
{ | |
my $dbh = $_[0]; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $GID; | |
my $ID; | |
my %GENE = (); | |
$table_w = 'genes'; | |
$st = "SELECT GID, ID from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$GID = $array[0]; | |
$ID = $array[1]; | |
if ($GENE{$ID}) | |
{ | |
print STDERR "WARNING: have multiple gids for $ID\n"; | |
$GENE{$ID} = $GID; | |
} | |
else | |
{ | |
$GENE{$ID} = $GID; | |
} | |
} | |
} | |
print STDERR "Read in gene id for each protein id\n"; | |
return(\%GENE); | |
} | |
#------------------------------------------------------------------# | |
# READ IN THE HMMER MATCHES BETWEEN GENES AND HMMS FOR FAMILIES, AND | |
# CHECK THAT GENES WITH STRONG HMMER MATCHES TO A FAMILY APPEAR IN THAT | |
# FAMILY IN THE 'fam_genes' TABLE: | |
sub read_hmmer_matches_to_families | |
{ | |
my $dbh = $_[0]; | |
my $FAMILY = $_[1]; | |
my $GENE = $_[2]; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $AC; | |
my $ID; | |
my $SCORE; | |
my $EVALUE; | |
my $family; | |
my $evalue_cutoff = 1e-10; # xxx | |
my $GID; | |
$table_w = 'hmmer_matches'; | |
$st = "SELECT AC, ID, SCORE, EVALUE from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$AC = $array[0]; | |
$ID = $array[1]; | |
$SCORE = $array[2]; | |
$EVALUE = $array[3]; | |
# FIND THE GENE ID FOR PROTEIN $ID: | |
if (!($GENE->{$ID})) | |
{ | |
print STDERR "WARNING: do not know GID for protein $ID\n"; | |
} | |
else | |
{ | |
$GID = $GENE->{$ID}; | |
# CHECK IF THE GENE $ID WAS NOT PUT INTO A FAMILY: | |
if (!($FAMILY->{$GID})) | |
{ | |
# IF THE EVALUE IS VERY SIGNIFICANT FOR THE HMMER MATCH, | |
# WE SHOULD PROBABLY HAVE PUT THIS GENE INTO FAMILY $AC: | |
if ($EVALUE <= $evalue_cutoff) | |
{ | |
print STDERR "WARNING: Should have put $ID (gene $GID) into family $AC (hmmer e-value $EVALUE, score $SCORE)\n"; | |
} | |
} | |
} | |
} | |
} | |
} | |
#------------------------------------------------------------------# | |
# READ THE GENES THAT WERE PUT INTO FAMILIES: | |
sub read_genes_in_families | |
{ | |
my $dbh = $_[0]; | |
my $GENE = $_[1]; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $FLAG; | |
my $AC; | |
my %FAMILY = (); | |
my $GID; | |
my @temp; | |
my $i; | |
$table_w = 'fam_genes'; | |
$st = "SELECT AC, FLAG, ID from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$AC = $array[0]; | |
$FLAG = $array[1]; | |
$ID = $array[2]; # eg. AAEL009572-RA.1_AEDAE in TreeFam-8 | |
@temp = split(/_/,$ID); | |
$ID = ""; | |
for ($i = 0; $i <= $#temp-1; $i++) { $ID = $ID."_".$temp[$i];} | |
$ID = substr($ID,1,length($ID)-1); | |
# THIS TABLE 'fam_genes' HAS 'FULL'/'SEED' FAMILIES: | |
if ($FLAG eq 'FULL') # IF THIS IS A 'FULL' TREEFAM FAMILY (NOT A 'SEED' FAMILY). | |
{ | |
# FIND THE GENE ID CORRESPONDING TO THIS PROTEIN ID: | |
if (!($GENE->{$ID})) { print STDERR "WARNING: do not know gene for $ID AC $AC FLAG $FLAG\n"; } | |
else | |
{ | |
$GID = $GENE->{$ID}; | |
if ($FAMILY{$GID}) | |
{ | |
$FAMILY{$GID} = $FAMILY{$GID}.",".$AC; | |
} | |
else | |
{ | |
$FAMILY{$GID}= $AC; | |
} | |
} | |
} | |
} | |
} | |
print STDERR "Read in genes in each family\n"; | |
return(\%FAMILY); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment