Created
March 1, 2013 14:44
-
-
Save avrilcoghlan/5065083 to your computer and use it in GitHub Desktop.
Perl script that finds TreeFam proteins that have a match to a family in the TreeFam mysql 'hmmer_matches' table, but where the gene does not appear in the 'genes' table.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script treefam_QC4.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 3-Feb-09. | |
# | |
# This perl script finds TreeFam proteins that have a match to a family | |
# in the treefam mysql hmmer_matches table, but where the gene does not | |
# appear in the 'genes' table. | |
# | |
# The command-line format is: | |
# % perl <treefam_QC4.pl> <release> | |
# where <release> is the release of the TreeFam database to use. | |
# | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 1) | |
{ | |
print "Usage of treefam_QC4.pl\n\n"; | |
print "perl treefam_QC4.pl <release>\n"; | |
print "where <release> is the release of the TreeFam database to use.\n"; | |
print "For example, >perl -w treefam_QC4.pl 7\n"; | |
exit; | |
} | |
#------------------------------------------------------------------# | |
# DECLARE MYSQL USERNAME AND HOST: | |
use DBI; | |
# FIND WHICH RELEASE OF THE TREEFAM DATABASE TO USE: | |
$release = $ARGV[0]; | |
#------------------------------------------------------------------# | |
# READ IN THE PROTEINS CORRESPONDING TO EACH GENE: | |
$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308"; | |
$dbh = DBI->connect("$database", 'anonymous', '') || return; | |
$GENE = &read_proteins_for_genes($dbh); | |
# GET A LIST OF ALL GENES THAT WERE PUT INTO FAMILIES: | |
$FAMILY = &read_genes_in_families($dbh,$GENE); | |
# READ IN THE HMMER MATCHES BETWEEN GENES AND HMMS FOR FAMILIES, AND | |
# CHECK THAT THE GENES WITH HMMER MATCHES TO A FAMILY APPEAR IN THE | |
# 'genes' TABLE: | |
&read_hmmer_matches_to_families($dbh,$FAMILY,$GENE); | |
# NOW DISCONNECT FROM THE DATABASE: | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# READ IN THE PROTEINS CORRESPONDING TO EACH GENE: | |
sub read_proteins_for_genes | |
{ | |
my $dbh = $_[0]; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $GID; | |
my $ID; | |
my %GENE = (); | |
$table_w = 'genes'; | |
$st = "SELECT GID, ID from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$GID = $array[0]; | |
$ID = $array[1]; | |
if ($GENE{$ID}) | |
{ | |
print STDERR "WARNING: have multiple gids for $ID\n"; | |
$GENE{$ID} = $GID; | |
} | |
else | |
{ | |
$GENE{$ID} = $GID; | |
} | |
} | |
} | |
print STDERR "Read in gene id for each protein id\n"; | |
return(\%GENE); | |
} | |
#------------------------------------------------------------------# | |
# READ IN THE HMMER MATCHES BETWEEN GENES AND HMMS FOR FAMILIES, AND | |
# CHECK THAT GENES WITH STRONG MATCHES TO A FAMILY APPEAR IN THE | |
# 'genes' TABLE: | |
sub read_hmmer_matches_to_families | |
{ | |
my $dbh = $_[0]; | |
my $FAMILY = $_[1]; | |
my $GENE = $_[2]; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $AC; | |
my $ID; | |
my $SCORE; | |
my $EVALUE; | |
my $family; | |
my $GID; | |
$table_w = 'hmmer_matches'; | |
$st = "SELECT AC, ID, SCORE, EVALUE from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$AC = $array[0]; | |
$ID = $array[1]; # eg. ENSPPYT00000017479.1 | |
$SCORE = $array[2]; | |
$EVALUE = $array[3]; | |
# FIND THE GENE ID FOR PROTEIN $ID: | |
if (!($GENE->{$ID})) | |
{ | |
print "WARNING: do not know GID for protein $ID\n"; | |
} | |
else | |
{ | |
$GID = $GENE->{$ID}; # eg. ENSPPYG00000015038 | |
} | |
} | |
} | |
} | |
#------------------------------------------------------------------# | |
# READ THE GENES THAT WERE PUT INTO FAMILIES: | |
sub read_genes_in_families | |
{ | |
my $dbh = $_[0]; | |
my $GENE = $_[1]; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $FLAG; | |
my $AC; | |
my %FAMILY = (); | |
my $GID; | |
$table_w = 'fam_genes'; | |
$st = "SELECT AC, FLAG, ID from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$AC = $array[0]; | |
$FLAG = $array[1]; | |
$ID = $array[2]; | |
# THIS TABLE 'fam_genes' HAS 'FULL'/'SEED' FAMILIES: | |
if ($FLAG eq 'FULL') # IF THIS IS A 'FULL' TREEFAM FAMILY (NOT A 'SEED' FAMILY). | |
{ | |
# FIND THE GENE ID CORRESPONDING TO THIS PROTEIN ID: | |
if (!($GENE->{$ID})) | |
{ | |
if (!($ID =~ /null/) && !($ID =~ /NULL/)) | |
{ | |
print "WARNING: do not know gene for $ID AC $AC FLAG $FLAG\n"; | |
} | |
} | |
else | |
{ | |
$GID = $GENE->{$ID}; | |
if ($FAMILY{$GID}) | |
{ | |
$FAMILY{$GID} = $FAMILY{$GID}.",".$AC; | |
} | |
else | |
{ | |
$FAMILY{$GID}= $AC; | |
} | |
} | |
} | |
} | |
} | |
print STDERR "Read in genes in each family\n"; | |
return(\%FAMILY); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment