Created
March 1, 2013 14:42
-
-
Save avrilcoghlan/5065069 to your computer and use it in GitHub Desktop.
Perl script that finds cases where a TreeFam family is listed in the TreeFam mysql database, but has no genes listed in the 'fam_genes' table.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script treefam_QC3.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 3-Feb-09. | |
# | |
# This perl script finds cases where there is a TreeFam family listed in the | |
# 'familyA' or 'familyB' or 'familyC' tables, but it has no genes listed in | |
# the 'fam_genes' table. | |
# | |
# The command-line format is: | |
# % perl <treefam_QC3.pl> <release> | |
# where <release> is the release of the TreeFam database to use. | |
# | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 1) | |
{ | |
print "Usage of treefam_QC3.pl\n\n"; | |
print "perl treefam_QC3.pl <release>\n"; | |
print "where <release> is the release of the TreeFam database to use.\n"; | |
print "For example, >perl -w treefam_QC3.pl 7\n"; | |
exit; | |
} | |
#------------------------------------------------------------------# | |
# DECLARE MYSQL USERNAME AND HOST: | |
use DBI; | |
# FIND WHICH RELEASE OF THE TREEFAM DATABASE TO USE: | |
$release = $ARGV[0]; | |
#------------------------------------------------------------------# | |
# READ IN THE FAMILIIES THAT APPEAR IN THE 'fam_genes' TABLE: | |
$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308"; | |
$dbh = DBI->connect("$database", 'anonymous', '') || return; | |
$FAMILY = &read_genes_in_families($dbh); | |
# FIND IF THERE ARE FAMILIES IN THE FAMILYA/FAMILYB/FAMILYC TABLES, | |
# THAT DO NOT APPEAR IN THE fam_genes TABLE: | |
&read_all_families($dbh,$FAMILY); | |
# NOW DISCONNECT FROM THE DATABASE: | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# FIND IF THERE ARE FAMILIES IN THE FAMILYA/FAMILYB/FAMILYC TABLES, | |
# THAT DO NOT APPEAR IN THE fam_genes TABLE: | |
sub read_all_families | |
{ | |
my $dbh = $_[0]; | |
my $FAMILY = $_[1]; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $i; | |
my %SEEN; | |
my $key; | |
my @temp; | |
for ($i = 1; $i <= 3; $i++) | |
{ | |
if ($i == 1) { $table_w = 'familyA';} | |
elsif ($i == 2) { $table_w = 'familyB';} | |
elsif ($i == 3) { $table_w = 'familyC';} | |
$st = "SELECT AC from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$AC = $array[0]; | |
$SEEN{$AC} = 1; | |
# CHECK IF GENES IN THE SEED FAMILY FOR $AC APPEAR IN THE | |
# fam_genes TABLE. NOTE THAT THE GENES IN THE SEED FAMILY ARE | |
# ONLY STORED FOR TREEFAM-A (TF1XXXXX) FAMILIES: | |
if (substr($AC,0,3) eq 'TF1') | |
{ | |
if (!($FAMILY->{$AC."=SEED"})) | |
{ | |
print "WARNING: the fam_genes table doesn't contain genes in the $AC SEED family\n"; | |
} | |
} | |
# CHECK IF GENES IN THE FULL FAMILY FOR $AC APPEAR IN THE | |
# fam_genes TABLE: | |
if (!($FAMILY->{$AC."=FULL"})) | |
{ | |
print "WARNING: the fam_genes table doesn't contain genes in the $AC FULL family\n"; | |
} | |
} | |
} | |
} | |
# CHECK IF THERE ARE GENES FOR FAMILIES THAT APPEAR IN THE fam_genes | |
# TABLE, BUT WHERE THE FAMILY DOESN'T APPEAR IN THE familyA OR familyB | |
# OR familyC TABLES: | |
foreach $key (keys %{$FAMILY}) | |
{ | |
@temp = split(/=/,$key); | |
$AC = $temp[0]; | |
if (!($SEEN{$AC})) | |
{ | |
print "WARNING: family $AC appears in the fam_genes table but not in the familyA/familyB/familyC tables\n"; | |
} | |
} | |
} | |
#------------------------------------------------------------------# | |
# READ THE GENES THAT WERE PUT INTO FAMILIES: | |
sub read_genes_in_families | |
{ | |
my $dbh = $_[0]; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $AC; | |
my $FLAG; | |
my %FAMILY = (); | |
my $ID; | |
$table_w = 'fam_genes'; | |
$st = "SELECT AC, FLAG, ID from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$AC = $array[0]; | |
$FLAG = $array[1]; | |
$ID = $array[2]; | |
# SOMETIMES $ID CAN BE '_null_;;' (eg. see TF328536 in TREEFAM-7): | |
if (!($ID =~ /null/) && !($ID =~ /NULL/)) | |
{ | |
# THIS TABLE 'fam_genes' HAS 'FULL'/'SEED' FAMILIES: | |
$FAMILY{$AC."=".$FLAG} = 1; | |
} | |
} | |
} | |
print STDERR "Read in genes in each family\n"; | |
return(\%FAMILY); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment