Created
March 1, 2013 14:46
-
-
Save avrilcoghlan/5065096 to your computer and use it in GitHub Desktop.
Perl script that finds cases where a full gene set for a species was loaded into the TreeFam mysql database, but no genes from that species were added to families.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script treefam_QC5.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 4-Feb-09. | |
# Updated 26-June-09 to work with TreeFam release 8. | |
# | |
# This perl script finds cases where a full gene set for a species was | |
# loaded into the TreeFam mysql database, but no genes from that species | |
# were added to families. | |
# | |
# The command-line format is: | |
# % perl <treefam_QC5.pl> <release> | |
# where <release> is the release of the TreeFam database to use. | |
# | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 1) | |
{ | |
print "Usage of treefam_QC5.pl\n\n"; | |
print "perl treefam_QC5.pl <release>\n"; | |
print "where <release> is the release of the TreeFam database to use.\n"; | |
print "For example, >perl -w treefam_QC5.pl 7\n"; | |
exit; | |
} | |
#------------------------------------------------------------------# | |
# DECLARE MYSQL USERNAME AND HOST: | |
use DBI; | |
# FIND WHICH RELEASE OF THE TREEFAM DATABASE TO USE: | |
$release = $ARGV[0]; | |
#------------------------------------------------------------------# | |
# READ THE LIST OF FULLY SEQUENCED SPECIES THAT WERE LOADED INTO THE | |
# TREEFAM MYSQL DATABASE: | |
$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308"; | |
$dbh = DBI->connect("$database", 'anonymous', '') || return; | |
($SPECIES,$SWCODE) = &read_fully_sequenced_species($dbh); | |
# FOR EACH OF THE FULLY SEQUENCED SPECIES LOADED INTO THE TREEFAM MYSQL | |
# DATABASE, SEE HOW MANY OF ITS GENES WERE PUT INTO FAMILIES: | |
foreach $TAX_ID (keys %{$SPECIES}) | |
{ | |
$taxname = $SPECIES->{$TAX_ID}; # eg. Aedes aegypti | |
if (!($SWCODE->{$TAX_ID})) { print STDERR "ERROR: do not know swcode for $TAX_ID\n"; exit;} | |
$swcode = $SWCODE->{$TAX_ID}; | |
# CHECK IF THE GENES FROM THIS SPECIES WERE PUT INTO FAMILIES: | |
($num_genes,$num_genes_in_families) = &read_genes_in_families($dbh,$TAX_ID,$taxname,$swcode,$release); | |
if ($num_genes > 0) | |
{ | |
$pc_genes_in_families = $num_genes_in_families*100/$num_genes; | |
} | |
else | |
{ | |
$pc_genes_in_families = 0; | |
} | |
if ($pc_genes_in_families < 70) | |
{ | |
print "WARNING: of $num_genes $taxname genes, $pc_genes_in_families % ($num_genes_in_families genes) are put into families\n"; | |
} | |
} | |
# NOW DISCONNECT FROM THE DATABASE: | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# READ THE LIST OF FULLY SEQUENCED SPECIES THAT WERE LOADED INTO THE | |
# TREEFAM MYSQL DATABASE: | |
sub read_fully_sequenced_species | |
{ | |
my $dbh = $_[0]; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my %SPECIES = (); | |
my $tax_id; | |
my $taxname; | |
my $swcode; | |
my %SWCODE = (); | |
$table_w = 'species'; | |
$st = "SELECT TAX_ID, TAXNAME, SWCODE from $table_w where FLAG=1"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$TAX_ID = $array[0]; # eg. 7159 | |
$taxname = $array[1]; # eg. Aedes aegypti | |
$swcode = $array[2]; | |
if ($SPECIES{$TAX_ID}) | |
{ | |
if ($SPECIES{$TAX_ID} ne $taxname) | |
{ | |
print STDERR "ERROR: TAX_ID $TAX_ID taxname $taxname SPECIES(TAX_ID) is $SPECIES{$TAX_ID}\n"; | |
exit; | |
} | |
} | |
else | |
{ | |
$SPECIES{$TAX_ID} = $taxname; | |
} | |
if ($SWCODE{$TAX_ID}) | |
{ | |
if ($SWCODE{$TAX_ID} ne $swcode) | |
{ | |
print STDERR "ERROR: TAX_ID $TAX_ID swcode $swcode SWCODE(TAX_ID) is $SWCODE{$TAX_ID}\n"; | |
exit; | |
} | |
} | |
else | |
{ | |
$SWCODE{$TAX_ID} = $swcode; | |
} | |
} | |
} | |
print STDERR "Read in fully sequenced species\n"; | |
return(\%SPECIES,\%SWCODE); | |
} | |
#------------------------------------------------------------------# | |
# FOR EACH OF THE FULLY SEQUENCED SPECIES LOADED INTO THE TREEFAM MYSQL | |
# DATABASE, SEE HOW MANY OF ITS GENES WERE PUT INTO FAMILIES: | |
sub read_genes_in_families | |
{ | |
my $dbh = $_[0]; | |
my $TAX_ID = $_[1]; | |
my $taxname = $_[2]; | |
my $swcode = $_[3]; | |
my $release = $_[4]; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $num_genes = 0; | |
my %SEEN_GENE = (); | |
my @ids; | |
my $i; | |
my $id; | |
my $num_genes_in_families = 0; | |
my %SEEN_GENE_IN_FAMILY = (); | |
my %GID = (); | |
# FIRST READ IN ALL THE GENES FROM THIS SPECIES IN THE 'genes' TABLE: | |
$table_w = 'genes'; | |
$st = "SELECT ID, GID from $table_w WHERE TAX_ID=$TAX_ID"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$ID = $array[0]; | |
@ids = (@ids,$ID); | |
$GID = $array[1]; | |
if (!($SEEN_GENE{$GID})) | |
{ | |
$SEEN_GENE{$GID} = 1; | |
$num_genes++; | |
} | |
if ($GID{$ID}) | |
{ | |
print STDERR "ERROR: already have GID $GID{$ID} for $ID (GID $GID)\n"; | |
exit; | |
} | |
else | |
{ | |
$GID{$ID} = $GID; | |
} | |
} | |
} | |
print STDERR "There are $num_genes genes from $taxname in the mysql \'genes\' table\n"; | |
# NOW CHECK IF EACH OF THE GENES WAS PUT INTO A FAMILY: | |
for ($i = 0; $i <= $#ids; $i++) | |
{ | |
$id = $ids[$i]; | |
if (!($GID{$id})) { print STDERR "ERROR: do not know GID for $id\n"; exit;} | |
$GID = $GID{$id}; | |
# IF IT IS RELEASE 8: | |
if ($release eq '8') { $id = $id."_".$swcode;} | |
$table_w = 'fam_genes'; | |
$st = "SELECT AC, ID from $table_w where ID=\'$id\'"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$AC = $array[0]; | |
$ID = $array[1]; | |
if ($ID ne $id) { print STDERR "ERROR: id $id ID $ID AC $AC GID $GID\n"; exit;} | |
if (!($SEEN_GENE_IN_FAMILY{$GID})) | |
{ | |
$SEEN_GENE_IN_FAMILY{$GID} = 1; | |
$num_genes_in_families++; | |
} | |
} | |
} | |
} | |
print STDERR "There are $num_genes_in_families genes from $taxname in the mysql \'fam_genes\' table\n"; | |
return($num_genes,$num_genes_in_families); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment