Created
March 1, 2013 14:52
-
-
Save avrilcoghlan/5065152 to your computer and use it in GitHub Desktop.
Perl script that finds TreeFam transcripts that appear in the 'fam_genes' table of the TreeFam mysql database, but do not appear in the 'genes' table.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl
#
# Perl script treefam_QC9.pl
# Written by Avril Coghlan ([email protected])
# 3-Feb-09.
#
# This perl script finds TreeFam transcripts that appear in the 'fam_genes'
# table, but do not appear in the 'genes' table.
#
# The command-line format is:
# % perl treefam_QC9.pl <release>
# where <release> is the release of the TreeFam database to use.
#
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
use strict;
use warnings;
use DBI;

# Exactly one argument is expected: the TreeFam release to query.
# A usage error prints the help text and exits with a non-zero status
# so that calling scripts can detect the failure.
if (@ARGV != 1) {
    print "Usage of treefam_QC9.pl\n\n";
    print "perl treefam_QC9.pl <release>\n";
    print "where <release> is the release of the TreeFam database to use.\n";
    print "For example, >perl -w treefam_QC9.pl 7\n";
    exit 1;
}
#------------------------------------------------------------------#
# FIND WHICH RELEASE OF THE TREEFAM DATABASE TO USE:
my $release = $ARGV[0];
#------------------------------------------------------------------#
# CONNECT TO THE MYSQL DATABASE FOR THAT RELEASE:
my $database = "dbi:mysql:treefam_" . $release . ":db.treefam.org:3308";
# 'return' is not legal outside a subroutine, so a failed connection
# must be reported with die (DBI puts the reason in $DBI::errstr).
my $dbh = DBI->connect($database, 'anonymous', '')
    or die "Cannot connect to $database: $DBI::errstr\n";
# GET A LIST OF ALL THE TRANSCRIPTS IN THE 'genes' table:
my $GENES = read_genes_table($dbh);
# CHECK THAT ALL THE TRANSCRIPTS THAT WERE PUT IN FAMILIES APPEAR
# IN THE genes TABLE:
read_transcripts_in_families($dbh, $GENES);
# NOW DISCONNECT FROM THE DATABASE:
$dbh->disconnect
    or warn "Problem disconnecting from $database: " . $dbh->errstr . "\n";
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
#------------------------------------------------------------------# | |
# GET A LIST OF ALL THE TRANSCRIPTS IN THE 'genes' TABLE:
#
# Arguments: $dbh - a connected DBI database handle.
# Returns:   a reference to a hash whose keys are the values of the ID
#            column of the 'genes' table (every value is 1).
# Dies if the query cannot be prepared or executed, or if fetching
# rows stops because of a database error.
# Also prints a progress message to both STDERR and STDOUT.
sub read_genes_table
{
    my ($dbh)   = @_;
    my %GENES   = ();
    my $table_w = 'genes';
    my $st      = "SELECT ID from $table_w";

    # Method calls are not interpolated inside double-quoted strings,
    # so the DBI error text has to be concatenated explicitly.
    my $sth = $dbh->prepare($st)
        or die "Cannot prepare $st: " . $dbh->errstr . "\n";
    $sth->execute
        or die "Cannot execute the query: " . $sth->errstr;

    # For a SELECT the value returned by execute() is only guaranteed
    # to be true (it can be -1 or "0E0"), not a row count, so rows are
    # simply fetched until the statement is exhausted.
    while (my ($ID) = $sth->fetchrow_array) {
        next unless defined $ID;    # a NULL ID cannot be a hash key
        $GENES{$ID} = 1;
    }
    # fetchrow_array returns an empty list both at end-of-data and on
    # error, so the error state is checked once the loop has ended.
    die "Error while reading the $table_w table: " . $sth->errstr
        if $sth->err;

    print STDERR "Read in transcripts in the genes table\n";
    print "Read in transcripts in the genes table\n";
    return \%GENES;
}
#------------------------------------------------------------------# | |
# CHECK THAT ALL TRANSCRIPTS PUT INTO FAMILIES APPEAR IN THE genes
# TABLE:
#
# Arguments: $dbh   - a connected DBI database handle.
#            $GENES - reference to a hash keyed by the transcript IDs
#                     that are present in the 'genes' table.
# Returns:   nothing useful.
# Prints one WARNING line to the currently selected output handle
# (normally STDOUT) for every row of 'fam_genes' whose ID is not a key
# of %$GENES.
# Dies if the query cannot be prepared or executed, or if fetching
# rows stops because of a database error.
sub read_transcripts_in_families
{
    my ($dbh, $GENES) = @_;
    my $table_w = 'fam_genes';
    my $st      = "SELECT ID, AC, FLAG from $table_w";

    # Method calls are not interpolated inside double-quoted strings,
    # so the DBI error text has to be concatenated explicitly.
    my $sth = $dbh->prepare($st)
        or die "Cannot prepare $st: " . $dbh->errstr . "\n";
    $sth->execute
        or die "Cannot execute the query: " . $sth->errstr;

    # For a SELECT the value returned by execute() is only guaranteed
    # to be true (it can be -1 or "0E0"), not a row count, so rows are
    # simply fetched until the statement is exhausted.
    while (my @row = $sth->fetchrow_array) {
        # NULL columns arrive as undef; print them as empty strings
        # (the same output as before) without triggering warnings.
        my ($ID, $AC, $FLAG) = map { defined $_ ? $_ : '' } @row[0 .. 2];
        next if exists $GENES->{$ID};
        print "WARNING: transcript $ID is in fam_genes table in $AC (FLAG $FLAG), but not in the genes table\n";
    }
    # fetchrow_array returns an empty list both at end-of-data and on
    # error, so the error state is checked once the loop has ended.
    die "Error while reading the $table_w table: " . $sth->errstr
        if $sth->err;

    return;
}
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment