Created
March 1, 2013 14:41
-
-
Save avrilcoghlan/5065063 to your computer and use it in GitHub Desktop.
Perl script that finds TreeFam families that are lacking a tree in the 'trees' table of the TreeFam mysql database.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script treefam_QC2.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 3-Feb-09. | |
# | |
# This perl script finds TreeFam families that are lacking a tree in the | |
# 'trees' table. | |
# | |
# The command-line format is: | |
# % perl <treefam_QC2.pl> <release> | |
# where <release> is the release of the TreeFam database to use. | |
# | |
# | |
#------------------------------------------------------------------# | |
# CHECK THE COMMAND-LINE ARGUMENTS, CONNECT TO THE TREEFAM DATABASE AND
# RUN THE TWO QC PASSES.
#
# The script takes exactly one argument, <release>, which is appended to
# "treefam_" to form the name of the mysql database to query.
# QC warnings are printed to STDOUT; progress and errors go to STDERR.
use DBI;

{
    # strict/warnings are scoped to this block so that they apply to the
    # main program only.
    use strict;
    use warnings;

    #--------------------------------------------------------------#
    # CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
    if (@ARGV != 1)
    {
        print STDERR "Usage of treefam_QC2.pl\n\n";
        print STDERR "perl treefam_QC2.pl <release>\n";
        print STDERR "where <release> is the release of the TreeFam database to use.\n";
        print STDERR "For example, >perl -w treefam_QC2.pl 7\n";
        # A usage error is a failure, so report it through the exit status.
        exit 1;
    }

    #--------------------------------------------------------------#
    # FIND WHICH RELEASE OF THE TREEFAM DATABASE TO USE:
    my $release = $ARGV[0];

    #--------------------------------------------------------------#
    # CONNECT TO THE DATABASE FOR THIS RELEASE:
    my $database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
    # 'return' is not legal outside a subroutine, so a failed connection
    # must be reported with die. $DBI::errstr holds the reason, because no
    # handle exists to ask when connect() fails.
    my $dbh = DBI->connect($database, 'anonymous', '')
        or die "Cannot connect to $database: $DBI::errstr\n";

    # READ IN A LIST OF ALL THE FAMILIES FOR WHICH THERE ARE TREES
    # IN TREEFAM:
    my $TREES = read_families_with_trees($dbh);

    # NOW READ IN ALL THE FAMILIES IN THIS RELEASE OF TREEFAM, AND CHECK
    # THAT EACH FAMILY HAS A SEED, CLEAN AND FULL TREE:
    check_that_each_family_has_trees($dbh, $TREES);

    # The checks are already complete at this point, so a failed
    # disconnect is only worth a warning.
    $dbh->disconnect()
        or warn "Could not disconnect cleanly: " . $dbh->errstr . "\n";

    #--------------------------------------------------------------#
    print STDERR "FINISHED.\n";
}
#------------------------------------------------------------------#
# READ IN THE FAMILIES FOR WHICH THERE ARE TREES IN TREEFAM.
#
# Arguments:
#   $dbh - a connected database handle; only prepare() and errstr() are
#          called on it, plus execute(), fetchrow_array() and errstr() on
#          the statement handle it returns.
#
# Returns a reference to a hash whose keys have the form "<AC>=<TYPE>"
# (for example "TF101001=SEED") and whose values are 1. A key is present
# only when the TREE column of that row contains a '(' character, which is
# how a non-empty tree is recognised here.
#
# Prints a WARNING line to STDOUT when the same AC/TYPE pair is seen more
# than once, and a progress line to STDERR when finished.
# Dies if the query cannot be prepared or executed.
sub read_families_with_trees
{
    use strict;
    use warnings;

    my ($dbh) = @_;
    my %TREES = ();

    # THE TREES CAN HAVE TYPE: SEED/FULL/CLEAN
    my $table_w = 'trees';
    my $st      = "SELECT AC, TYPE, TREE from $table_w";

    # Method calls are not interpolated inside double quotes, so errstr
    # has to be concatenated for the real error text to be shown.
    my $sth = $dbh->prepare($st)
        or die "Cannot prepare $st: " . $dbh->errstr . "\n";
    $sth->execute
        or die "Cannot execute the query: " . $sth->errstr;

    # The return value of execute() is not used as a row count: for a
    # SELECT it may be -1 (unknown). The fetch loop below simply runs zero
    # times when there are no rows.
    while (my ($AC, $TYPE, $TREE) = $sth->fetchrow_array)
    {
        # IF THE TREE IS EMPTY (NULL, OR NO '(' IN IT), SKIP IT:
        next unless defined $TREE && $TREE =~ /\(/;

        my $key = $AC."=".$TYPE;
        if ($TREES{$key})
        {
            print "WARNING: Already have $TYPE tree for $AC in table $table_w\n";
        }
        $TREES{$key} = 1;
    }

    print STDERR "Read in families with trees\n";
    return(\%TREES);
}
#------------------------------------------------------------------# | |
# NOW READ IN ALL THE FAMILIES IN THIS RELEASE OF TREEFAM, AND CHECK
# THAT EACH FAMILY HAS A SEED, CLEAN AND FULL TREE.
#
# Arguments:
#   $dbh   - a connected database handle; only prepare() and errstr() are
#            called on it, plus execute(), fetchrow_array() and errstr()
#            on the statement handles it returns.
#   $TREES - reference to a hash keyed by "<AC>=<TYPE>" with true values
#            for every tree that exists.
#
# For every AC found in the familyA, familyB and familyC tables, prints a
# WARNING line to STDOUT for each missing CLEAN or FULL tree. A SEED tree
# is only required for accessions that start with 'TF1'.
# Afterwards prints one WARNING line for each AC that occurs in $TREES but
# in none of the three family tables.
# Dies if a query cannot be prepared or executed.
sub check_that_each_family_has_trees
{
    use strict;
    use warnings;

    my ($dbh, $TREES) = @_;
    my %SEEN = ();

    for my $table_w (qw(familyA familyB familyC))
    {
        my $st = "SELECT AC from $table_w";

        # Method calls are not interpolated inside double quotes, so errstr
        # has to be concatenated for the real error text to be shown.
        my $sth = $dbh->prepare($st)
            or die "Cannot prepare $st: " . $dbh->errstr . "\n";
        $sth->execute
            or die "Cannot execute the query: " . $sth->errstr;

        # The return value of execute() is not used as a row count: for a
        # SELECT it may be -1 (unknown). An empty table just yields no rows.
        while (my ($AC) = $sth->fetchrow_array)
        {
            $SEEN{$AC} = 1;

            # CHECK THAT THIS FAMILY HAS A SEED, CLEAN AND FULL TREE:
            # NOTE THAT ONLY TF1XXXXXX FAMILIES ARE REQUIRED TO HAVE A
            # SEED TREE (E.G. A TF3XXXXXX FAMILY SHOULDN'T HAVE ONE):
            my @required = (substr($AC,0,3) eq 'TF1')
                         ? qw(SEED CLEAN FULL)
                         : qw(CLEAN FULL);
            for my $type (@required)
            {
                if (!($TREES->{$AC."=".$type}))
                {
                    print "WARNING: there is no $type tree for $AC in table trees\n";
                }
            }
        }
    }

    # CHECK THAT THERE ARE NO TREES IN THE trees TABLES FOR FAMILIES
    # THAT DO NOT APPEAR IN THE familyA, familyB or familyC TABLES.
    # A family can have up to one key per tree type, so collect the
    # accessions first to report each family once only.
    my %ORPHAN = ();
    foreach my $key (keys %{$TREES})
    {
        my ($AC) = split(/=/, $key, 2);
        $ORPHAN{$AC} = 1 if !($SEEN{$AC});
    }

    # Sorted so that the report is the same from run to run.
    foreach my $AC (sort keys %ORPHAN)
    {
        print "WARNING: $AC appears in the trees table but not in familyA/familyB/familyC tables\n";
    }
    return;
}
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment