Created
March 1, 2013 14:29
-
-
Save avrilcoghlan/5065002 to your computer and use it in GitHub Desktop.
Perl script that retrieves orthology bootstrap values for ortholog pairs for a particular pair of species from the TreeFam database, and stores the orthology bootstrap values in a Perl pickle
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script store_treefam_orthostrap.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 4-Apr-09. | |
# | |
# For the TreeFam project. | |
# | |
# This perl script connects to the TreeFam database and stores the orthology | |
# bootstrap values for orthologs for a particular pair of species in a pickle. | |
# | |
# The command-line format is: | |
# % perl <store_treefam_orthostrap.pl> version species1 species2 | |
# where version is the version of the TreeFam database to use, | |
# species1 is the first species of interest, | |
# species2 is the second species of interest. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 3) | |
{ | |
print "Usage of store_treefam_orthostrap.pl\n\n"; | |
print "perl store_treefam_orthostrap.pl <version> <species1> <species2>\n"; | |
print "where <version> is the version of the TreeFam database to use,\n"; | |
print " <species1> is the first species of interest,\n"; | |
print " <species2> is the second species of interest.\n"; | |
print "For example, >perl -w store_treefam_orthostrap.pl 7 BRUMA CAEEL\n"; | |
exit; | |
} | |
# FIND THE RELEASE OF TREEFAM TO USE: | |
$version = $ARGV[0]; | |
# FIND THE FIRST SPECIES OF INTEREST: | |
$species1 = $ARGV[1]; | |
# FIND THE SECOND SPECIES OF INTEREST: | |
$species2 = $ARGV[2]; | |
# READ IN MY PERL MODULES: | |
use Avril_modules; | |
use Treefam::DBConnection; | |
use DBI; | |
use Storable; | |
$VERBOSE = 0; | |
#------------------------------------------------------------------# | |
# GET THE TAXIDS FOR THE SPECIES: | |
$file1 = "treefam.".$version."_taxid"; | |
($TAXID) = retrieve($file1); | |
if (!($TAXID->{$species1})) { print STDERR "ERROR: do not know taxid for $species1\n"; exit;} | |
$taxid1 = $TAXID->{$species1}; | |
if (!($TAXID->{$species2})) { print STDERR "ERROR: do not know taxid for $species2\n"; exit;} | |
$taxid2 = $TAXID->{$species2}; | |
# GET THE TAXID FOR THEIR LAST COMMON ANCESTOR NODE: | |
$lca_taxid = &get_taxid_of_lca($taxid1,$taxid2); | |
#------------------------------------------------------------------# | |
# READ IN THE GENE IDs CORRESPONDING TO IDX IDs: | |
$file1 = "treefam.".$version."_idx".$species1; | |
($SPECIES1_ID) = retrieve($file1); | |
$file2 = "treefam.".$version."_idx".$species2; | |
($SPECIES2_ID) = retrieve($file2); | |
#------------------------------------------------------------------# | |
# READ IN ALL THE ORTHOLOGY BOOTSTRAP VALUES FROM THE 'orthologs' TABLE: | |
# NOW FIND ALL $species1-$species2 ORTHOLOGS: | |
$database = 'treefam_'.$version; | |
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return; | |
$table_w = 'ortholog'; | |
$cnt = 0; | |
# THE COLUMNS IN THE TABLE ARE IDX1, IDX2, ORTHOLOGY BOOTSTRAP: | |
%ORTHOSTRAP = (); | |
$st = "SELECT idx1, idx2, bootstrap from $table_w WHERE taxon_id=$lca_taxid"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$IDX1 = $array[0]; | |
$IDX2 = $array[1]; | |
$orthostrap = $array[2]; | |
$ID1 = "none"; | |
$ID2 = "none"; | |
if ($SPECIES1_ID->{$IDX1} && $SPECIES2_ID->{$IDX2}) | |
{ | |
$ID1 = $SPECIES1_ID->{$IDX1}; | |
$ID2 = $SPECIES2_ID->{$IDX2}; | |
} | |
elsif ($SPECIES1_ID->{$IDX2} && $SPECIES2_ID->{$IDX1}) | |
{ | |
$ID1 = $SPECIES1_ID->{$IDX2}; | |
$ID2 = $SPECIES2_ID->{$IDX1}; | |
} | |
if ($ID1 ne 'none' && $ID2 ne 'none') | |
{ | |
$cnt++; | |
print "Storing orthostrap for $cnt: $ID1 and $ID2\n"; | |
$pair1 = $ID1."_".$ID2; | |
$pair2 = $ID2."_".$ID1; | |
if ($ORTHOSTRAP{$pair1}) | |
{ | |
$the_orthostrap= $ORTHOSTRAP{$pair1}; | |
if ($the_orthostrap eq 'zero') { $the_orthostrap = 0;} | |
# THIS SHOULDN'T REALLY HAPPEN BUT IN TREEFAM-4, HAD SEVERAL TRANSCRIPTS OF ONE GENE IN A TREE: | |
if ($the_orthostrap ne $orthostrap) | |
{ | |
if ($orthostrap < $the_orthostrap) | |
{ | |
$orthostrap = $the_orthostrap; | |
} | |
} | |
} | |
if ($ORTHOSTRAP{$pair2}) | |
{ | |
$the_orthostrap = $ORTHOSTRAP{$pair2}; | |
if ($the_orthostrap eq 'zero') { $the_orthostrap = 0;} | |
# THIS SHOULDN'T REALLY HAPPEN BUT IN TREEFAM-4, HAD SEVERAL TRANSCRIPTS OF ONE GENE IN A TREE: | |
if ($the_orthostrap ne $orthostrap) | |
{ | |
if ($orthostrap < $the_orthostrap) | |
{ | |
$orthostrap = $the_orthostrap; | |
} | |
} | |
} | |
if ($orthostrap == 0) { $orthostrap = "zero";} | |
$ORTHOSTRAP{$pair1} = $orthostrap; | |
$ORTHOSTRAP{$pair2} = $orthostrap; | |
} | |
} | |
} | |
print STDERR "Read in orthology bootstrap values for $species1 and $species2 genes\n"; | |
#------------------------------------------------------------------# | |
# STORE THE HASH TABLE %ORTHOSTRAP IN A PICKLE: | |
$output1 = "treefam.".$version."_orthostrap".$species1."_".$species2; | |
store \%ORTHOSTRAP,$output1; | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
sub get_taxid_of_lca | |
{ | |
my $taxid1 = $_[0]; | |
my $taxid2 = $_[1]; | |
my $taxid_lca = "none"; | |
my $table_w; | |
my %ANCESTOR1 = (); | |
my $taxid; | |
my $ancestor; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
$database = 'treefam_'.$version; | |
$dbh = DBI->connect("dbi:mysql:$database:db.treefam.org:3308", 'anonymous', '') || return; | |
$table_w = 'spec_nodes'; | |
$taxid = $taxid1; | |
LOOP_AGAIN1: | |
# FIND THE ANCESTORS OF $taxid: | |
$st = "SELECT parent_id from $table_w WHERE taxon_id=$taxid"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$ancestor = $array[0]; | |
$ANCESTOR1{$ancestor} = 1; | |
$taxid = $ancestor; | |
goto LOOP_AGAIN1; | |
} | |
} | |
$taxid = $taxid2; | |
LOOP_AGAIN2: | |
# FIND THE ANCESTORS OF $taxid: | |
$st = "SELECT parent_id from $table_w WHERE taxon_id=$taxid"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$ancestor = $array[0]; | |
if ($ANCESTOR1{$ancestor}) { $taxid_lca = $ancestor; goto FOUND_LCA;} | |
$taxid = $ancestor; | |
goto LOOP_AGAIN2; | |
} | |
} | |
FOUND_LCA: | |
if ($taxid_lca eq 'none') { print STDERR "ERROR: taxid1 $taxid1 taxid2 $taxid2 taxid_lca $taxid_lca\n"; exit;} | |
return($taxid_lca); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment