Created
March 1, 2013 13:52
-
-
Save avrilcoghlan/5064797 to your computer and use it in GitHub Desktop.
Perl script that, given a file of Caenorhabditis elegans paralog pairs, finds the bootstrap value for the clade defined by the last common ancestor of the two paralogs.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# | |
# Perl script find_closest_worm_paralogs4.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 16-Feb-09. | |
# | |
# This perl script finds the bootstrap value for the | |
# clade containing a pair of C. elegans paralogs in a family (where | |
# there is no other gene in that clade, ie. they are each other's | |
# closest relatives in the family). | |
# The command-line format is: | |
# % perl find_closest_worm_paralogs4.pl <paralogs> | |
# where <paralogs> is the file of C. elegans paralog pairs. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 1) | |
{ | |
print "Usage of find_closest_worm_paralogs4.pl\n\n"; | |
print "perl find_closest_worm_paralogs4.pl <paralogs>\n"; | |
print "where <paralogs> is the file of C. elegans paralog pairs.\n"; | |
print "For example, >perl find_closest_worm_paralogs4.pl Ce_closest_paralogs5b\n"; | |
exit; | |
} | |
use lib "/home/bcri/acoghlan/x86_64-linux-thread-multi"; | |
use Treefam::DBConnection; #xxx make sure it points to the right treefam release | |
use DBI; | |
# FIND THE INPUT FILE WITH THE C. ELEGANS PARALOG PAIRS: | |
$input = $ARGV[0]; | |
#------------------------------------------------------------------# | |
# FIRST RECORD THE TREEFAM GENE IDS (GIDs) FOR TRANSCRIPT IDS, FOR | |
# C. ELEGANS GENES: | |
$release = 6; #xxx | |
$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308"; | |
$dbh = DBI->connect("$database", 'anonymous', '') || return; | |
$GENE = &read_proteins_for_genes($dbh); | |
#------------------------------------------------------------------# | |
# CONNECT TO THE TREEFAM DATABASE: | |
$dbc = Treefam::DBConnection->new(); | |
# READ IN THE INPUT FILE: | |
open(INPUT,"$input") || die "ERROR: cannot open $input\n"; | |
while(<INPUT>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\s+/,$line); | |
$treefam_family = $temp[0]; | |
$ce1 = $temp[1]; | |
$ce2 = $temp[2]; | |
# GET THE GENE IDs FOR THESE C. ELEGANS GENES: | |
if (!($GENE->{$ce1})) { print STDERR "ERROR: do not know gene id for $ce1\n"; exit;} | |
if (!($GENE->{$ce2})) { print STDERR "ERROR: do not know gene id for $ce2\n"; exit;} | |
$ce1_gene = $GENE->{$ce1}; | |
$ce2_gene = $GENE->{$ce2}; | |
print STDERR "Looking at family $treefam_family... \n"; | |
$famh = $dbc->get_FamilyHandle(); | |
if (!(defined($famh->get_by_id($treefam_family)))) | |
{ | |
print "WARNING: there is no family $treefam_family...\n"; | |
goto NEXT_FAMILY; | |
} | |
$family = $famh->get_by_id($treefam_family); | |
if (!(defined($family->ID()))) | |
{ | |
print "WARNING: do not have ID stored for family $treefam_family...\n"; | |
goto NEXT_FAMILY; | |
} | |
$AC = $family->ID(); # GET THE FAMILY ID. | |
if ($AC ne $treefam_family) { print STDERR "ERROR: AC $AC treefam_family $treefam_family\n"; exit;} | |
# GET THE TREE FOR THE FAMILY: | |
if (!(defined($family->get_tree('clean')))) | |
{ | |
print "WARNING: AC $AC: there is no tree!\n"; | |
goto NEXT_FAMILY; | |
} | |
$tree = $family->get_tree('clean'); # GET THE TREEFAM CLEAN TREE. | |
# GET ALL THE TRANSCRIPTS IN THIS FAMILY FROM SPECIES $the_species: | |
@leaves = $tree->get_leaves; | |
$leaf1 = "none"; | |
$leaf2 = "none"; | |
$num_ce_genes = 0; | |
foreach $leaf(@leaves) | |
{ | |
# FIND THE GENE FOR THIS TRANSCRIPT: | |
$gh = $leaf->gene; | |
$gene = $gh->ID(); # eg. ENSMUSG00000022770 OR WBGene00020391 | |
if ($gene eq $ce1_gene) { $leaf1 = $leaf;} | |
elsif ($gene eq $ce2_gene) { $leaf2 = $leaf;} | |
# CHECK IF THIS IS A CE. ELEGANS GENE: | |
if (defined($leaf->get_tag_value('S'))) | |
{ | |
$species = $leaf->get_tag_value('S'); | |
if ($species eq 'CAEEL') { $num_ce_genes++;} | |
} | |
else | |
{ | |
print STDERR "ERROR: AC $AC: do not know species of node $leaf\n"; | |
exit; | |
} | |
} | |
if ($leaf1 eq 'none') { print STDERR "ERROR: did not find $ce1_gene ($ce1) in $AC tree\n"; exit;} | |
if ($leaf2 eq 'none') { print STDERR "ERROR: did not find $ce2_gene ($ce2) in $AC tree\n"; exit;} | |
# FIND THE TREE BOOTSTRAP VALUE FOR THE CLADE CONTAINING $ce1 AND $ce2: | |
if (defined($tree->get_last_common_ancestor($leaf1,$leaf2))) | |
{ | |
$lca = $tree->get_last_common_ancestor($leaf1,$leaf2); | |
if (defined($lca->get_tag_value('B'))) | |
{ | |
$bootstrap = $lca->get_tag_value('B'); | |
print "$AC $ce1 $ce1_gene and $ce2 $ce2_gene : bootstrap $bootstrap, $num_ce_genes Ce genes in tree\n"; | |
} | |
else | |
{ | |
print "WARNING: bootstrap value not defined for $ce1, $ce2 - $num_ce_genes Ce genes in tree\n"; | |
goto NEXT_FAMILY; | |
} | |
} | |
else | |
{ | |
print STDERR "ERROR: lca of $ce1 and $ce2 is not defined\n"; | |
exit; | |
} | |
NEXT_FAMILY: | |
} | |
close(INPUT); | |
print STDERR "FINISHED\n"; | |
#------------------------------------------------------------------# | |
# READ IN THE WORM PROTEINS CORRESPONDING TO EACH C. ELEGANS GENE: | |
sub read_proteins_for_genes | |
{ | |
my $dbh = $_[0]; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $GID; | |
my $ID; | |
my %GENE = (); | |
$table_w = 'genes'; | |
$st = "SELECT GID, ID from $table_w WHERE TAX_ID=\'6239\'"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$GID = $array[0]; | |
$ID = $array[1]; | |
if ($GENE{$ID}) | |
{ | |
print STDERR "WARNING: have multiple gids for $ID\n"; | |
$GENE{$ID} = $GID; | |
} | |
else | |
{ | |
$GENE{$ID} = $GID; | |
} | |
} | |
} | |
print STDERR "Read in gene id for each protein id\n"; | |
return(\%GENE); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment