Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 13:52
Show Gist options
  • Save avrilcoghlan/5064797 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5064797 to your computer and use it in GitHub Desktop.
Perl script that, given a file of Caenorhabditis elegans paralog pairs, finds the bootstrap value for the clade defined by the last common ancestor of the two paralogs.
#!/usr/local/bin/perl
#
#
# Perl script find_closest_worm_paralogs4.pl
# Written by Avril Coghlan ([email protected])
# 16-Feb-09.
#
# This perl script finds the bootstrap value for the
# clade containing a pair of C. elegans paralogs in a family (where
# there is no other gene in that clade, ie. they are each other's
# closest relatives in the family).
# The command-line format is:
# % perl find_closest_worm_paralogs4.pl <paralogs>
# where <paralogs> is the file of C. elegans paralog pairs.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 1)
{
print "Usage of find_closest_worm_paralogs4.pl\n\n";
print "perl find_closest_worm_paralogs4.pl <paralogs>\n";
print "where <paralogs> is the file of C. elegans paralog pairs.\n";
print "For example, >perl find_closest_worm_paralogs4.pl Ce_closest_paralogs5b\n";
exit;
}
use lib "/home/bcri/acoghlan/x86_64-linux-thread-multi";
use Treefam::DBConnection; #xxx make sure it points to the right treefam release
use DBI;
# FIND THE INPUT FILE WITH THE C. ELEGANS PARALOG PAIRS:
$input = $ARGV[0];
#------------------------------------------------------------------#
# FIRST RECORD THE TREEFAM GENE IDS (GIDs) FOR TRANSCRIPT IDS, FOR
# C. ELEGANS GENES:
$release = 6; #xxx
$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
$dbh = DBI->connect("$database", 'anonymous', '') || return;
$GENE = &read_proteins_for_genes($dbh);
#------------------------------------------------------------------#
# CONNECT TO THE TREEFAM DATABASE:
$dbc = Treefam::DBConnection->new();
# READ IN THE INPUT FILE:
open(INPUT,"$input") || die "ERROR: cannot open $input\n";
while(<INPUT>)
{
$line = $_;
chomp $line;
@temp = split(/\s+/,$line);
$treefam_family = $temp[0];
$ce1 = $temp[1];
$ce2 = $temp[2];
# GET THE GENE IDs FOR THESE C. ELEGANS GENES:
if (!($GENE->{$ce1})) { print STDERR "ERROR: do not know gene id for $ce1\n"; exit;}
if (!($GENE->{$ce2})) { print STDERR "ERROR: do not know gene id for $ce2\n"; exit;}
$ce1_gene = $GENE->{$ce1};
$ce2_gene = $GENE->{$ce2};
print STDERR "Looking at family $treefam_family... \n";
$famh = $dbc->get_FamilyHandle();
if (!(defined($famh->get_by_id($treefam_family))))
{
print "WARNING: there is no family $treefam_family...\n";
goto NEXT_FAMILY;
}
$family = $famh->get_by_id($treefam_family);
if (!(defined($family->ID())))
{
print "WARNING: do not have ID stored for family $treefam_family...\n";
goto NEXT_FAMILY;
}
$AC = $family->ID(); # GET THE FAMILY ID.
if ($AC ne $treefam_family) { print STDERR "ERROR: AC $AC treefam_family $treefam_family\n"; exit;}
# GET THE TREE FOR THE FAMILY:
if (!(defined($family->get_tree('clean'))))
{
print "WARNING: AC $AC: there is no tree!\n";
goto NEXT_FAMILY;
}
$tree = $family->get_tree('clean'); # GET THE TREEFAM CLEAN TREE.
# GET ALL THE TRANSCRIPTS IN THIS FAMILY FROM SPECIES $the_species:
@leaves = $tree->get_leaves;
$leaf1 = "none";
$leaf2 = "none";
$num_ce_genes = 0;
foreach $leaf(@leaves)
{
# FIND THE GENE FOR THIS TRANSCRIPT:
$gh = $leaf->gene;
$gene = $gh->ID(); # eg. ENSMUSG00000022770 OR WBGene00020391
if ($gene eq $ce1_gene) { $leaf1 = $leaf;}
elsif ($gene eq $ce2_gene) { $leaf2 = $leaf;}
# CHECK IF THIS IS A CE. ELEGANS GENE:
if (defined($leaf->get_tag_value('S')))
{
$species = $leaf->get_tag_value('S');
if ($species eq 'CAEEL') { $num_ce_genes++;}
}
else
{
print STDERR "ERROR: AC $AC: do not know species of node $leaf\n";
exit;
}
}
if ($leaf1 eq 'none') { print STDERR "ERROR: did not find $ce1_gene ($ce1) in $AC tree\n"; exit;}
if ($leaf2 eq 'none') { print STDERR "ERROR: did not find $ce2_gene ($ce2) in $AC tree\n"; exit;}
# FIND THE TREE BOOTSTRAP VALUE FOR THE CLADE CONTAINING $ce1 AND $ce2:
if (defined($tree->get_last_common_ancestor($leaf1,$leaf2)))
{
$lca = $tree->get_last_common_ancestor($leaf1,$leaf2);
if (defined($lca->get_tag_value('B')))
{
$bootstrap = $lca->get_tag_value('B');
print "$AC $ce1 $ce1_gene and $ce2 $ce2_gene : bootstrap $bootstrap, $num_ce_genes Ce genes in tree\n";
}
else
{
print "WARNING: bootstrap value not defined for $ce1, $ce2 - $num_ce_genes Ce genes in tree\n";
goto NEXT_FAMILY;
}
}
else
{
print STDERR "ERROR: lca of $ce1 and $ce2 is not defined\n";
exit;
}
NEXT_FAMILY:
}
close(INPUT);
print STDERR "FINISHED\n";
#------------------------------------------------------------------#
# READ IN THE WORM PROTEINS CORRESPONDING TO EACH C. ELEGANS GENE:
sub read_proteins_for_genes
{
my $dbh = $_[0];
my $table_w;
my $st;
my $sth;
my $rv;
my @array;
my $GID;
my $ID;
my %GENE = ();
$table_w = 'genes';
$st = "SELECT GID, ID from $table_w WHERE TAX_ID=\'6239\'";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$GID = $array[0];
$ID = $array[1];
if ($GENE{$ID})
{
print STDERR "WARNING: have multiple gids for $ID\n";
$GENE{$ID} = $GID;
}
else
{
$GENE{$ID} = $GID;
}
}
}
print STDERR "Read in gene id for each protein id\n";
return(\%GENE);
}
#------------------------------------------------------------------#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment