Created
March 1, 2013 16:52
-
-
Save avrilcoghlan/5066009 to your computer and use it in GitHub Desktop.
Perl script that, given a file with GO annotations for sequences in TreeFam families, and a list of families, infers the GO annotations for ancestral nodes in the trees for those families.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script treefam_infer_ancestral_GOids.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 2-Mar-07. | |
# | |
# For the TreeFam project. | |
# | |
# This perl script infers the GO ids of ancestral nodes | |
# in a list of TreeFam trees, given the GO ids of the | |
# leaves. | |
# | |
# The command-line format is: | |
# % perl <treefam_infer_ancestral_GOids.pl> GO families | |
# where GO has the GO ids of the leaves, | |
# families is the list of TreeFam families. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 2) | |
{ | |
print "Usage of treefam_infer_ancestral_GOids.pl\n\n"; | |
print "perl treefam_infer_ancestral_GOids.pl <GO> <families>\n"; | |
print "where <GO> contains the GO ids of the leaves,\n"; | |
print " <families> is the list of TreeFam families.\n"; | |
print "For example, >perl treefam_infer_ancestral_GOids.pl GO_slim_terms11 neural_genes\n"; | |
exit; | |
} | |
# READ IN MY PERL MODULES: | |
BEGIN { | |
unshift (@INC, '/nfs/team71/phd/alc/perl/modules'); | |
} | |
use lib "/nfs/team71/phd/alc/perl/treefam/Treefam_api_2mar07/"; | |
use Avril_modules; | |
use Treefam::DBConnection; | |
# FIND THE NAME OF THE INPUT FILE OF GO TERMS ASSIGNED TO LEAVES: | |
$GO = $ARGV[0]; | |
# FIND THE NAME OF THE LIST OF TREEFAM FAMILIES: | |
$fams = $ARGV[1]; | |
$VERBOSE = 0; | |
#------------------------------------------------------------------# | |
# READ IN THE LIST OF FAMILIES: | |
%TAKE = (); | |
open(FAMS,"$fams") || die "ERROR: cannot open $fams.\n"; | |
while(<FAMS>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\s+/,$line); | |
if (substr($line,0,2) eq 'TF') | |
{ | |
$family = $temp[0]; | |
$TAKE{$family} = 1; | |
} | |
} | |
close(FAMS); | |
# READ IN THE INPUT FILE OF GO TERMS: | |
($GOID,$families) = &read_go($GO,\%TAKE); | |
print STDERR "Read in file $GO...\n"; | |
# NOW INFER THE GO IDS OF ANCESTRAL NODES IN THESE FAMILIES: | |
&infer_ancestral_nodes($GOID,$families); | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# INFER THE GO IDS OF ANCESTRAL NODES IN THESE FAMILIES: | |
sub infer_ancestral_nodes | |
{ | |
my $GOID = $_[0]; # | |
my $families = $_[1]; # | |
my $i; # | |
my $family_id; # | |
my $tree; # | |
my $dbc; # | |
my $family; # | |
my $famh; # | |
my $leaf; # | |
my $leaf2; # | |
my $leaf_name; # | |
my $leaf2_name; # | |
my $leaf2_species; # | |
my $leaf_species; # | |
my $leaf2_id; # | |
my $leaf_id; # | |
my @temp; # | |
my $lca; # | |
my $lca_name; # | |
my $lca_type; # | |
my $parent; # | |
my $parent_name; # | |
my $parent_type; # | |
my $duplications1; # | |
my $duplications2; # | |
my $leaf_goids; # | |
my $leaf2_goids; # | |
my @leaf_goids; # | |
my @leaf2_goids; # | |
my $root; # | |
my $root_name; # | |
my $leaf_steps_to_root; # | |
my $leaf2_steps_to_root; # | |
my $k; # | |
my $m; # | |
my %SEEN_GOID = (); # | |
my @nodes; # | |
my $node; # | |
my $node_name; # | |
my $node_type; # | |
my $no_daughters; # | |
my $daughter1; # | |
my $daughter2; # | |
my $daughter1_name; # | |
my $daughter2_name; # | |
my $daughter; # | |
my $daughter_name; # | |
my @subnodes; # | |
my %SEEN_SUBNODE = (); # | |
my $subnode; # | |
my $subnode_name; # | |
my $new; # | |
my $p; # | |
my $subsubnode; # | |
# CONNECT TO THE TREEFAM DATABASE USING DEFAULT CONFIGURATION: | |
$dbc = Treefam::DBConnection->new(); | |
for ($i = 0; $i < @$families; $i++) | |
{ | |
$family_id = $families->[$i]; | |
# GET A FAMILY: | |
$famh = $dbc->get_FamilyHandle(); | |
$family = $famh->get_by_id($family_id); | |
if (!(defined($family))) | |
{ | |
print "Do not have $family_id in the database...\n"; | |
goto NEXT_FAMILY; | |
} | |
# GET THE CLEAN TREE FOR THIS FAMILY: | |
$tree = $family->get_tree('clean'); | |
# FIND THE ROOT OF THE TREE: | |
$root = $tree->root; | |
$root_name = $root->internalID; | |
# LIST ALL THE LEAVES FROM THE TREE: | |
foreach $leaf($tree->get_leaves) | |
{ | |
# COMPARE TO ALL OTHER LEAVES IN THE TREE: | |
foreach $leaf2($tree->get_leaves) | |
{ | |
# SEE IF $leaf AND $leaf2 HAVE GO IDS ASSIGNED: | |
$leaf_name = $leaf->id(); | |
$leaf2_name = $leaf2->id(); | |
$leaf_id = $leaf->get_tag_value('O'); | |
$leaf2_id = $leaf2->get_tag_value('O'); | |
@temp = split(/_/,$leaf_name); | |
$leaf_species = $temp[1]; | |
@temp = split(/_/,$leaf2_name); | |
$leaf2_species = $temp[1]; | |
# IF THESE LEAVES BELONG TO SPECIES THAT WE ARE INTERESTED IN: | |
if (($leaf_species eq 'DROME' || $leaf_species eq 'MOUSE' || $leaf_species eq 'RAT' || | |
$leaf_species eq 'CAEEL' || $leaf_species eq 'HUMAN' || $leaf_species eq 'BRARE' || | |
$leaf_species eq 'CHICK') && | |
($leaf2_species eq 'DROME' || $leaf2_species eq 'MOUSE' || $leaf2_species eq 'RAT' || | |
$leaf2_species eq 'CAEEL' || $leaf2_species eq 'HUMAN' || $leaf2_species eq 'BRARE' || | |
$leaf2_species eq 'CHICK')) | |
{ | |
# IF BOTH LEAVES HAVE GO IDS ANNOTATED: | |
if ($GOID->{$leaf_id} && $GOID->{$leaf2_id} && $leaf_id ne $leaf2_id) | |
{ | |
# FIND THE LAST COMMON ANCESTOR NODE OF $leaf AND $leaf2: | |
$lca = $tree->get_last_common_ancestor($leaf,$leaf2); | |
$lca_name= $lca->internalID; | |
$lca_type = $lca->get_tag_value('D'); | |
if ($lca_type eq 'Y') # THE ANCESTRAL NODE IS A DUPLICATION: | |
{ | |
$duplications1 = 1; $duplications2 = 1; | |
$leaf_steps_to_root = "-1"; $leaf2_steps_to_root = -1; | |
goto FOUND_ROOT2; | |
} | |
# SEE IF THERE ARE ANY DUPLICATION NODES BETWEEN $leaf AND $lca: | |
$duplications1 = 0; | |
if (defined($leaf->parent)) { $parent = $leaf->parent; } | |
else { print STDERR "ERROR: do not know parent of leaf $leaf_id.\n"; exit;} | |
# $parent_type HAS VALUE Y FOR DUPLICATION, N FOR SPECIATION. | |
$parent_type = $parent->get_tag_value('D'); | |
if ($parent_type eq 'Y') { $duplications1 = 1;} | |
$parent_name = $parent->internalID; | |
if ($parent_name eq $lca_name) { goto FOUND_LCA1;} | |
FIND_PARENT1: | |
if (defined($parent->parent)) | |
{ | |
$parent = $parent->parent; | |
$parent_type = $parent->get_tag_value('D'); | |
if ($parent_type eq 'Y') { $duplications1 = 1;} | |
$parent_name = $parent->internalID; | |
if ($parent_name eq $lca_name) { goto FOUND_LCA1;} | |
goto FIND_PARENT1; | |
} | |
FOUND_LCA1: | |
# FIND HOW MANY STEPS THERE ARE BETWEEN $leaf AND THE ROOT $root: | |
$leaf_steps_to_root = 0; | |
if (defined($leaf->parent)) { $parent = $leaf->parent; } | |
else { print STDERR "ERROR: do not know parent of leaf $leaf_id.\n"; exit;} | |
$parent_type = $parent->get_tag_value('D'); | |
if ($parent_type eq 'N') { $leaf_steps_to_root++;} | |
$parent_name = $parent->internalID; | |
if ($parent_name eq $root_name) { goto FOUND_ROOT1;} | |
FIND_ROOT1: | |
if (defined($parent->parent)) | |
{ | |
$parent = $parent->parent; | |
$parent_type = $parent->get_tag_value('D'); | |
if ($parent_type eq 'N') { $leaf_steps_to_root++;} | |
$parent_name = $parent->internalID; | |
if ($parent_name eq $root_name) { goto FOUND_ROOT1;} | |
goto FIND_ROOT1; | |
} | |
FOUND_ROOT1: | |
# SEE IF THERE ARE ANY DUPLICATION NODES BETWEEN $leaf2 AND $lca: | |
$duplications2 = 0; | |
if (defined($leaf2->parent)) { $parent = $leaf2->parent; } | |
else { print STDERR "ERROR: do not know parent of leaf2 $leaf2_id.\n"; exit;} | |
# $parent_type HAS VALUE Y FOR DUPLICATION, N FOR SPECIATION. | |
$parent_type = $parent->get_tag_value('D'); | |
if ($parent_type eq 'Y') { $duplications2 = 1;} | |
$parent_name = $parent->internalID; | |
if ($parent_name eq $lca_name) { goto FOUND_LCA2;} | |
FIND_PARENT2: | |
if (defined($parent->parent)) | |
{ | |
$parent = $parent->parent; | |
$parent_type = $parent->get_tag_value('D'); | |
if ($parent_type eq 'Y') { $duplications2 = 1;} | |
$parent_name = $parent->internalID; | |
if ($parent_name eq $lca_name) { goto FOUND_LCA2;} | |
goto FIND_PARENT2; | |
} | |
FOUND_LCA2: | |
# FIND HOW MANY STEPS THERE ARE BETWEEN $leaf2 AND THE ROOT $root: | |
$leaf2_steps_to_root = 0; | |
if (defined($leaf2->parent)) { $parent = $leaf2->parent; } | |
else { print STDERR "ERROR: do not know parent of leaf2 $leaf2_id.\n"; exit;} | |
$parent_type = $parent->get_tag_value('D'); | |
if ($parent_type eq 'N') { $leaf2_steps_to_root++;} | |
$parent_name = $parent->internalID; | |
if ($parent_name eq $root_name) { goto FOUND_ROOT2;} | |
FIND_ROOT2: | |
if (defined($parent->parent)) | |
{ | |
$parent = $parent->parent; | |
$parent_type = $parent->get_tag_value('D'); | |
if ($parent_type eq 'N') { $leaf2_steps_to_root++;} | |
$parent_name = $parent->internalID; | |
if ($parent_name eq $root_name) { goto FOUND_ROOT2;} | |
goto FIND_ROOT2; | |
} | |
FOUND_ROOT2: | |
# IF THERE ARE NO DUPLICATIONS, WE CAN INFER THAT THE NODE $lca | |
# SHARES THE GO IDS THAT $leaf1 AND $leaf2 HAVE IN COMMON: | |
if ($VERBOSE == 1) | |
{ | |
print "\n$family_id $leaf_id $GOID->{$leaf_id} $leaf2_id $GOID->{$leaf2_id}\n"; | |
print "$family_id duplications1 $duplications1 duplications2 $duplications2 lca_type $lca_type lca_name $lca_name\n"; | |
print "$family_id leaf_steps_to_root $leaf_steps_to_root leaf2_steps_to_root $leaf2_steps_to_root\n"; | |
} | |
if (($duplications1 == 0 && $duplications2 == 0) || # THERE ARE NO DUPLICATIONS BETWEEN THE GENES | |
($duplications1 == 1 && $duplications2 == 0 && | |
$leaf_steps_to_root > $leaf2_steps_to_root) || # THERE ARE DUPLICATIONS BETWEEN | |
# LCA AND $leaf, BUT $leaf2 IS AN OUTGROUP | |
($duplications1 == 0 && $duplications2 == 1 && | |
$leaf_steps_to_root < $leaf2_steps_to_root)) # THERE ARE DUPLICATIONS BETWEEN | |
# LCA AND $leaf2, BUT $leaf IS AN OUTGROUP | |
{ | |
$leaf_goids = $GOID->{$leaf_id}; | |
$leaf2_goids= $GOID->{$leaf2_id}; | |
@leaf_goids = split(/\,/,$leaf_goids); | |
@leaf2_goids= split(/\,/,$leaf2_goids); | |
# CHECK WHETHER THE TWO SEQUENCES HAVE ANY GO IDS IN COMMON: | |
for ($k = 0; $k <= $#leaf_goids; $k++) | |
{ | |
for ($m = 0; $m <= $#leaf2_goids; $m++) | |
{ | |
if ($leaf_goids[$k] eq $leaf2_goids[$m]) | |
{ | |
if (!($SEEN_GOID{$family_id."=".$lca_name."=".$leaf_goids[$k]})) | |
{ | |
$SEEN_GOID{$family_id."=".$lca_name."=".$leaf_goids[$k]} = 1; | |
print "$family_id: assigning $leaf_goids[$k] to node $lca_name ($leaf_id $leaf2_id)\n"; | |
if (!($GOID->{$family_id."=".$lca_name})) { $GOID->{$family_id."=".$lca_name} = $leaf_goids[$k];} | |
else { $GOID->{$family_id."=".$lca_name} = $GOID->{$family_id."=".$lca_name}.",".$leaf_goids[$k];} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
# GET ALL THE NODES IN THE TREE: | |
print "Getting all nodes in tree for $family_id...\n"; | |
@nodes = $tree->get_all_nodes; | |
foreach $node(@nodes) | |
{ | |
$node_name = $node->internalID; | |
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
$node_type = $node->get_tag_value('D'); | |
if ($node_type eq 'Y') # THE NODE IS A DUPLICATION: | |
{ | |
# GET THE TWO DAUGHTER NODES OF $node: | |
$no_daughters = 0; | |
$daughter1 = ""; | |
$daughter2 = ""; | |
foreach $daughter ($node->children) | |
{ | |
$no_daughters++; | |
$daughter_name = $daughter->internalID; | |
if ($no_daughters == 1) { $daughter1_name = $daughter_name;} | |
elsif ($no_daughters == 2) { $daughter2_name = $daughter_name;} | |
@subnodes = &Avril_modules::empty_array(@subnodes); | |
@subnodes = (@subnodes,$daughter); | |
%SEEN_SUBNODE = (); | |
$SEEN_SUBNODE{$daughter} = 1; | |
# GET ALL THE DESCENDANTS OF $daughter: | |
LOOP_AGAIN1: | |
$new = 0; | |
for ($p = 0; $p <= $#subnodes; $p++) | |
{ | |
$subnode = $subnodes[$p]; | |
if (!($subnode->is_leaf)) | |
{ | |
foreach $subsubnode ($subnode->children) | |
{ | |
if (!($SEEN_SUBNODE{$subsubnode})) | |
{ | |
$SEEN_SUBNODE{$subsubnode} = 1; | |
$new++; | |
@subnodes = (@subnodes,$subsubnode); | |
} | |
} | |
} | |
} | |
if ($new > 0) { goto LOOP_AGAIN1;} | |
# FIND ALL THE GO IDS OF THE DESCENDANTS OF $daughter: | |
for ($p = 0; $p <= $#subnodes; $p++) | |
{ | |
$subnode = $subnodes[$p]; | |
if (!($subnode->is_leaf)) # IF IT IS NOT A LEAF NODE. | |
{ | |
$subnode_name = $subnode->internalID; | |
$subnode_name = $family_id."=".$subnode_name; | |
# CHECK IF THERE ARE GO SLIM IDS FOR THIS NODE: | |
if ($GOID->{$subnode_name}) | |
{ | |
print "$family_id = node $node_name => $daughter_name => $subnode_name: go $GOID->{$subnode_name}\n"; | |
if ($no_daughters == 1) { $daughter1 = $daughter1.",".$GOID->{$subnode_name};} | |
elsif ($no_daughters == 2) { $daughter2 = $daughter2.",".$GOID->{$subnode_name};} | |
} | |
} | |
} | |
} | |
# COMPARE THE GO IDS FOR $daughter1 AND $daughter2: | |
if ($daughter1 ne '' && $daughter2 ne '') | |
{ | |
&compare_goids($daughter1,$daughter2,$node_name,$family_id,$daughter1_name,$daughter2_name); | |
} | |
} | |
} | |
} | |
NEXT_FAMILY: | |
} | |
} | |
#------------------------------------------------------------------# | |
# COMPARE THE GO IDS FOR $daughter1 AND $daughter2: | |
sub compare_goids | |
{ | |
my $daughter1 = $_[0]; | |
my $daughter2 = $_[1]; | |
my $node_name = $_[2]; | |
my $family_id = $_[3]; | |
my $daughter1_name = $_[4]; | |
my $daughter2_name = $_[5]; | |
my @ids1; | |
my @ids2; | |
my %ID1 = (); | |
my $i; | |
my $same = 1; | |
my %SEEN = (); | |
$daughter1 = substr($daughter1,1,length($daughter1)-1); | |
$daughter2 = substr($daughter2,1,length($daughter2)-1); | |
@ids1 = split(/\,/,$daughter1); | |
@ids2 = split(/\,/,$daughter2); | |
%SEEN = (); | |
$daughter1 = ""; | |
for ($i = 0; $i <= $#ids1; $i++) | |
{ | |
if (!($SEEN{$ids1[$i]})) { $daughter1 = $daughter1.",".$ids1[$i]; $SEEN{$ids1[$i]} = 1;} | |
} | |
$daughter1 = substr($daughter1,1,length($daughter1)-1); | |
%SEEN = (); | |
$daughter2 = ""; | |
for ($i = 0; $i <= $#ids2; $i++) | |
{ | |
if (!($SEEN{$ids2[$i]})) { $daughter2 = $daughter2.",".$ids2[$i]; $SEEN{$ids2[$i]} = 1;} | |
} | |
$daughter2 = substr($daughter2,1,length($daughter2)-1); | |
@ids1 = &Avril_modules::empty_array(@ids1); | |
@ids2 = &Avril_modules::empty_array(@ids2); | |
@ids1 = split(/\,/,$daughter1); | |
@ids2 = split(/\,/,$daughter2); | |
for ($i = 0; $i <= $#ids1; $i++) { $ID1{$ids1[$i]} = 1;} | |
for ($i = 0; $i <= $#ids2; $i++) { if (!($ID1{$ids2[$i]})) { $same = 0;}} | |
if ($same == 0) | |
{ | |
print "$family_id node $node_name : daughter1 $daughter1 ($daughter1_name) and daughter2 $daughter2 ($daughter2_name) have different GO ids! ***\n"; | |
} | |
else | |
{ | |
print "$family_id node $node_name : daughter1 $daughter1 and daughter2 $daughter2 have same GO ids.\n"; | |
} | |
} | |
#------------------------------------------------------------------# | |
# READ IN THE INPUT FILE OF GO TERMS: | |
sub read_go | |
{ | |
my $file = $_[0]; | |
my $TAKE = $_[1]; | |
my $line; | |
my @temp; | |
my $family; | |
my $species; | |
my $goid; | |
my %GOID = (); | |
my %SEEN = (); | |
my @families; | |
my $name; | |
open(FILE,"$file") || die "ERROR: cannot open $file.\n"; | |
while(<FILE>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\s+/,$line); | |
if (substr($line,0,6) eq 'Family') # Family TF101001 : DM CYCB has GO id GO:0000910 | |
{ | |
$family = $temp[1]; | |
$species = $temp[3]; | |
$goid = $temp[8]; | |
$name = $temp[9]; | |
#if ($species eq 'DM') { $name = $name."_DROME";} | |
#elsif ($species eq 'HS') { $name = $name."_HUMAN";} | |
#elsif ($species eq 'MM') { $name = $name."_MOUSE";} | |
#elsif ($species eq 'RN') { $name = $name."_RAT"; } | |
#elsif ($species eq 'DR') { $name = $name."_BRARE";} | |
#elsif ($species eq 'CE') { $name = $name."_CAEEL";} | |
#elsif ($species eq 'GG') { $name = $name."_CHICK";} | |
if (!($GOID{$name})) { $GOID{$name} = $goid;} | |
else {$GOID{$name} = $GOID{$name}.",".$goid;} | |
if (!($SEEN{$family}) && $TAKE->{$family}) | |
{ | |
$SEEN{$family} = 1; | |
@families = (@families,$family); | |
} | |
} | |
} | |
close(FILE); | |
return(\%GOID,\@families); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment