Created
March 1, 2013 16:54
-
-
Save avrilcoghlan/5066023 to your computer and use it in GitHub Desktop.
Perl script that, given a file with GO annotations for sequences in TreeFam families, and a list of families, infers the GO annotations for ancestral nodes in the trees for those families.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script treefam_infer_ancestral_GOids3.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 3-May-07. | |
# | |
# For the TreeFam project. | |
# | |
# This perl script infers the GO ids of ancestral nodes | |
# in a list of TreeFam trees, given the GO ids of the | |
# leaves. This is different from treefam_infer_ancestral_GOids.pl, | |
# because it uses the algorithm Richard designed in China. | |
# Note that this reads in the GO-Slim terms for the leaves of | |
# the tree. | |
# | |
# The command-line format is: | |
# % perl <treefam_infer_ancestral_GOids3.pl> GO family | |
# where GO has the GO ids of the leaves, | |
# families is the list of TreeFam families. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 2) | |
{ | |
print "Usage of treefam_infer_ancestral_GOids3.pl\n\n"; | |
print "perl treefam_infer_ancestral_GOids3.pl <GO> <families>\n"; | |
print "where <GO> contains the GO ids of the leaves,\n"; | |
print " <families> is the list of TreeFam families.\n"; | |
print "For example, >perl treefam_infer_ancestral_GOids3.pl GO_slim_terms11_2may07\n"; | |
print "neural_genes_2may07\n"; | |
exit; | |
} | |
# READ IN MY PERL MODULES: | |
BEGIN { | |
unshift (@INC, '/nfs/team71/phd/alc/perl/modules'); | |
} | |
use lib "/nfs/team71/phd/alc/perl/treefam/Treefam_api_3may07/"; #xxx note that this is old, I've hacked it | |
#xxx to make it use TreeFam-4. | |
use Avril_modules; | |
use Treefam::DBConnection; | |
# FIND THE NAME OF THE INPUT FILE OF GO TERMS ASSIGNED TO LEAVES: | |
$GO = $ARGV[0]; | |
# FIND THE NAME OF THE LIST OF TREEFAM FAMILIES: | |
$fams = $ARGV[1]; | |
$VERBOSE = 1; | |
$VERBOSE2 = 1; | |
$VERBOSE3 = 1; | |
#------------------------------------------------------------------# | |
# READ IN THE LIST OF FAMILIES: | |
$TAKE = &read_family_list($fams); | |
# READ IN THE INPUT FILE OF GO TERMS: | |
($GOID,$families) = &read_go($GO,$TAKE); | |
print STDERR "Read in file $GO...\n"; | |
# NOW INFER THE GO IDS OF ANCESTRAL NODES IN THESE FAMILIES: | |
&infer_ancestral_nodes($GOID,$families); | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# READ IN THE LIST OF FAMILIES: | |
sub read_family_list | |
{ | |
my $fams = $_[0]; | |
my %TAKE = (); | |
my $line; | |
my @temp; | |
my $family; | |
open(FAMS,"$fams") || die "ERROR: cannot open $fams.\n"; | |
while(<FAMS>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\s+/,$line); | |
if (substr($line,0,2) eq 'TF') | |
{ | |
$family = $temp[0]; | |
if ($family eq 'TF314159') { $TAKE{$family} = 1;} #xxx an interesting case. | |
#xxx $TAKE{$family} = 1; | |
} | |
} | |
close(FAMS); | |
return(\%TAKE); | |
} | |
#------------------------------------------------------------------# | |
# IF $node IS A DUPLICATION NODE, ONLY CONFIRM IN THE CHILDREN THOSE | |
# CONFIRMED TERMS IN THE CURRENT NODE THAT ARE PROVISIONAL TERMS IN | |
# THE CHILD NODE. | |
sub add_confirmed_terms_to_children2 | |
{ | |
my $node = $_[0]; # | |
my $node_name = $_[1]; # | |
my $CONFIRMED_GOID = $_[2]; # | |
my $VERBOSE_PREORDER = $_[3]; # | |
my $PROVISIONAL_GOID = $_[4]; # | |
my $GOID = $_[5]; # | |
my $confirmed_goid; # | |
my @confirmed_goid; # | |
my $daughter; # | |
my $daughter_name; # | |
my $provisional_goid; # | |
my @provisional_goid; # | |
my %CONFIRMED_IN_NODE = (); # | |
my $i; # | |
my $j; # | |
my $daughter_confirmed_goid; # | |
my %SEEN = (); # | |
my $daughter_seqname; # | |
if ($VERBOSE_PREORDER == 1) { print "___ adding confirmed goids for duplication node $node_name to its chidren's confirmed lists\n";} | |
# FIND THE CONFIRMED GOIDs OF $node: | |
if ($CONFIRMED_GOID->{$node_name}) | |
{ | |
$confirmed_goid = $CONFIRMED_GOID->{$node_name}; | |
@confirmed_goid = split(/\,/,$confirmed_goid); | |
for ($i = 0; $i <= $#confirmed_goid; $i++) { $CONFIRMED_IN_NODE{$confirmed_goid[$i]} = 1;} | |
} | |
else # $node DOES NOT HAVE ANY CONFIRMED GOIDs. | |
{ | |
return; | |
} | |
# FIND THE CHILDREN OF NODE $node: | |
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $node: | |
foreach $daughter ($node->children) | |
{ | |
$daughter_confirmed_goid = ""; | |
%SEEN = (); | |
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); } | |
else { $daughter_name = $daughter->internalID;} | |
# FIND THE PROVISIONAL GOIDs FOR $daughter_name: | |
if ($PROVISIONAL_GOID->{$daughter_name}) | |
{ | |
$provisional_goid = $PROVISIONAL_GOID->{$daughter_name}; | |
@provisional_goid = split(/\,/,$provisional_goid); | |
for ($j = 0; $j <= $#provisional_goid; $j++) | |
{ | |
if ($CONFIRMED_IN_NODE{$provisional_goid[$j]}) # IF THIS GOID IS CONFIRMED IN $node. | |
{ | |
if (!($SEEN{$provisional_goid[$j]})) | |
{ | |
$SEEN{$provisional_goid[$j]} = 1; | |
$daughter_confirmed_goid = $daughter_confirmed_goid.",".$provisional_goid[$j]; | |
} | |
} | |
} | |
} | |
else # THERE IS NO PROVISIONAL GO ID $daughter_name. $daughter_name MAY BE A LEAF. | |
{ | |
if ($daughter->is_leaf) | |
{ | |
$daughter_seqname = $daughter->get_tag_value('O'); | |
if ($GOID->{$daughter_seqname}) | |
{ | |
$provisional_goid = $PROVISIONAL_GOID->{$daughter_name}; | |
@provisional_goid = split(/\,/,$provisional_goid); | |
for ($j = 0; $j <= $#provisional_goid; $j++) | |
{ | |
if ($CONFIRMED_IN_NODE{$provisional_goid[$j]}) # IF THIS GOID IS CONFIRMED IN $node. | |
{ | |
if (!($SEEN{$provisional_goid[$j]})) | |
{ | |
$SEEN{$provisional_goid[$j]} = 1; | |
$daughter_confirmed_goid = $daughter_confirmed_goid.",".$provisional_goid[$j]; | |
} | |
} | |
} | |
} | |
} | |
} | |
if ($daughter_confirmed_goid ne "") | |
{ | |
$daughter_confirmed_goid = substr($daughter_confirmed_goid,1,length($daughter_confirmed_goid)-1); | |
if ($VERBOSE_PREORDER == 1) { print "___ adding $daughter_confirmed_goid to daughter $daughter_name of $node_name\n";} | |
if (!($CONFIRMED_GOID->{$daughter_name})) { $CONFIRMED_GOID->{$daughter_name} = $daughter_confirmed_goid;} | |
else { print STDERR "ERROR: daughter_name $daughter_name should not have a confirmed goid already.\n"; exit;} | |
} | |
} | |
} | |
} | |
#------------------------------------------------------------------# | |
# IF $node IS A SPECIATION NODE, ADD ALL ITS CONFIRMED TERMS TO THE | |
# CONFIRMED LISTS OF BOTH ITS CHILDREN: | |
sub add_confirmed_terms_to_children | |
{ | |
my $node = $_[0]; | |
my $node_name = $_[1]; | |
my $CONFIRMED_GOID = $_[2]; | |
my $VERBOSE_PREORDER = $_[3]; | |
my $daughter; | |
my $daughter_name; | |
my $confirmed_goid; | |
if ($VERBOSE_PREORDER == 1) { print "___ adding confirmed goids for speciation node $node_name to its chidren's confirmed lists\n";} | |
# FIND THE CONFIRMED GOIDs OF $node: | |
if ($CONFIRMED_GOID->{$node_name}) | |
{ | |
$confirmed_goid = $CONFIRMED_GOID->{$node_name}; | |
} | |
else # $node DOES NOT HAVE ANY CONFIRMED GOIDs. | |
{ | |
return; | |
} | |
# FIND THE CHILDREN OF NODE $node: | |
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $node: | |
foreach $daughter ($node->children) | |
{ | |
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); } | |
else { $daughter_name = $daughter->internalID;} | |
if ($VERBOSE_PREORDER == 1) { print "___ adding $confirmed_goid to daughter $daughter_name of $node_name\n";} | |
if (!($CONFIRMED_GOID->{$daughter_name})) { $CONFIRMED_GOID->{$daughter_name} = $confirmed_goid;} | |
else { print STDERR "ERROR: daughter_name $daughter_name should not have a confirmed goid already.\n"; exit;} | |
} | |
} | |
} | |
#------------------------------------------------------------------# | |
# RECORD THE GO TERMS FOR NODE $node. IN THE PRE-ORDER TRAVERSAL, | |
# WE ADD TO THE CONFIRMED TERM LIST OF A NODE THOSE PROVISIONAL | |
# TERMS PRESENT IN BOTH CHILDREN OF THE CURRENT NODE: | |
sub assign_confirmed_go_terms | |
{ | |
my $PROVISIONAL_GOID = $_[0]; # | |
my $node = $_[1]; # | |
my $node_name = $_[2]; # | |
my $CONFIRMED_GOID = $_[3]; # | |
my $VERBOSE_PREORDER = $_[4]; # | |
my $GOID = $_[5]; # | |
my $daughter; # | |
my $daughter_name; # | |
my $daughter_seqname; # | |
my $daughter_goids; # | |
my @daughter_goids; # | |
my $i; # | |
my $no_daughters = 0; # | |
my %DAUGHTER_GOID = (); # | |
my %SEEN_GOID = (); # | |
my @all_goids; # | |
my $goid; # | |
my $confirmed_goid = ""; # | |
my $shared_goid; # | |
my $j; # | |
if ($VERBOSE_PREORDER == 1) { print "___ finding confirmed goids for node $node_name\n";} | |
# FIND THE CHILDREN OF NODE $node: | |
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $node: | |
foreach $daughter ($node->children) | |
{ | |
$no_daughters++; | |
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); } | |
else { $daughter_name = $daughter->internalID;} | |
if ($VERBOSE_PREORDER == 1) { print "___ looking at daughter $daughter_name of parent $node_name\n";} | |
# FIND THE GO TERMS OF NODE $daughter: | |
if ($daughter->is_leaf) | |
{ | |
$daughter_seqname = $daughter->get_tag_value('O'); | |
if ($GOID->{$daughter_seqname}) | |
{ | |
$daughter_goids = $GOID->{$daughter_seqname}; | |
if ($VERBOSE_PREORDER == 1) { print "___ daughter $daughter_name ($daughter_seqname) has goids $daughter_goids\n";} | |
@daughter_goids = split(/\,/,$daughter_goids); | |
for ($i = 0; $i <= $#daughter_goids; $i++) | |
{ | |
$DAUGHTER_GOID{$no_daughters."_".$daughter_goids[$i]} = 1; | |
if (!($SEEN_GOID{$daughter_goids[$i]})) | |
{ | |
$SEEN_GOID{$daughter_goids[$i]} = 1; | |
@all_goids = (@all_goids,$daughter_goids[$i]); | |
} | |
} | |
} | |
} | |
else | |
{ | |
if ($PROVISIONAL_GOID->{$daughter_name}) | |
{ | |
$daughter_goids = $PROVISIONAL_GOID->{$daughter_name}; | |
if ($VERBOSE_PREORDER == 1) { print "___ daughter $daughter_name has goids $daughter_goids\n";} | |
@daughter_goids = split(/\,/,$daughter_goids); | |
for ($i = 0; $i <= $#daughter_goids; $i++) | |
{ | |
$DAUGHTER_GOID{$no_daughters."_".$daughter_goids[$i]} = 1; | |
if (!($SEEN_GOID{$daughter_goids[$i]})) | |
{ | |
$SEEN_GOID{$daughter_goids[$i]} = 1; | |
@all_goids = (@all_goids,$daughter_goids[$i]); | |
} | |
} | |
} | |
} | |
} | |
} | |
# FIND THE GO IDS THAT ARE SHARED BY ALL THE DAUGHTERS OF NODE $node: | |
$confirmed_goid = ""; | |
for ($i = 0; $i <= $#all_goids; $i++) | |
{ | |
$goid = $all_goids[$i]; | |
$shared_goid = 1; | |
for ($j = 1; $j <= $no_daughters; $j++) | |
{ | |
if (!($DAUGHTER_GOID{$j."_".$goid})) { $shared_goid = 0;} | |
} | |
if ($shared_goid == 1) | |
{ | |
$confirmed_goid = $confirmed_goid.",".$goid; | |
} | |
} | |
if ($confirmed_goid ne "") | |
{ | |
$confirmed_goid = substr($confirmed_goid,1,length($confirmed_goid)-1); | |
if ($VERBOSE_PREORDER == 1) { print "___ node $node_name has confirmed goid $confirmed_goid\n";} | |
$CONFIRMED_GOID->{$node_name} = $confirmed_goid; | |
} | |
} | |
#------------------------------------------------------------------# | |
# RECORD THE GO TERMS FOR PARENT $parent. IN THE POST-ORDER | |
# TRAVERSAL, WE ASSIGN TO EACH INTERNAL NODE A SET OF PROVISIONAL | |
# TERMS THAT IS THE UNION OF THE TERMS ASSIGNED TO ITS CHILDREN: | |
sub assign_provisional_go_terms | |
{ | |
my $GOID = $_[0]; # THE GO TERMS. | |
my $parent = $_[1]; # | |
my $parent_name = $_[2]; # | |
my $PROVISIONAL_GOID = $_[3]; # | |
my $VERBOSE_POSTORDER = $_[4]; # | |
my $daughter; # | |
my $daughter_name; # | |
my $daughter_seqname; # | |
my $daughter_goids; # | |
my @daughter_goids; # | |
my $provisional_goid = ""; # | |
my $i; # | |
my %SEEN = (); # | |
if ($VERBOSE_POSTORDER == 1) { print "___ finding provisional goids for parent $parent_name\n";} | |
# FIND THE CHILDREN OF NODE $parent: | |
if (!($parent->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $parent: | |
foreach $daughter ($parent->children) | |
{ | |
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); } | |
else { $daughter_name = $daughter->internalID;} | |
if ($VERBOSE_POSTORDER == 1) { print "___ looking at daughter $daughter_name of parent $parent_name\n";} | |
# FIND THE GO TERMS OF NODE $daughter: | |
if ($daughter->is_leaf) | |
{ | |
$daughter_seqname = $daughter->get_tag_value('O'); | |
if ($GOID->{$daughter_seqname}) | |
{ | |
$daughter_goids = $GOID->{$daughter_seqname}; | |
if ($VERBOSE_POSTORDER == 1) { print "___ daughter $daughter_name ($daughter_seqname) has goids $daughter_goids\n";} | |
@daughter_goids = split(/\,/,$daughter_goids); | |
for ($i = 0; $i <= $#daughter_goids; $i++) | |
{ | |
if (!($SEEN{$daughter_goids[$i]})) | |
{ | |
$SEEN{$daughter_goids[$i]} = 1; | |
$provisional_goid = $provisional_goid.",".$daughter_goids[$i]; | |
} | |
} | |
} | |
} | |
else | |
{ | |
if ($PROVISIONAL_GOID->{$daughter_name}) | |
{ | |
$daughter_goids = $PROVISIONAL_GOID->{$daughter_name}; | |
if ($VERBOSE_POSTORDER == 1) { print "___ daughter $daughter_name has goids $daughter_goids\n";} | |
@daughter_goids = split(/\,/,$daughter_goids); | |
for ($i = 0; $i <= $#daughter_goids; $i++) | |
{ | |
if (!($SEEN{$daughter_goids[$i]})) | |
{ | |
$SEEN{$daughter_goids[$i]} = 1; | |
$provisional_goid = $provisional_goid.",".$daughter_goids[$i]; | |
} | |
} | |
} | |
} | |
} | |
} | |
else { print STDERR "ERROR: parent $parent_name is a sequence.\n"; exit;} | |
if ($provisional_goid ne "") | |
{ | |
$provisional_goid = substr($provisional_goid,1,length($provisional_goid)-1); | |
if ($VERBOSE_POSTORDER == 1) { print "___ parent $parent_name has provisional goid $provisional_goid\n";} | |
$PROVISIONAL_GOID->{$parent_name} = $provisional_goid; | |
} | |
} | |
#------------------------------------------------------------------# | |
# DO A POSTORDER TRAVERSAL OF THE TREE: | |
sub do_postorder_traversal | |
{ | |
my $tree = $_[0]; # THE TREE OBJECT. | |
my $GOID = $_[1]; # THE GO IDS. | |
my $root; # | |
my $root_name; # | |
my @nodes; # | |
my @parents; # | |
my $no_rounds; # | |
my $no_new_nodes; # | |
my %SEEN_NODE = (); # | |
my %SEEN_PARENT = (); # | |
my $node; # | |
my $node_name; # | |
my $VERBOSE_POSTORDER = 1; # | |
my $parent; # | |
my $parent_name; # | |
my $daughter; # | |
my $daughter_name; # | |
my $i; # | |
my %PROVISIONAL_GOID = (); # | |
# FIND THE ROOT OF THE TREE: | |
$root = $tree->root; | |
$root_name = $root->internalID; | |
# LIST ALL THE LEAVES FROM THE TREE: | |
@nodes = &Avril_modules::empty_array(@nodes); | |
@parents = &Avril_modules::empty_array(@parents); | |
@nodes = $tree->get_leaves; | |
$no_rounds = 0; | |
NEXT_ROUND1: | |
$no_rounds++; | |
$no_new_nodes = 0; | |
%SEEN_NODE = (); | |
%SEEN_PARENT = (); | |
# LOOP THROUGH EACH NODE IN THE ARRAY @nodes. WHEN $no_rounds=1, | |
# THIS ARRAY CONTAINS THE LEAVES. WHEN $no_rounds=2, THIS ARRAY | |
# CONTAINS THEIR PARENTS. WHEN $no_rounds=3, THIS ARRAY CONTAINS | |
# THEIR GRANDPARENTS, AND SO ON. | |
foreach $node(@nodes) | |
{ | |
if ($node->is_leaf) { $node_name = $node->id(); } | |
else { $node_name = $node->internalID(); } | |
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited node $node_name\n";} | |
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;} | |
# FIND THE PARENT OF NODE $node: | |
if (defined($node->parent)) { $parent = $node->parent; } | |
else { print STDERR "ERROR: do not know parent of node $node_name.\n"; exit;} | |
$parent_name = $parent->internalID; | |
# FIND THE OTHER CHILD OF NODE $parent: | |
if (!($parent->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $parent: | |
foreach $daughter ($parent->children) | |
{ | |
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); } | |
else { $daughter_name = $daughter->internalID;} | |
if ($daughter_name ne $node_name) | |
{ | |
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited sibling $daughter_name\n";} | |
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;} | |
} | |
} | |
} | |
else { print STDERR "ERROR: parent $parent_name is a sequence.\n"; exit;} | |
# RECORD THAT WE VISITED THE PARENT $parent: | |
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: saw parent $parent_name\n";} | |
if (!($SEEN_PARENT{$parent_name}) && $parent_name ne $root_name) | |
{ | |
@parents = (@parents,$parent); | |
$SEEN_PARENT{$parent_name} = 1; | |
# RECORD THE GO TERMS FOR PARENT $parent. IN THE POST-ORDER | |
# TRAVERSAL, WE ASSIGN TO EACH INTERNAL NODE A SET OF PROVISIONAL | |
# TERMS THAT IS THE UNION OF THE TERMS ASSIGNED TO ITS CHILDREN: | |
&assign_provisional_go_terms($GOID,$parent,$parent_name,\%PROVISIONAL_GOID,$VERBOSE_POSTORDER); | |
} | |
} | |
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds rounds: number new = $no_new_nodes\n";} | |
# IF WE DID SEE SOME NEW NODES ON THIS TRAVERSAL: | |
if ($no_new_nodes > 0) | |
{ | |
@nodes = &Avril_modules::empty_array(@nodes); | |
for ($i = 0; $i <= $#parents; $i++) { @nodes = (@nodes,$parents[$i]);} | |
@parents = &Avril_modules::empty_array(@parents); | |
goto NEXT_ROUND1; | |
} | |
# VISIT ROOT: | |
if ($VERBOSE_POSTORDER == 1) { print "$no_rounds: visited root $root_name\n";} | |
&assign_provisional_go_terms($GOID,$root,$root_name,\%PROVISIONAL_GOID,$VERBOSE_POSTORDER); | |
if ($VERBOSE_POSTORDER == 1) { print "Finished post-order traversal\n";} | |
return(\%PROVISIONAL_GOID); | |
} | |
#------------------------------------------------------------------# | |
# CHECK WHETHER THE CONFIRMED GO TERMS FOR NODE $node ARE | |
# DIFFERENT FROM THOSE OF ITS CHILDREN. | |
sub compare_parent_to_children | |
{ | |
my $CONFIRMED_GOID = $_[0]; # | |
my $node = $_[1]; # | |
my $node_name = $_[2]; # | |
my $VERBOSE_CHANGES = $_[3]; # | |
my $GOID = $_[4]; # | |
my $daughter; # | |
my $daughter_name; # | |
my $daughter_seqname; # | |
my $daughter_goids; # | |
my $node_goids; # | |
my @daughter_goids; # | |
my @node_goids; # | |
my %NODE_GOID = (); # | |
my %DAUGHTER_GOID = (); # | |
my $i; # | |
my $goid; # | |
# FIND THE GOIDS FOR $node: | |
if (!($CONFIRMED_GOID->{$node_name})) { $node_goids = "none"; } | |
else { $node_goids = $CONFIRMED_GOID->{$node_name};} | |
if ($VERBOSE_CHANGES == 1) { print "___ comparing parent $node_name ($node_goids) to its children\n";} | |
# FIND THE CHILDREN OF NODE $node: | |
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $node: | |
foreach $daughter ($node->children) | |
{ | |
if ($daughter->is_leaf) { $daughter_name = $daughter->id(); } | |
else { $daughter_name = $daughter->internalID;} | |
if ($VERBOSE_CHANGES == 1) { print "___ comparing parent $node_name to daughter $daughter_name\n";} | |
# FIND THE CONFIRMED GO TERMS OF NODE $daughter: | |
$daughter_goids = ""; | |
if ($daughter->is_leaf) | |
{ | |
$daughter_seqname = $daughter->get_tag_value('O'); | |
if ($GOID->{$daughter_seqname}) | |
{ | |
$daughter_goids = $GOID->{$daughter_seqname}; | |
} | |
else { $daughter_goids = "none";} | |
if ($VERBOSE_CHANGES == 1) { print "___ daughter $daughter_name ($daughter_seqname) has goids $daughter_goids\n";} | |
} | |
else | |
{ | |
if ($CONFIRMED_GOID->{$daughter_name}) { $daughter_goids = $CONFIRMED_GOID->{$daughter_name};} | |
else { $daughter_goids = "none"; } | |
if ($VERBOSE_CHANGES == 1) { print "___ daughter $daughter_name has goids $daughter_goids\n";} | |
} | |
# CHECK IF $daughter_goids IS DIFFERENT FROM $node_goids: | |
@node_goids = split(/\,/,$node_goids); | |
@daughter_goids = split(/\,/,$daughter_goids); | |
%NODE_GOID = (); | |
%DAUGHTER_GOID = (); | |
# RECORD ALL THE GOIDS FOR $node: | |
for ($i = 0; $i <= $#node_goids; $i++) | |
{ | |
if ($node_goids[$i] ne 'none') | |
{ | |
$NODE_GOID{$node_goids[$i]} = 1; | |
} | |
} | |
# RECORD ALL THE GOIDS FOR $daughter: | |
for ($i = 0; $i <= $#daughter_goids; $i++) | |
{ | |
if ($daughter_goids[$i] ne 'none') | |
{ | |
$DAUGHTER_GOID{$daughter_goids[$i]} = 1; | |
} | |
} | |
# CHECK IF $node HAS ANY GOIDS THAT $daughter DOES NOT HAVE: | |
foreach $goid (keys %NODE_GOID) | |
{ | |
if (!($DAUGHTER_GOID{$goid})) | |
{ | |
print "*** node $node_name has GOID $goid, that daughter $daughter_name does not have\n"; | |
} | |
} | |
# CHECK IF $daughter HAS ANY GOIDS THAT $node DOES NOT HAVE: | |
foreach $goid (keys %DAUGHTER_GOID) | |
{ | |
if (!($NODE_GOID{$goid})) | |
{ | |
print "*** daughter $daughter_name has GOID $goid, that parent $node_name does not have\n"; | |
} | |
} | |
} | |
} | |
else { print STDERR "ERROR: parent $node_name is a sequence.\n"; exit;} | |
} | |
#------------------------------------------------------------------# | |
# FIND NODES IN THE TREE WHERE CONFIRMED GO TERMS OF A PARENT NODE | |
# DIFFERS FROM THOSE OF ITS DAUGHTER NODES: | |
sub find_changes_in_goid | |
{ | |
my $tree = $_[0]; # | |
my $CONFIRMED_GOID = $_[1]; # | |
my $GOID = $_[2]; # | |
my $root; # | |
my $root_name; # | |
my @nodes; # | |
my @children; # | |
my $no_rounds; # | |
my $no_new_nodes; # | |
my %SEEN_NODE = (); # | |
my %SEEN_CHILD = (); # | |
my $node; # | |
my $node_name; # | |
my $VERBOSE_CHANGES = 1; # | |
my $child; # | |
my $child_name; # | |
my $i; # | |
# FIND THE ROOT OF THE TREE: | |
$root = $tree->root; | |
$root_name = $root->internalID; | |
# ARRAY @nodes STARTS BY CONTAINING JUST THE ROOT NODE: | |
@nodes = &Avril_modules::empty_array(@nodes); | |
@children = &Avril_modules::empty_array(@children); | |
@nodes = (@nodes,$root); | |
$no_rounds = 0; | |
NEXT_ROUND3: | |
$no_rounds++; | |
$no_new_nodes = 0; | |
%SEEN_NODE = (); | |
%SEEN_CHILD = (); | |
# LOOP THROUGH EACH NODE IN THE ARRAY @nodes. WHEN $no_rounds=1, | |
# THIS ARRAY CONTAINS THE ROOT. WHEN $no_rounds=2, THIS ARRAY | |
# CONTAINS ITS CHILDREN. WHEN $no_rounds=3, THIS ARRAY CONTAINS | |
# ITS GRANDCHILDREN, AND SO ON. | |
foreach $node(@nodes) | |
{ | |
if ($node->is_leaf) { $node_name = $node->id(); } | |
else { $node_name = $node->internalID(); } | |
if ($VERBOSE_CHANGES == 1) { print "$no_rounds: visited node $node_name\n";} | |
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;} | |
# CHECK WHETHER THE CONFIRMED GO TERMS FOR NODE $node ARE | |
# DIFFERENT FROM THOSE OF ITS CHILDREN. | |
if (!($node->is_leaf)) | |
{ | |
&compare_parent_to_children($CONFIRMED_GOID,$node,$node_name,$VERBOSE_CHANGES,$GOID); | |
} | |
# FIND THE CHILDREN OF NODE $node: | |
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $node: | |
foreach $child ($node->children) | |
{ | |
if ($child->is_leaf) { $child_name = $child->id(); } | |
else { $child_name = $child->internalID;} | |
if ($VERBOSE_CHANGES == 1) { print "$no_rounds: saw child $child_name\n"; } | |
# RECORD THAT WE SAW THE CHILD $child: | |
if (!($SEEN_CHILD{$child_name})) | |
{ | |
@children = (@children,$child); | |
$SEEN_CHILD{$child_name} = 1; | |
} | |
} | |
} # ELSE, THIS NODE IS A SEQUENCE, SO DOES NOT HAVE ANY CHILDREN. | |
} | |
if ($VERBOSE_CHANGES == 1) { print "$no_rounds rounds: number new = $no_new_nodes\n";} | |
# IF WE DID SEE SOME NEW NODES ON THIS TRAVERSAL: | |
if ($no_new_nodes > 0) | |
{ | |
@nodes = &Avril_modules::empty_array(@nodes); | |
for ($i = 0; $i <= $#children; $i++) { @nodes = (@nodes,$children[$i]);} | |
@children = &Avril_modules::empty_array(@children); | |
goto NEXT_ROUND3; | |
} | |
if ($VERBOSE_CHANGES == 1) { print "Finished searching for changes in GO id\n";} | |
} | |
#------------------------------------------------------------------# | |
# DO A PREORDER TRAVERSAL OF THE TREE TO ASSIGN CONFIRMED GO TERMS: | |
sub do_preorder_traversal | |
{ | |
my $tree = $_[0]; # | |
my $PROVISIONAL_GOID = $_[1]; # | |
my $GOID = $_[2]; # | |
my %CONFIRMED_GOID = (); # | |
my $root; # | |
my $root_name; # | |
my @nodes; # | |
my @children; # | |
my $no_rounds = 0; # | |
my $no_new_nodes; # | |
my %SEEN_NODE = (); # | |
my %SEEN_CHILD = (); # | |
my $node; # | |
my $node_name; # | |
my $VERBOSE_PREORDER = 1; # | |
my $child; # | |
my $child_name; # | |
my $i; # | |
my $node_type; # | |
# FIND THE ROOT OF THE TREE: | |
$root = $tree->root; | |
$root_name = $root->internalID; | |
# ARRAY @nodes STARTS BY CONTAINING JUST THE ROOT NODE: | |
@nodes = &Avril_modules::empty_array(@nodes); | |
@children = &Avril_modules::empty_array(@children); | |
@nodes = (@nodes,$root); | |
$no_rounds = 0; | |
NEXT_ROUND2: | |
$no_rounds++; | |
$no_new_nodes = 0; | |
%SEEN_NODE = (); | |
%SEEN_CHILD = (); | |
# LOOP THROUGH EACH NODE IN THE ARRAY @nodes. WHEN $no_rounds=1, | |
# THIS ARRAY CONTAINS THE ROOT. WHEN $no_rounds=2, THIS ARRAY | |
# CONTAINS ITS CHILDREN. WHEN $no_rounds=3, THIS ARRAY CONTAINS | |
# ITS GRANDCHILDREN, AND SO ON. | |
foreach $node(@nodes) | |
{ | |
if ($node->is_leaf) { $node_name = $node->id(); } | |
else { $node_name = $node->internalID(); } | |
if ($VERBOSE_PREORDER == 1) { print "$no_rounds: visited node $node_name\n";} | |
if (!($SEEN_NODE{$node_name})) { $SEEN_NODE{$node_name} = 1; $no_new_nodes++;} | |
# RECORD THE GO TERMS FOR NODE $node. IN THE PRE-ORDER TRAVERSAL, | |
# WE ADD TO THE CONFIRMED TERM LIST OF A NODE THOSE PROVISIONAL | |
# TERMS PRESENT IN BOTH CHILDREN OF THE CURRENT NODE: | |
&assign_confirmed_go_terms($PROVISIONAL_GOID,$node,$node_name,\%CONFIRMED_GOID,$VERBOSE_PREORDER,$GOID); | |
# IF THIS NODE IS AN INTERNAL NODE: | |
if (!($node->is_leaf)) | |
{ | |
# CHECK IF $node IS A SPECIATION NODE OR A DUPLICATION NODE: | |
# $node_type HAS VALUE Y FOR DUPLICATION, N FOR SPECIATION. | |
$node_type = $node->get_tag_value('D'); | |
# IF $node IS A SPECIATION NODE, ADD ALL ITS CONFIRMED TERMS TO THE | |
# CONFIRMED LISTS OF BOTH ITS CHILDREN: | |
if ($node_type eq 'N') { &add_confirmed_terms_to_children($node,$node_name,\%CONFIRMED_GOID,$VERBOSE_PREORDER);} | |
# IF $node IS A DUPLICATION NODE, ONLY CONFIRM IN THE CHILDREN THOSE | |
# CONFIRMED TERMS IN THE CURRENT NODE THAT ARE PROVISIONAL TERMS IN | |
# THE CHILD NODE. | |
elsif ($node_type eq 'Y') { &add_confirmed_terms_to_children2($node,$node_name,\%CONFIRMED_GOID,$VERBOSE_PREORDER, | |
$PROVISIONAL_GOID,$GOID);} | |
else { print STDERR "ERROR: node_type $node_type.\n"; exit;} | |
} | |
# FIND THE CHILDREN OF NODE $node: | |
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
# GET THE TWO DAUGHTER NODES OF $node: | |
foreach $child ($node->children) | |
{ | |
if ($child->is_leaf) { $child_name = $child->id(); } | |
else { $child_name = $child->internalID;} | |
if ($VERBOSE_PREORDER == 1) { print "$no_rounds: saw child $child_name\n"; } | |
# RECORD THAT WE SAW THE CHILD $child: | |
if (!($SEEN_CHILD{$child_name})) | |
{ | |
@children = (@children,$child); | |
$SEEN_CHILD{$child_name} = 1; | |
} | |
} | |
} # ELSE, THIS NODE IS A SEQUENCE, SO DOES NOT HAVE ANY CHILDREN. | |
} | |
if ($VERBOSE_PREORDER == 1) { print "$no_rounds rounds: number new = $no_new_nodes\n";} | |
# IF WE DID SEE SOME NEW NODES ON THIS TRAVERSAL: | |
if ($no_new_nodes > 0) | |
{ | |
@nodes = &Avril_modules::empty_array(@nodes); | |
for ($i = 0; $i <= $#children; $i++) { @nodes = (@nodes,$children[$i]);} | |
@children = &Avril_modules::empty_array(@children); | |
goto NEXT_ROUND2; | |
} | |
if ($VERBOSE_PREORDER == 1) { print "Finished pre-order traversal\n";} | |
return(\%CONFIRMED_GOID); | |
} | |
#------------------------------------------------------------------# | |
# INFER THE GO IDS OF ANCESTRAL NODES IN THESE FAMILIES: | |
sub infer_ancestral_nodes | |
{ | |
my $GOID = $_[0]; # | |
my $families = $_[1]; #> | |
my $i; #> | |
my $family_id; #> | |
my $tree; #> | |
my $dbc; #> | |
my $family; #> | |
my $famh; #> | |
my $leaf; #> | |
my $leaf2; # | |
my $leaf_name; #> | |
my $leaf2_name; # | |
my $leaf2_species; # | |
my $leaf_species; # | |
my $leaf2_id; # | |
my $leaf_id; # | |
my @temp; # | |
my $lca; # | |
my $lca_name; # | |
my $lca_type; # | |
my $parent; #> | |
my $parent_name; #> | |
my $parent_type; # | |
my $duplications1; # | |
my $duplications2; # | |
my $leaf_goids; # | |
my $leaf2_goids; # | |
my @leaf_goids; # | |
my @leaf2_goids; # | |
my $root; # | |
my $root_name; # | |
my $leaf_steps_to_root; # | |
my $leaf2_steps_to_root; # | |
my $k; # | |
my $m; # | |
my %SEEN_GOID = (); # | |
my @nodes; # | |
my $node; # | |
my $node_name; # | |
my $node_type; # | |
my $no_daughters; # | |
my $daughter1; # | |
my $daughter2; # | |
my $daughter1_name; # | |
my $daughter2_name; # | |
my $daughter; #> | |
my $daughter_name; #> | |
my @subnodes; # | |
my %SEEN_SUBNODE = (); # | |
my $subnode; # | |
my $subnode_name; # | |
my $new; # | |
my $p; # | |
my $subsubnode; # | |
my @parents; | |
my $no_rounds; | |
my $no_new_nodes; | |
my %SEEN_NODE = (); | |
my %SEEN_PARENT = (); | |
my $PROVISIONAL_GOID; | |
my $CONFIRMED_GOID; | |
# CONNECT TO THE TREEFAM DATABASE USING DEFAULT CONFIGURATION: | |
$dbc = Treefam::DBConnection->new(); | |
for ($i = 0; $i < @$families; $i++) | |
{ | |
$family_id = $families->[$i]; # eg. TF314159 | |
if ($VERBOSE3 == 1) { print "Looking at family $family_id\n";} | |
# GET A FAMILY: | |
$famh = $dbc->get_FamilyHandle(); | |
$family = $famh->get_by_id($family_id); | |
if (!(defined($family))) | |
{ | |
print "Do not have $family_id in the database...\n"; | |
goto NEXT_FAMILY; | |
} | |
# GET THE CLEAN TREE FOR THIS FAMILY: | |
$tree = $family->get_tree('clean'); | |
# DO A POSTORDER TRAVERSAL OF THE TREE TO ASSIGN PROVISIONAL GO TERMS: | |
$PROVISIONAL_GOID = &do_postorder_traversal($tree,$GOID); | |
# DO A PREORDER TRAVERSAL OF THE TREE TO ASSIGN CONFIRMED GO TERMS: | |
$CONFIRMED_GOID = &do_preorder_traversal($tree,$PROVISIONAL_GOID,$GOID); | |
# FIND NODES IN THE TREE WHERE CONFIRMED GO TERMS OF A PARENT NODE | |
# DIFFERS FROM THOSE OF ITS DAUGHTER NODES: | |
&find_changes_in_goid($tree,$CONFIRMED_GOID,$GOID); | |
exit; #xxx | |
# LIST ALL THE LEAVES FROM THE TREE: | |
foreach $leaf($tree->get_leaves) | |
{ | |
# COMPARE TO ALL OTHER LEAVES IN THE TREE: | |
foreach $leaf2($tree->get_leaves) | |
{ | |
# SEE IF $leaf AND $leaf2 HAVE GO IDS ASSIGNED: | |
$leaf_name = $leaf->id(); | |
$leaf2_name = $leaf2->id(); | |
$leaf_id = $leaf->get_tag_value('O'); | |
$leaf2_id = $leaf2->get_tag_value('O'); | |
@temp = split(/_/,$leaf_name); | |
$leaf_species = $temp[1]; | |
@temp = split(/_/,$leaf2_name); | |
$leaf2_species = $temp[1]; | |
# IF THESE LEAVES BELONG TO SPECIES THAT WE ARE INTERESTED IN: | |
if (($leaf_species eq 'DROME' || $leaf_species eq 'MOUSE' || $leaf_species eq 'RAT' || | |
$leaf_species eq 'CAEEL' || $leaf_species eq 'HUMAN' || $leaf_species eq 'BRARE' || | |
$leaf_species eq 'CHICK') && | |
($leaf2_species eq 'DROME' || $leaf2_species eq 'MOUSE' || $leaf2_species eq 'RAT' || | |
$leaf2_species eq 'CAEEL' || $leaf2_species eq 'HUMAN' || $leaf2_species eq 'BRARE' || | |
$leaf2_species eq 'CHICK')) | |
{ | |
# IF BOTH LEAVES HAVE GO IDS ANNOTATED: | |
if ($GOID->{$leaf_id} && $GOID->{$leaf2_id} && $leaf_id ne $leaf2_id) | |
{ | |
# FIND THE LAST COMMON ANCESTOR NODE OF $leaf AND $leaf2: | |
$lca = $tree->get_last_common_ancestor($leaf,$leaf2); | |
$lca_name= $lca->internalID; | |
$lca_type = $lca->get_tag_value('D'); | |
if ($lca_type eq 'Y') # THE ANCESTRAL NODE IS A DUPLICATION: | |
{ | |
$duplications1 = 1; $duplications2 = 1; | |
$leaf_steps_to_root = "-1"; $leaf2_steps_to_root = -1; | |
goto FOUND_ROOT2; | |
} | |
# SEE IF THERE ARE ANY DUPLICATION NODES BETWEEN $leaf AND $lca: | |
$duplications1 = 0; | |
if (defined($leaf->parent)) { $parent = $leaf->parent; } | |
else { print STDERR "ERROR: do not know parent of leaf $leaf_id.\n"; exit;} | |
# $parent_type HAS VALUE Y FOR DUPLICATION, N FOR SPECIATION. | |
$parent_type = $parent->get_tag_value('D'); | |
if ($parent_type eq 'Y') { $duplications1 = 1;} | |
$parent_name = $parent->internalID; | |
if ($parent_name eq $lca_name) { goto FOUND_LCA1;} | |
FIND_PARENT1: | |
if (defined($parent->parent)) | |
{ | |
$parent = $parent->parent; | |
$parent_type = $parent->get_tag_value('D'); | |
if ($parent_type eq 'Y') { $duplications1 = 1;} | |
$parent_name = $parent->internalID; | |
if ($parent_name eq $lca_name) { goto FOUND_LCA1;} | |
goto FIND_PARENT1; | |
} | |
FOUND_LCA1: | |
# FIND HOW MANY STEPS THERE ARE BETWEEN $leaf AND THE ROOT $root: | |
$leaf_steps_to_root = 0; | |
if (defined($leaf->parent)) { $parent = $leaf->parent; } | |
else { print STDERR "ERROR: do not know parent of leaf $leaf_id.\n"; exit;} | |
$parent_type = $parent->get_tag_value('D'); | |
if ($parent_type eq 'N') { $leaf_steps_to_root++;} | |
$parent_name = $parent->internalID; | |
if ($parent_name eq $root_name) { goto FOUND_ROOT1;} | |
FIND_ROOT1: | |
if (defined($parent->parent)) | |
{ | |
$parent = $parent->parent; | |
$parent_type = $parent->get_tag_value('D'); | |
if ($parent_type eq 'N') { $leaf_steps_to_root++;} | |
$parent_name = $parent->internalID; | |
if ($parent_name eq $root_name) { goto FOUND_ROOT1;} | |
goto FIND_ROOT1; | |
} | |
FOUND_ROOT1: | |
# SEE IF THERE ARE ANY DUPLICATION NODES BETWEEN $leaf2 AND $lca: | |
$duplications2 = 0; | |
if (defined($leaf2->parent)) { $parent = $leaf2->parent; } | |
else { print STDERR "ERROR: do not know parent of leaf2 $leaf2_id.\n"; exit;} | |
# $parent_type HAS VALUE Y FOR DUPLICATION, N FOR SPECIATION. | |
$parent_type = $parent->get_tag_value('D'); | |
if ($parent_type eq 'Y') { $duplications2 = 1;} | |
$parent_name = $parent->internalID; | |
if ($parent_name eq $lca_name) { goto FOUND_LCA2;} | |
FIND_PARENT2: | |
if (defined($parent->parent)) | |
{ | |
$parent = $parent->parent; | |
$parent_type = $parent->get_tag_value('D'); | |
if ($parent_type eq 'Y') { $duplications2 = 1;} | |
$parent_name = $parent->internalID; | |
if ($parent_name eq $lca_name) { goto FOUND_LCA2;} | |
goto FIND_PARENT2; | |
} | |
FOUND_LCA2: | |
# FIND HOW MANY STEPS THERE ARE BETWEEN $leaf2 AND THE ROOT $root: | |
$leaf2_steps_to_root = 0; | |
if (defined($leaf2->parent)) { $parent = $leaf2->parent; } | |
else { print STDERR "ERROR: do not know parent of leaf2 $leaf2_id.\n"; exit;} | |
$parent_type = $parent->get_tag_value('D'); | |
if ($parent_type eq 'N') { $leaf2_steps_to_root++;} | |
$parent_name = $parent->internalID; | |
if ($parent_name eq $root_name) { goto FOUND_ROOT2;} | |
FIND_ROOT2: | |
if (defined($parent->parent)) | |
{ | |
$parent = $parent->parent; | |
$parent_type = $parent->get_tag_value('D'); | |
if ($parent_type eq 'N') { $leaf2_steps_to_root++;} | |
$parent_name = $parent->internalID; | |
if ($parent_name eq $root_name) { goto FOUND_ROOT2;} | |
goto FIND_ROOT2; | |
} | |
FOUND_ROOT2: | |
# IF THERE ARE NO DUPLICATIONS, WE CAN INFER THAT THE NODE $lca | |
# SHARES THE GO IDS THAT $leaf1 AND $leaf2 HAVE IN COMMON: | |
if ($VERBOSE == 1) | |
{ | |
print "\n$family_id $leaf_id $GOID->{$leaf_id} $leaf2_id $GOID->{$leaf2_id}\n"; | |
print "$family_id duplications1 $duplications1 duplications2 $duplications2 lca_type $lca_type lca_name $lca_name\n"; | |
print "$family_id leaf_steps_to_root $leaf_steps_to_root leaf2_steps_to_root $leaf2_steps_to_root\n"; | |
} | |
if (($duplications1 == 0 && $duplications2 == 0) || # THERE ARE NO DUPLICATIONS BETWEEN THE GENES | |
($duplications1 == 1 && $duplications2 == 0 && | |
$leaf_steps_to_root > $leaf2_steps_to_root) || # THERE ARE DUPLICATIONS BETWEEN | |
# LCA AND $leaf, BUT $leaf2 IS AN OUTGROUP | |
($duplications1 == 0 && $duplications2 == 1 && | |
$leaf_steps_to_root < $leaf2_steps_to_root)) # THERE ARE DUPLICATIONS BETWEEN | |
# LCA AND $leaf2, BUT $leaf IS AN OUTGROUP | |
{ | |
$leaf_goids = $GOID->{$leaf_id}; | |
$leaf2_goids= $GOID->{$leaf2_id}; | |
@leaf_goids = split(/\,/,$leaf_goids); | |
@leaf2_goids= split(/\,/,$leaf2_goids); | |
# CHECK WHETHER THE TWO SEQUENCES HAVE ANY GO IDS IN COMMON: | |
for ($k = 0; $k <= $#leaf_goids; $k++) | |
{ | |
for ($m = 0; $m <= $#leaf2_goids; $m++) | |
{ | |
if ($leaf_goids[$k] eq $leaf2_goids[$m]) | |
{ | |
if (!($SEEN_GOID{$family_id."=".$lca_name."=".$leaf_goids[$k]})) | |
{ | |
$SEEN_GOID{$family_id."=".$lca_name."=".$leaf_goids[$k]} = 1; | |
print "$family_id: assigning $leaf_goids[$k] to node $lca_name ($leaf_id $leaf2_id)\n"; | |
if (!($GOID->{$family_id."=".$lca_name})) { $GOID->{$family_id."=".$lca_name} = $leaf_goids[$k];} | |
else { $GOID->{$family_id."=".$lca_name} = $GOID->{$family_id."=".$lca_name}.",".$leaf_goids[$k];} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
} | |
# GET ALL THE NODES IN THE TREE: | |
print "Getting all nodes in tree for $family_id...\n"; | |
@nodes = $tree->get_all_nodes; | |
foreach $node(@nodes) | |
{ | |
$node_name = $node->internalID; | |
if (!($node->is_leaf)) # IF THIS IS NOT A SEQUENCE. | |
{ | |
$node_type = $node->get_tag_value('D'); | |
if ($node_type eq 'Y') # THE NODE IS A DUPLICATION: | |
{ | |
# GET THE TWO DAUGHTER NODES OF $node: | |
$no_daughters = 0; | |
$daughter1 = ""; | |
$daughter2 = ""; | |
foreach $daughter ($node->children) | |
{ | |
$no_daughters++; | |
$daughter_name = $daughter->internalID; | |
if ($no_daughters == 1) { $daughter1_name = $daughter_name;} | |
elsif ($no_daughters == 2) { $daughter2_name = $daughter_name;} | |
@subnodes = &Avril_modules::empty_array(@subnodes); | |
@subnodes = (@subnodes,$daughter); | |
%SEEN_SUBNODE = (); | |
$SEEN_SUBNODE{$daughter} = 1; | |
# GET ALL THE DESCENDANTS OF $daughter: | |
LOOP_AGAIN1: | |
$new = 0; | |
for ($p = 0; $p <= $#subnodes; $p++) | |
{ | |
$subnode = $subnodes[$p]; | |
if (!($subnode->is_leaf)) | |
{ | |
foreach $subsubnode ($subnode->children) | |
{ | |
if (!($SEEN_SUBNODE{$subsubnode})) | |
{ | |
$SEEN_SUBNODE{$subsubnode} = 1; | |
$new++; | |
@subnodes = (@subnodes,$subsubnode); | |
} | |
} | |
} | |
} | |
if ($new > 0) { goto LOOP_AGAIN1;} | |
# FIND ALL THE GO IDS OF THE DESCENDANTS OF $daughter: | |
for ($p = 0; $p <= $#subnodes; $p++) | |
{ | |
$subnode = $subnodes[$p]; | |
if (!($subnode->is_leaf)) # IF IT IS NOT A LEAF NODE. | |
{ | |
$subnode_name = $subnode->internalID; | |
$subnode_name = $family_id."=".$subnode_name; | |
# CHECK IF THERE ARE GO SLIM IDS FOR THIS NODE: | |
if ($GOID->{$subnode_name}) | |
{ | |
print "$family_id = node $node_name => $daughter_name => $subnode_name: go $GOID->{$subnode_name}\n"; | |
if ($no_daughters == 1) { $daughter1 = $daughter1.",".$GOID->{$subnode_name};} | |
elsif ($no_daughters == 2) { $daughter2 = $daughter2.",".$GOID->{$subnode_name};} | |
} | |
} | |
} | |
} | |
# COMPARE THE GO IDS FOR $daughter1 AND $daughter2: | |
if ($daughter1 ne '' && $daughter2 ne '') | |
{ | |
&compare_goids($daughter1,$daughter2,$node_name,$family_id,$daughter1_name,$daughter2_name); | |
} | |
} | |
} | |
} | |
NEXT_FAMILY: | |
} | |
} | |
#------------------------------------------------------------------# | |
# COMPARE THE GO IDS FOR $daughter1 AND $daughter2: | |
sub compare_goids | |
{ | |
my $daughter1 = $_[0]; | |
my $daughter2 = $_[1]; | |
my $node_name = $_[2]; | |
my $family_id = $_[3]; | |
my $daughter1_name = $_[4]; | |
my $daughter2_name = $_[5]; | |
my @ids1; | |
my @ids2; | |
my %ID1 = (); | |
my $i; | |
my $same = 1; | |
my %SEEN = (); | |
$daughter1 = substr($daughter1,1,length($daughter1)-1); | |
$daughter2 = substr($daughter2,1,length($daughter2)-1); | |
@ids1 = split(/\,/,$daughter1); | |
@ids2 = split(/\,/,$daughter2); | |
%SEEN = (); | |
$daughter1 = ""; | |
for ($i = 0; $i <= $#ids1; $i++) | |
{ | |
if (!($SEEN{$ids1[$i]})) { $daughter1 = $daughter1.",".$ids1[$i]; $SEEN{$ids1[$i]} = 1;} | |
} | |
$daughter1 = substr($daughter1,1,length($daughter1)-1); | |
%SEEN = (); | |
$daughter2 = ""; | |
for ($i = 0; $i <= $#ids2; $i++) | |
{ | |
if (!($SEEN{$ids2[$i]})) { $daughter2 = $daughter2.",".$ids2[$i]; $SEEN{$ids2[$i]} = 1;} | |
} | |
$daughter2 = substr($daughter2,1,length($daughter2)-1); | |
@ids1 = &Avril_modules::empty_array(@ids1); | |
@ids2 = &Avril_modules::empty_array(@ids2); | |
@ids1 = split(/\,/,$daughter1); | |
@ids2 = split(/\,/,$daughter2); | |
for ($i = 0; $i <= $#ids1; $i++) { $ID1{$ids1[$i]} = 1;} | |
for ($i = 0; $i <= $#ids2; $i++) { if (!($ID1{$ids2[$i]})) { $same = 0;}} | |
if ($same == 0) | |
{ | |
print "$family_id node $node_name : daughter1 $daughter1 ($daughter1_name) and daughter2 $daughter2 ($daughter2_name) have different GO ids! ***\n"; | |
} | |
else | |
{ | |
print "$family_id node $node_name : daughter1 $daughter1 and daughter2 $daughter2 have same GO ids.\n"; | |
} | |
} | |
#------------------------------------------------------------------# | |
# READ IN THE INPUT FILE OF GO TERMS: | |
sub read_go | |
{ | |
my $file = $_[0]; | |
my $TAKE = $_[1]; | |
my $line; | |
my @temp; | |
my $family; | |
my $species; | |
my $goid; | |
my %GOID = (); | |
my %SEEN = (); | |
my @families; | |
my $name; | |
open(FILE,"$file") || die "ERROR: cannot open $file.\n"; | |
while(<FILE>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\s+/,$line); | |
if (substr($line,0,6) eq 'Family') # Family TF101001 : DM CYCB has GO id GO:0000910 | |
{ | |
$family = $temp[1]; # eg. TF101001 | |
$species = $temp[3]; # eg. DM | |
$goid = $temp[8]; # eg. GO:0000910 | |
$name = $temp[9]; # eg. CG3510-RA.3 | |
if ($TAKE->{$family}) | |
{ | |
if (!($GOID{$name})) { $GOID{$name} = $goid;} | |
else {$GOID{$name} = $GOID{$name}.",".$goid;} | |
if ($VERBOSE2 == 1) | |
{ | |
print "Family $family gene $name GO $GOID{$name}\n"; | |
} | |
if (!($SEEN{$family})) | |
{ | |
$SEEN{$family} = 1; | |
@families = (@families,$family); | |
} | |
} | |
} | |
} | |
close(FILE); | |
return(\%GOID,\@families); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment