Created
March 1, 2013 16:19
-
-
Save avrilcoghlan/5065737 to your computer and use it in GitHub Desktop.
Perl script to read a Newick tree file from TreeFam, and print out the orthologs of a particular input gene based on the tree.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
#---------------------------------------------------------------------------------------------# | |
# | |
# A Perl script to read a Newick tree file from Treefam, and | |
# print out the orthologues of a particular input gene. | |
# | |
# Avril Coghlan. 6-Dec-04. | |
# [email protected] | |
# | |
# For the Treefam project. | |
# | |
# The command-line format id: | |
# % perl <get_orths_from_newick_tree.pl> <tree> <mygene> | |
# where tree is the tree file, | |
# mygene is the gene of interest. | |
# | |
#---------------------------------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 2) | |
{ | |
print "Usage of get_orths_from_newick_tree.pl\n\n"; | |
print "perl get_orths_from_newick_tree.pl <tree> <mygene>\n"; | |
print "where <tree> is the tree file,\n"; | |
print " <mygene> is the gene of interest.\n"; | |
print "For example, >perl get_orths_from_newick_tree.pl /nfs/treefam/work/families/TF300307/full.nhx\n"; | |
print "KO1C8.10.1_CAEEL\n"; | |
exit; | |
} | |
# FIND THE NAME OF THE INPUT TREE FILE: | |
$tree_file = $ARGV[0]; | |
# FIND THE NAME OF THE GENE OF INTEREST: | |
$mygene = $ARGV[1]; | |
#------------------------------------------------------------------# | |
# PARAMETERS: | |
$PRINT_BRACKET_DETAILS = 0; # PRINTS OUT EXTRA DETAILS OF THE RUN. | |
$PRINT_FORK_DETAILS = 0; | |
# READ IN THE NEWICK TREE: | |
$tree = read_newick_tree($tree_file); | |
# FIND THE BIFURCATIONS IN THE TREE: | |
&find_tree_bifurcations($tree); | |
# GO THROUGH EACH OF THE BIFURCATIONS AND SEE WHICH BRANCHES THEY SPLIT INTO. | |
# EACH BIFURCATION IS A NODE WHICH SPLITS TO TWO THINGS, EITHER TWO MORE | |
# NODES, OR ONE SEQUENCE AND ONE NODE, OR TWO SEQUENCES. | |
# EACH NODE OR SEQUENCE HAS A PARENT NODE FROM WHICH IT SPLIT OFF, | |
# AND EACH PARENT NODE HAS TWO CHILD NODES. | |
&find_what_nodes_split_into; | |
# FIND THE ORTHOLOGUES FOR EACH SEQUENCE IN THE TREE: | |
&find_orthologues; | |
print STDERR "FINISHED.\n"; | |
#---------------------------------------------------------------------------------------------# | |
sub find_connecting_nodes | |
{ | |
# FIND THE NODES THAT CONNECT TO A CERTAIN INPUT NODE: | |
my $node = $_[0]; | |
my @temp; | |
my @nodes; | |
my $connecting; | |
if ($PARENT{$node}) | |
{ | |
$connecting = $PARENT{$node}; | |
if ($connecting ne $node && !($SEEN{$connecting})) | |
{ | |
@nodes = (@nodes,$connecting); | |
} | |
} | |
if ($node =~ /NODE/) { @temp = split(/NODE/,$node); $node = $temp[1];} | |
if ($SPLITS_INTO{$node}) | |
{ | |
$connecting = $SPLITS_INTO{$node}; | |
@temp = split(/\,/,$connecting); | |
if ($temp[0] ne $node && !($SEEN{$temp[0]})) | |
{ | |
@nodes = (@nodes,$temp[0]); | |
} | |
if ($temp[1] ne $node && !($SEEN{$temp[1]})) | |
{ | |
@nodes = (@nodes,$temp[1]); | |
} | |
} | |
return @nodes; | |
} | |
#---------------------------------------------------------------------------------------------# | |
sub check_if_node_above_outgroup | |
{ | |
# CHECK IF A CERTAIN NODE IS ABOVE THE OUTGROUP: | |
my $node = $_[0]; | |
my @seqs; | |
my $no_seqs; | |
my $i; | |
my $node_above_outgroup = 0; | |
# FIND THE SEQUENCES IN THE CLADE BELOW $node: | |
@seqs = find_seqs_in_clade($node); | |
$no_seqs = $#seqs + 1; | |
for ($i = 0; $i < $no_seqs; $i++) | |
{ | |
if ($seqs[$i] =~ /YEAST/ || $seqs[$i] =~ /SCHPO/ || $seqs[$i] =~ /ARATH/) | |
{ | |
$node_above_outgroup = 1; | |
} | |
} | |
return $node_above_outgroup; | |
} | |
#---------------------------------------------------------------------------------------------# | |
sub check_if_node_is_outgroup | |
{ | |
# CHECK IF A CERTAIN NODE IS ONLY ABOVE OUTGROUP SEQUENCES: | |
my $node = $_[0]; | |
my @seqs; | |
my $no_seqs; | |
my $i; | |
my $node_is_outgroup = 1; | |
# FIND THE SEQUENCES IN THE CLADE BELOW $node: | |
@seqs = find_seqs_in_clade($node); | |
$no_seqs = $#seqs + 1; | |
for ($i = 0; $i < $no_seqs; $i++) | |
{ | |
if (!($seqs[$i] =~ /YEAST/ || $seqs[$i] =~ /SCHPO/ || $seqs[$i] =~ /ARATH/)) | |
{ | |
$node_is_outgroup = 0; | |
} | |
} | |
return $node_is_outgroup; | |
} | |
#---------------------------------------------------------------------------------------------# | |
sub find_orthologues | |
{ | |
# GO THROUGH EACH SEQUENCE IN THE TREE AND FIND ITS ORTHOLOGUES: | |
my $no_sequences = $#sequences + 1; | |
my $i; | |
my $sequence; | |
my @nodes; | |
my $node; | |
my $found_outgroup; | |
my $no_nodes; | |
my $node1; | |
my $node2; | |
my $node1_above_outgroup; | |
my $node2_above_outgroup; | |
my $node1_is_outgroup; | |
my $node2_is_outgroup; | |
my @seqs; | |
my $no_seqs; | |
my $j; | |
my $found_orth; | |
for ($i = 0; $i < $no_sequences; $i++) | |
{ | |
$sequence = $sequences[$i]; | |
$found_orth = 0; | |
if ($sequence ne $mygene) { goto FOUND_OUTGROUP;} | |
if ($sequence =~ /\_YEAST/ || $sequence =~ /\_SCHPO/ || $sequence =~ /\_ARATH/) { goto FOUND_OUTGROUP;} | |
print "----------------------------------------------------------\n"; | |
print "FINDING ORTHOLOGUES OF $sequence ...\n"; | |
# FIND THE ORTHOLOGUES FOR $sequence IN THE TREE, BY MOVING UP THE TREE | |
# NODE BY NODE, UPWARDS FROM $sequence: | |
$node = $sequence; | |
$found_outgroup = 0; | |
%SEEN = (); | |
while($found_outgroup == 0) | |
{ | |
# FIND WHICH NODES ARE CONNECTED TO NODE $node; | |
$SEEN{$node} = 1; | |
@nodes = find_connecting_nodes($node); | |
# FIND WHAT SEQUENCES ARE BELOW EACH NODE THAT CONNECTS TO $node: | |
$no_nodes = $#nodes + 1; | |
if ($no_nodes == 1) | |
{ | |
$node = $nodes[0]; | |
} | |
elsif ($no_nodes == 2) | |
{ | |
# WE WANT TO CHECK WHETHER EACH OF THE NODES BELOW $node CONTAINS | |
# THE OUTGROUP OR NOT: | |
$node1 = $nodes[0]; | |
$node2 = $nodes[1]; | |
# CHECK IF $node1 OR $node2 CONTAINS ONLY OUTGROUP SEQUENCES: | |
$node1_is_outgroup = check_if_node_is_outgroup($node1); | |
$node2_is_outgroup = check_if_node_is_outgroup($node2); | |
# CHECK IF $node1 OR $node2 IS ABOVE THE OUTGROUP: | |
$node1_above_outgroup = check_if_node_above_outgroup($node1); | |
$node2_above_outgroup = check_if_node_above_outgroup($node2); | |
if ($node1_above_outgroup == 1 && $node2_above_outgroup == 0) | |
{ | |
# ALL THE SEQUENCES UNDER $node2 ARE ORTHOLOGUES OF $sequence: | |
$found_orth = 1; | |
@seqs = find_seqs_in_clade($node2); | |
$no_seqs = $#seqs + 1; | |
for ($j = 0; $j < $no_seqs; $j++) { print "$sequence ---> $seqs[$j]\n";} | |
$node = $node1; | |
} | |
elsif ($node1_above_outgroup == 0 && $node2_above_outgroup == 1) | |
{ | |
# ALL THE SEQUENCES UNDER $node1 ARE ORTHOLOGUES OF $sequence: | |
$found_orth = 1; | |
@seqs = find_seqs_in_clade($node1); | |
$no_seqs = $#seqs + 1; | |
for ($j = 0; $j < $no_seqs; $j++) { print "$sequence ---> $seqs[$j]\n";} | |
$node = $node2; | |
} | |
elsif ($node1_above_outgroup == 1 && $node2_above_outgroup == 1) | |
{ | |
# THE TREE HAS MORE THAN ONE OUTGROUP: | |
$found_outgroup = 1; | |
goto FOUND_OUTGROUP; | |
} | |
else | |
{ | |
# xxx NO OUTGROUP IN THE TREE - WE NEED TO FIX THIS. | |
print STDERR "ERROR: node1_above $node1_above_outgroup node2_above $node2_above_outgroup.\n"; | |
exit; | |
} | |
if ($node1_is_outgroup == 1 || $node2_is_outgroup == 1) { $found_outgroup = 1; goto FOUND_OUTGROUP;} | |
} | |
elsif ($no_nodes == 3) | |
{ | |
print STDERR "ERROR: 3 nodes below $node.\n"; exit; | |
} | |
else | |
{ | |
# IF THERE IS NO OUTGROUP IN THE TREE - WE NEED TO FIX THIS xxx | |
print STDERR "ERROR: $no_nodes below $node.\n"; exit; | |
} | |
} | |
FOUND_OUTGROUP: | |
if ($found_orth == 0 && $sequence eq $mygene) { print "Did not find any orthologue for $sequence.\n";} | |
if ($i == $#sequences) { goto END;} | |
} | |
END: | |
} | |
#---------------------------------------------------------------------------------------------# | |
sub find_seqs_in_clade | |
{ | |
my $node = $_[0]; | |
my @temp = split(/\s+/,$node); | |
if ($#temp > 0) { $node = $temp[1];} | |
my @nodes; | |
my $no_nodes; | |
my $i; | |
my $j; | |
my @seqs; | |
my @new_nodes; | |
my $no_new_nodes; | |
my $added = 0; | |
@nodes = (@nodes,$node); | |
my %CHECKED = (); | |
# CHECK IF $node IS A SEQUENCE: | |
if (!($node =~ /NODE/)) | |
{ | |
@seqs = (@seqs,$node); | |
return @seqs; | |
} | |
FIND_SEQS_UNDER_NODES: | |
$no_nodes = $#nodes + 1; | |
$added = 0; | |
for ($i = 0; $i < $no_nodes; $i++) | |
{ | |
# FIND ALL THE NODES THAT $nodes[$i] CONNECTS TO: | |
if ($CHECKED{$nodes[$i]}) { goto NEXT_NODEI;} | |
@new_nodes = find_connecting_nodes($nodes[$i]); | |
# GO THROUGH ALL THE NODES THAT CONNECT TO $nodes[$i]: | |
$no_new_nodes = $#new_nodes + 1; | |
for ($j = 0; $j < $no_new_nodes; $j++) | |
{ | |
if (!($new_nodes[$j] =~ /NODE/)) | |
{ | |
@seqs = (@seqs,$new_nodes[$j]); | |
} | |
else | |
{ | |
@nodes = (@nodes,$new_nodes[$j]); | |
$added = 1; | |
} | |
} | |
$CHECKED{$nodes[$i]} = 1; | |
NEXT_NODEI: | |
} | |
if ($added == 1) { goto FIND_SEQS_UNDER_NODES;} | |
return @seqs; | |
} | |
#---------------------------------------------------------------------------------------------# | |
# WE HAVE FOUND A COMMA IN THE NEWICK TREE. NOW LOOK TO THE RIGHT FOR THE NEAREST ) AND TO | |
# THE LEFT FOR THE NEAREST ( AND FIND THE REGION (CONTAINING ,) BRACKETED BY (). | |
sub find_bracketed_region | |
{ | |
my $i = $_[0]; # POSITION OF THE COMMA OF INTEREST. | |
my $len = $_[1]; # LENGTH OF THE WHOLE TREE. | |
my $tree = $_[2]; | |
my $j; | |
my $char; | |
my $bracket_start; | |
my $bracket_end; | |
my $bracket_len; | |
my $bracketed; | |
my $num_brackets; | |
my $x; | |
my $look_as_far; | |
my $bracket_num; | |
# THE INDEX OF THE COMMA IN THE NEWICK TREE IS CHARACTER $i. | |
#------------------------------------------------------------------------------------------# | |
# LOOK FOR THE INDEX OF THE FIRST ( BEFORE THE COMMA: | |
for ($j = $i; $j >= 0; $j--) | |
{ | |
$char = substr($tree,$j,1); | |
if ($char eq '(' && !($USED_BRACKET{$j})) | |
{ | |
$bracket_start = $j; # THE INDEX OF THE PREVIOUS ( BEFORE THE COMMA. | |
goto FOUND_STARTING_BRACKET; | |
} | |
} | |
FOUND_STARTING_BRACKET: | |
#------------------------------------------------------------------------------------------# | |
# LOOK FOR THE INDEX OF THE FIRST ) AFTER THE COMMA: | |
for ($j = $i; $j < $len; $j++) | |
{ | |
$char = substr($tree,$j,1); | |
if ($char eq ')' && !($USED_BRACKET{$j})) | |
{ | |
$bracket_end = $j; # THE INDEX OF THE NEXT ) AFTER THE COMMA. | |
goto FOUND_ENDING_BRACKET; | |
} | |
} | |
FOUND_ENDING_BRACKET: | |
#------------------------------------------------------------------------------------------# | |
# FIND THE LENGTH OF THE REGION ENCLOSED BY THE PARENTHESES (): | |
# (xxxx) | |
# 123456 | |
# eg., $bracket_len = 6 - 1 + 1 = 6, ie., $bracket_len INCLUDES THE (). | |
$bracket_len = $bracket_end - $bracket_start + 1; | |
if ($bracket_len < 0) | |
{ | |
goto COMMA_NOT_OKAY; | |
} | |
#------------------------------------------------------------------------------------------# | |
# FIND THE REGION ENCLOSED BY THE PARENTHESES: | |
$bracketed = substr($tree,$bracket_start,$bracket_len); | |
# COUNT THE NUMBER OF BRACKETS IN THE BRACKETED REGION: IF THERE ARE MORE )s THAN (s SOMETHING IS WRONG: | |
$num_brackets = 0; | |
for ($x = 0; $x < $bracket_len; $x++) | |
{ | |
$char = substr($bracketed,$x,1); | |
if ($char eq '(') { $num_brackets++;} | |
elsif ($char eq ')') { $num_brackets--;} | |
if ($num_brackets < 0) # THERE ARE MORE )s THAN (s, SOMETHING IS WRONG. | |
{ | |
goto COMMA_NOT_OKAY; | |
} | |
} | |
#------------------------------------------------------------------------------------------# | |
# THE BRACKETED REGION SHOULD NOT CONTAIN ANY BRACKETS THAT WE HAVE NOT ALREADY EXAMINED, AS | |
# WE WANT TO LOOK AT INTERNAL BIFURCATIONS, AND WORK UPWARDS TOWARDS THE ROOT OF THE TREE: | |
$look_as_far = $bracket_len - 1; # LOOK UP TO JUST BEFORE THE END ) BRACKET. | |
for ($x = 1; $x < $look_as_far; $x++) | |
{ | |
$char = substr($bracketed,$x,1); | |
if ($char eq '(' || $char eq ')') | |
{ | |
# FIND THE INDEX OF THIS BRACKET IN THE WHOLE TREE: | |
$bracket_num = $x + $bracket_start; | |
if (!($USED_BRACKET{$bracket_num})) # THERE IS AN INTERNAL BRACKET THAT HAS NOT YET BEEN EXAMINED. | |
{ | |
goto COMMA_NOT_OKAY; | |
} | |
} | |
} | |
#------------------------------------------------------------------------------------------# | |
# IF THERE IS AN EVEN NUMBER OF BRACKETS AND WE HAVE NOT ALREADY GOT THIS BRACKETED REGION, | |
# THEN ADD THIS BRACKETED REGION INTO OUR ARRAY @bracketeds OF BRACKETED REGIONS: | |
if (!($HAVE_BRACKETED{$bracketed}) && $num_brackets == 0) | |
{ | |
if ($PRINT_BRACKET_DETAILS == 1) | |
{ | |
print "The bracketed bit is $bracketed.\n"; | |
} | |
$USED_BRACKET{$bracket_start} = 1; | |
$USED_BRACKET{$bracket_end} = 1; | |
$USED_COMMA{$i} = 1; | |
$HAVE_BRACKETED{$bracketed} = 1; | |
(@bracketeds) = (@bracketeds,$bracketed); | |
} | |
COMMA_NOT_OKAY: | |
} | |
#---------------------------------------------------------------------------------------------# | |
# NIBBLE FORWARD AND BACK TO THE FIRST INSTANCES OF THE STRING $thechar. | |
sub nibble_to_characters | |
{ | |
my $name = $_[0]; | |
my $thechar = $_[1]; | |
if ($name =~ /$thechar/) | |
{ | |
$name =~ s/$thechar//g; | |
} | |
return $name; | |
} | |
#---------------------------------------------------------------------------------------------# | |
# TAKE OFF STARTING OR ENDING SINGLE CHARACTERS OF $thechar. | |
sub nibble_off_characters | |
{ | |
my $name = $_[0]; | |
my $thechar = $_[1]; | |
my $len; | |
my $index; | |
my $x; | |
my $char; | |
$len = length($name); | |
# TAKE OFF DASHES AT THE START: | |
$index = 0; | |
for ($x = 0; $x < $len; $x++) | |
{ | |
$char= substr($name,$x,1); | |
if ($char eq $thechar) { $index = $x + 1; } | |
else { goto FOUND_START;} | |
} | |
FOUND_START: | |
$name = substr($name,$index,$len-$index); | |
# TAKE OFF DASHES AT THE END: | |
$len = length($name); | |
$index = $len; | |
for ($x = $len-1; $x >= 0; $x--) | |
{ | |
$char= substr($name,$x,1); | |
if ($char eq $thechar) { $index = $x-1; } | |
else { goto FOUND_END; } | |
} | |
FOUND_END: | |
$name = substr($name,0,$index); | |
return $name; | |
} | |
#---------------------------------------------------------------------------------------------# | |
# GO THROUGH EACH OF THE BIFURCATIONS AND SEE WHICH BRANCHES THEY SPLIT INTO. | |
# EACH BIFURCATION IS A NODE WHICH SPLITS TO TWO THINGS, EITHER TWO MORE | |
# NODES, OR ONE SEQUENCE AND ONE NODE, OR TWO SEQUENCES. EACH NODE OR SEQUENCE | |
# HAS A PARENT NODE FROM WHICH IT SPLIT OFF, AND EACH PARENT NODE HAS TWO CHILD NODES. | |
sub find_what_nodes_split_into | |
{ | |
%SPLITS_INTO = (); # HASH TABLE TO RECORD WHICH NODES A NODE SPLITS INTO. | |
%PARENT = (); # HASH TABLE TO RECORD THE PARENT NODE OF EACH NODE. | |
%SISTER = (); # HASH TABLE TO RECORD THE SISTER NODE OF EACH NODE. | |
my %HAVE = (); | |
my %CNT = (); | |
%BRANCHLENGTH = (); # THE BRANCHLENGTH. | |
my $i; | |
my $num_bracketeds; | |
my $no_loops; | |
my $bracketed; | |
my @temp; | |
my $num_things; | |
my $first; | |
my $second; | |
my $branchlength; | |
my $last_letter; | |
my $first_one; | |
my $second_one; | |
my $inside_node; | |
my $inside; | |
my $x; | |
my $y; | |
my $start; | |
my $len; | |
my $j; | |
$num_bracketeds = $#bracketeds + 1; | |
$no_loops = 0; | |
for ($i = $num_bracketeds - 1; $i > -1; $i--) | |
{ | |
if ($PRINT_FORK_DETAILS == 1) {print "=========================================== \n";} | |
$bracketed = $bracketeds[$i]; | |
#---------------------------------------------------------------------------------------# | |
# SEE WHICH TWO THINGS THIS NODE SPLITS INTO: | |
RETRY: | |
$no_loops++; | |
if ($no_loops > 500) { print "ERROR: $tree_file: have looped $no_loops times now.\n"; exit;} | |
@temp = split(/,/,$bracketed); | |
$num_things = $#temp + 1; | |
if ($num_things == 2) # THIS NODE SPLITS UP INTO TWO SEQUENCES: | |
{ | |
$first = $temp[0]; | |
$second = $temp[1]; | |
$first = nibble_off_characters($first,"("); | |
$first = nibble_off_characters($first,"_"); | |
@temp = split(/:/,$first); | |
if ($temp[0] =~ /[A-Z]/ || $temp[0] =~ /[a-z]/) { $first = $temp[0]; $branchlength = $temp[1];} | |
elsif ($temp[1] =~ /[A-Z]/ || $temp[1] =~ /[a-z]/) { $first = $temp[1]; $branchlength = $temp[0];} | |
$first =~ tr/[a-z]/[A-Z]/; | |
$last_letter = substr($first,length($first)-1,1); | |
if ($last_letter =~ /[A-Z]/ && !($first =~ /_/)) { chop($first);} | |
@temp = split(/\s+/,$first); | |
if ($#temp > 0) { $first = $temp[1];} | |
# TAKE OFF ENDING CHARACTERS THAT ARE NOT NUMBERS: | |
if ($first =~ /NODE/) | |
{ | |
$first = nibble_off_letters($first); | |
} | |
if ($HAVE{$first}) { if (!($CNT{$first})){$CNT{$first} = 2;} else { $CNT{$first}++;} $first = $first.$CNT{$first};} | |
$HAVE{$first}= 1; | |
$BRANCHLENGTH{$first}= $branchlength; | |
$second = nibble_off_characters($second,")"); | |
$second = nibble_off_characters($second,"_"); | |
@temp = split(/:/,$second); | |
if ($temp[0] =~ /[A-Z]/ || $temp[0] =~ /[a-z]/) { $second = $temp[0]; $branchlength = $temp[1];} | |
elsif ($temp[1] =~ /[A-Z]/ || $temp[1] =~ /[a-z]/) { $second = $temp[1]; $branchlength = $temp[0];} | |
$second =~ tr/[a-z]/[A-Z]/; | |
$last_letter = substr($second,length($second)-1,1); | |
if ($last_letter =~ /[A-Z]/ && !($second =~ /_/)) { chop($second);} | |
@temp = split(/\s+/,$second); | |
if ($#temp > 0) { $second = $temp[1];} | |
# TAKE OFF ENDING CHARACTERS THAT ARE NOT NUMBERS: | |
if ($second =~ /NODE/) | |
{ | |
$second = nibble_off_letters($second); | |
} | |
if ($HAVE{$second}) { if (!($CNT{$second})){$CNT{$second} = 2;} else { $CNT{$second}++;} $second = $second.$CNT{$second};} | |
$HAVE{$second} = 1; | |
$BRANCHLENGTH{$second} = $branchlength; | |
$first_one = $first; | |
$second_one = $second; | |
$SPLITS_INTO{$i} = $first_one.",".$second_one; # NODE $i SPLITS INTO $first_one AND $second_one. | |
$PARENT{$first_one} = "NODE".$i; | |
$PARENT{$second_one} = "NODE".$i; | |
$SISTER{$first_one} = $second_one; | |
$SISTER{$second_one} = $first_one; | |
$BRANCHLENGTH{$first_one} = $BRANCHLENGTH{$first}; | |
$BRANCHLENGTH{$second_one}= $BRANCHLENGTH{$second}; | |
if ($PRINT_FORK_DETAILS == 1) | |
{ | |
print "Node i $i ($PARENT{$first_one}) splits into $first_one ($BRANCHLENGTH{$first_one}) and $second_one ($BRANCHLENGTH{$second_one})\n"; | |
} | |
#---------------------------------------------------------------------------------------------# | |
# CHECK WHETHER THE NODE SPLITS INTO SEQUENCES OR INTO MORE NODES: | |
if (!($first =~ /node/) && !($first =~ /NODE/)) | |
{ | |
@sequences = (@sequences,$first); | |
} | |
if (!($second =~ /node/) && !($second =~ /NODE/)) | |
{ | |
@sequences = (@sequences,$second); | |
} | |
} # END OF IF THIS NODE SPLITS UP INTO TWO SEQUENCES. | |
else # THERE ARE OTHER NODES INSIDE THIS NODE. | |
{ | |
# CHECK IF ANY OTHER NODES ARE INSIDE THIS NODE, AND IF THERE ARE, | |
# DELETE THEM FROM THE OUTER BRACKETED REGION. | |
for ($j = $num_bracketeds-1; $j >=0; $j--) | |
{ | |
if ($j != $i) | |
{ | |
$inside_node= $bracketeds[$j]; | |
# CHECK WHETHER $inside_node IS WITHIN $bracketed: | |
$inside = index($bracketed,$inside_node); | |
if ($inside != -1) # $inside_node IS WITHIN $bracketed: | |
{ | |
# FIND THE LENGTH OF $inside_node (WE CANNOT USE THE length() FUNCTION AS BRACKETS CONFUSE IT): | |
$x = 0; | |
while(substr($inside_node,$x,1) ne '') | |
{ | |
$x++; | |
} | |
# FIND THE LENGTH OF $bracketed: | |
$y = 0; | |
while(substr($bracketed,$y,1) ne '') | |
{ | |
$y++; | |
} | |
# FIND THE START OF $inside_node IN $bracketed: | |
$start = $x + $inside; | |
# FIND THE LENGTH OF $inside_node IN $bracketed: | |
$len = $y - $x - ($inside - 1); | |
# MAKE $bracketed BE A SUBSTRING OF ITSELF WITH $inside_node DELETED: | |
$bracketed = substr($bracketed,0,$inside)."NODE$j".substr($bracketed,$start,$len); | |
} | |
} | |
} | |
goto RETRY; | |
} # END OF IF THERE ARE OTHER NODES INSIDE THIS NODE. | |
if ($i == 0) { goto JUMP_OUT_OF_LOOP;} | |
} # END OF LOOKING THROUGH EACH OF THE BIFURCATIONS TO SEE WHAT THEY SPLIT INTO. | |
JUMP_OUT_OF_LOOP: | |
} | |
#---------------------------------------------------------------------------------------------# | |
sub nibble_off_letters | |
{ | |
# REMOVE ANY LETTERS AT THE END OF A STRING: | |
my $string = $_[0]; | |
my $new = ""; | |
my @temp; | |
my $i; | |
my $char; | |
@temp = split(/NODE/,$string); | |
$string = $temp[1]; | |
$length = length($string); | |
for ($i = 1; $i <= $length; $i++) | |
{ | |
$char = substr($string,$i-1,1); | |
if ($char =~ /[0-9]/) | |
{ | |
$new = $new.$char; | |
} | |
else | |
{ | |
$new = "NODE".$new; | |
return $new; | |
} | |
} | |
$new = "NODE".$new; | |
return $new; | |
} | |
#---------------------------------------------------------------------------------------------# | |
# READ IN THE NEWICK TREE: | |
sub read_newick_tree | |
{ | |
my $tree_file = $_[0]; | |
my $line; | |
my $tree; | |
open(TREE,"$tree_file") || die "Cannot open $tree_file"; | |
$tree = ''; | |
while(<TREE>) | |
{ | |
$line = $_; | |
chomp($line); | |
$tree = $tree.$line; | |
} | |
close(TREE); | |
return $tree; | |
} | |
#---------------------------------------------------------------------------------------------# | |
# FIND THE BIFURCATIONS IN THE TREE: | |
sub find_tree_bifurcations | |
{ | |
# THE TREE IS STORED IN A VARIABLE $tree: | |
my $tree = $_[0]; | |
my $len; | |
my $char; | |
my $round; | |
%USED_BRACKET = (); # HASH TABLE TO SAY WHETHER WE HAVE EXAMINED A PARTICULAR BRACKET YET. | |
%USED_COMMA = (); # HASH TABLE TO SAY WHETHER WE HAVE EXAMINED A PARTICULAR COMMA YET. | |
%HAVE_BRACKETED = (); | |
my $i; | |
$len = length($tree); | |
$char = ''; | |
$round = 0; | |
#------------------------------------------------------------------------------------------# | |
RESTART: | |
$round++; | |
if ($PRINT_BRACKET_DETAILS == 1) | |
{ | |
print "\n======================================================\n"; | |
print "ROUND $round of finding tree bifurcations:\n\n"; | |
} | |
if ($round > 100) { print STDERR "ERROR: $tree_file: have reached round $round.\n\n"; exit;} | |
# GO THROUGH THE TREE TO FIND COMMAS, AS COMMAS ARE BIFURCATIONS IN THE TREE. WE JUST | |
# GO THROUGH THE TREE FROM LEFT TO RIGHT: | |
for ($i = 0; $i < $len; $i++) | |
{ | |
$char = substr($tree,$i,1); | |
if ($char eq ',') # THERE IS A BIFURCATION HERE. | |
{ | |
# LOOK TO THE RIGHT OF THE , FOR THE NEAREST ) AND TO THE LEFT OF THE | |
# , FOR THE NEAREST ( AND FIND THE REGION BRACKETED BY THESE BRACKETS, ie., (reg,ion), WHERE | |
# THE REGION CONTAINS THE COMMA: | |
if (!($USED_COMMA{$i})) # IF WE HAVE NOT FOUND THE REGION AROUND THIS COMMA SO FAR: | |
{ | |
&find_bracketed_region($i,$len,$tree); | |
} | |
} | |
} | |
#..........................................................................................# | |
# NOW WE HAVE LOOKED THROUGH THE TREE FROM LEFT TO RIGHT LOOKING FOR COMMAS. WE MAY NOT | |
# HAVE FULLY EXAMINED ALL COMMAS YET HOWEVER, AS SOME MAY HAVE BEEN REJECTED IN THE | |
# SUBROUTINE find_bracketed_region. SO NOW WE SEARCH FOR ANY COMMAS (BIFURCATIONS) LEFT | |
# UNTIL ALL COMMAS HAVE BEEN ANALYSED: | |
for ($i = 0; $i < $len; $i++) | |
{ | |
$char = substr($tree,$i,1); | |
if ($char eq ')' || $char eq '(') | |
{ | |
if (!($USED_BRACKET{$i})) # THERE IS A BRACKET (AND SO A COMMA) WHICH HAS NOT BE ANALYSED YET. | |
{ | |
goto RESTART; | |
} | |
} | |
} | |
#------------------------------------------------------------------------------------------# | |
} | |
#---------------------------------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment