Created
March 1, 2013 15:12
-
-
Save avrilcoghlan/5065269 to your computer and use it in GitHub Desktop.
Perl script that, given a list of Caenorhabditis elegans paralog pairs, uses the TreeFam tree that they are in to calculate information about the paralogs and the tree
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/perl | |
# | |
# Perl script count_worm_paralogs2b.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 1-Oct-08. | |
# | |
# This perl script reads in the trees in TreeFam-6 that a pair of | |
# worm paralogs are in, and calculates information about the paralogs | |
# and the tree. | |
# | |
# The command-line format is: | |
# % perl <count_worm_paralogs2.pl> <list> <taxa> | |
# where <list> is the list of paralog pairs, | |
# <taxa> is the list of taxa in which to count duplications/speciations. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 2) | |
{ | |
print "Usage of count_worm_paralogs2b.pl\n\n"; | |
print "perl count_worm_paralogs2b.pl <list> <taxa>\n"; | |
print "where <list> is the list of paralog pairs,\n"; | |
print " <taxa> is the list of taxa in which to count duplications/speciations\n"; | |
print "For example, >perl count_worm_paralogs2b.pl ancient_paralogs treefam56_species\n"; | |
exit; | |
} | |
# READ IN MY PERL MODULES: | |
BEGIN { | |
unshift (@INC, '/nfs/team71/phd/alc/perl/modules'); | |
} | |
use lib "/nfs/team71/phd/alc/perl/treefam/Treefam_api_3may07/"; # xxx make sure it points to treefam-6 | |
use Avril_modules; | |
use Treefam::DBConnection; | |
use DBI; | |
# FIND THE LIST OF PARALOG PAIRS: | |
$list = $ARGV[0]; | |
# FIND THE LIST OF TAXA IN WHICH TO COUNT SPECIATIONS/DUPLICATIONS: | |
$taxa = $ARGV[1]; | |
#------------------------------------------------------------------# | |
# IF $VERBOSE == 1, PRINT OUT EXTRA INFORMATION: | |
$VERBOSE = 0; | |
#------------------------------------------------------------------# | |
$version = "6"; # xxx | |
$db = "dbi:mysql:treefam_".$version.":db.treefam.org:3308"; | |
$dbh = DBI->connect("$db", 'anonymous', '') || return; | |
# READ IN THE LIST OF TAXA IN WHICH TO COUNT SPECIATIONS/DUPLICATIONS: | |
%TAKE = (); | |
open(TAXA,"$taxa") || die "ERROR: cannot open $taxa\n"; | |
while(<TAXA>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/_/,$line); | |
$taxid = $temp[0]; # eg. 10090 | |
$taxname = $temp[1]; # eg. Mus musculus | |
# SEE IF THERE IS A SWCODE FOR THIS $taxid: | |
$table_w = 'species'; | |
$st = "SELECT SWCODE from $table_w WHERE TAX_ID=?"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute($taxid) or die "Cannot execute the query: $sth->errstr"; | |
$SWCODE = "none"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$SWCODE = $array[0]; | |
} | |
} | |
$taxname =~ tr/[a-z]/[A-Z]/; | |
$SWCODE =~ tr/[a-z]/[A-Z]/; | |
# WE WANT TO IGNORE SPECIATIONS GIVING RISE TO THE DROSOPHILA SPECIES OR PLATYPUS, SINCE THESE | |
# SPECIATIONS ARE NOT SHARED BETWEEN TREEFAM5/6. | |
# KEEP THERIA, DISCARD MAMMALIA. THIS IS BECAUSE THE BRANCH OFF TO PLATYPUS IS USUALLY MAMMALIA. | |
# SOPHOPHORA AND DROSOPHILA SPECIATIONS OCCUR IN EXTRA SPECIATIONS ONLY FOUND IN TREEFAM-6. | |
if ($SWCODE eq 'NONE' && $taxname ne 'SOPHOPHORA' && $taxname ne 'MAMMALIA' && $taxname ne 'DROSOPHILA') | |
{ | |
$TAKE{$taxname} = 1; # eg. HOMO/PAN/GORILLA GROUP | |
@temp = split(/\s+/,$taxname); | |
$taxname = $temp[0]; # eg. HOMO/PAN/GORILLA | |
$TAKE{$taxname} = 1; | |
} | |
else | |
{ | |
$TAKE{$SWCODE} = 1; | |
} | |
} | |
close(TAXA); | |
#------------------------------------------------------------------# | |
# READ IN THE GENES IN THE LIST OF PARALOG PAIRS: | |
%SEEN_GENE = (); | |
open(LIST,"$list") || die "ERROR: cannot open $list\n"; | |
while(<LIST>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\s+/,$line); | |
$gene1 = $temp[0]; | |
$gene2 = $temp[1]; | |
if (!($SEEN_GENE{$gene1})) { @genes = (@genes,$gene1); $SEEN_GENE{$gene1} = 1;} | |
if (!($SEEN_GENE{$gene2})) { @genes = (@genes,$gene2); $SEEN_GENE{$gene2} = 1;} | |
} | |
close(LIST); | |
$dbh = DBI->connect("dbi:mysql:treefam_6:db.treefam.org:3308", 'anonymous', '') || return; | |
#------------------------------------------------------------------# | |
%GENES = (); # HASH TABLE TO RECORD THE PARALOGS THAT FAMILIES CONTAIN. | |
%FAMILY = (); | |
%GID = (); | |
# FOR EACH GENE IN THE LIST OF GENES, FIND OUT WHAT FAMILY IT IS IN: | |
for ($i = 0; $i <= $#genes; $i++) | |
{ | |
$gene = $genes[$i]; | |
# FIND OUT WHAT IS THE TREEFAM ID FOR THIS GENE: | |
$table_w = "genes"; | |
$st = "SELECT ID,GID from $table_w WHERE TID=?"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute($gene) or die "Cannot execute the query: $sth->errstr"; | |
$ID = "none"; | |
$GID = "none"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$ID = $array[0]; | |
$GID = $array[1]; | |
} | |
} | |
# SOME CASES ARE STORED IN TREEFAM-6 UNDER ISOFORM NAMES: | |
if ($gene eq 'F22D6.3') { $ID = 'F22D6.3a'; $GID = 'WBGene00003815';} | |
elsif ($gene eq 'C24B5.2') { $ID = 'C24B5.2a'; $GID = 'WBGene00016045';} | |
elsif ($gene eq 'Y47D9A.1'){ $ID = 'Y47D9A.1a'; $GID = 'WBGene00021628';} | |
elsif ($gene eq 'F25B5.4') { $ID = 'F25B5.4a'; $GID = 'WBGene00006727';} | |
elsif ($gene eq 'F54H12.1'){ $ID = 'F54H12.1a'; $GID = 'WBGene00000041';} | |
elsif ($gene eq 'Y77E11A.13'){$ID= 'Y77E11A.13a'; $GID = 'WBGene00003806';} | |
elsif ($gene eq 'Y17G7B.5'){ $ID = 'Y17G7B.5a'; $GID = 'WBGene00003154';} | |
elsif ($gene eq 'T10B5.5') { $ID = 'T10B5.5a'; $GID = 'WBGene00020391';} | |
elsif ($gene eq 'C07G2.3') { $ID = 'C07G2.3a'; $GID = 'WBGene00000380';} | |
elsif ($gene eq 'T19A6.2') { $ID = 'T19A6.2a'; $GID = 'WBGene00003596';} | |
if ($ID eq 'none' || $GID eq 'none') { print STDERR "ERROR: did not find ID for gene $gene\n"; exit;} | |
if ($GID{$gene}) { print STDERR "ERROR: already have GID for $gene\n"; exit;} | |
$GID{$gene} = $GID; | |
# FIND OUT WHAT FAMILY THIS GENE BELONGS TO: | |
$table_w = "fam_genes"; | |
$st = "SELECT AC from $table_w WHERE ID=?"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute($ID) or die "Cannot execute the query: $sth->errstr"; | |
$ID = "none"; | |
$the_AC = "none"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$AC = $array[0]; | |
if ($the_AC eq 'none') { $the_AC = $AC;} | |
elsif ($the_AC ne 'none') | |
{ | |
if ($AC =~ /TF1/ && ($the_AC =~ /TF3/ || $the_AC =~ /TF5/)) { $the_AC = $AC;} | |
elsif ($AC =~ /TF3/ && $the_AC =~ /TF5/) { $the_AC = $AC;} | |
} | |
} | |
} | |
if ($the_AC eq 'none') { print STDERR "ERROR: did not find family for gene $gene ID $ID\n"; exit;} | |
# RECORD THE PARALOGS IN THIS FAMILY: | |
if ($GID eq 'none') { print STDERR "ERROR: gene $gene GID $GID\n"; exit;} | |
if (!($GENES{$the_AC})) { $GENES{$the_AC} = $GID; } | |
else { $GENES{$the_AC} = $GENES{$the_AC}.",".$GID;} | |
# RECORD THE FAMILY FOR THIS GENE: | |
if (!($FAMILY{$gene})) { $FAMILY{$gene} = $the_AC;} | |
else { print STDERR "ERROR: already have family for $gene\n"; exit;} | |
} | |
print "FAMILY SPECIES GENEI GENEJ GENES SPECIESGENES DUPNODES SPECNNODES GENESCLADE SPECIESGENESCLADE DUPNODESCLADE SPECNODESCLADE DUPNODESI SPECNODESI DUPNODESJ SPECNODESJ DUPNODESIROOT SPECNODESIROOT DUPNODESJROOT SPECNODESJROOT SPECIATIONS SPECIATIONSI SPECIATIONSJ SPECIATIONSIROOT SPECIATIONSJROOT\n"; | |
$dbc = Treefam::DBConnection->new(); | |
%NUM_GENES_IN_TREE = (); | |
%NUM_SPECIES_GENES_IN_TREE = (); | |
%NUM_DUP_NODES_IN_TREE = (); | |
%NUM_SPEC_NODES_IN_TREE = (); | |
%SPECIATIONS_IN_TREE = (); | |
%NUM_DUP_NODES_I = (); | |
%NUM_SPEC_NODES_I = (); | |
%SPECIATIONS_I = (); | |
# LOOK AT PARALOGS WITHIN EACH FAMILY: | |
foreach $treefam_family (keys %GENES) | |
{ | |
print STDERR "Looking at family $treefam_family... \n"; | |
$famh = $dbc->get_FamilyHandle(); | |
if (!(defined($famh->get_by_id($treefam_family)))) | |
{ | |
print "WARNING: there is no family $treefam_family...\n"; | |
goto NEXT_FAMILY; | |
} | |
$family = $famh->get_by_id($treefam_family); | |
if (!(defined($family->ID()))) | |
{ | |
print "WARNING: do not have ID stored for family $treefam_family...\n"; | |
goto NEXT_FAMILY; | |
} | |
$AC = $family->ID(); # GET THE FAMILY ID. | |
if ($AC ne $treefam_family) { print STDERR "ERROR: AC $AC treefam_family $treefam_family\n"; exit;} | |
$no_families++; | |
print "\n$no_families: reading family $AC...\n"; | |
# GET THE TREE FOR THE FAMILY: | |
if (!(defined($family->get_tree('clean')))) | |
{ | |
print "WARNING: AC $AC: there is no tree!\n"; | |
goto NEXT_FAMILY; | |
} | |
$tree = $family->get_tree('clean'); # GET THE TREEFAM CLEAN TREE. | |
@nodes = $tree->get_all_nodes(); | |
#---------------------------------------------------------------------------------# | |
# COUNT HOW MANY GENES, AND HOW MANY GENES FROM C. ELEGANS, ARE IN THE WHOLE TREE: | |
$num_genes_in_tree = 0; | |
$num_species_genes_in_tree = 0; | |
%SEEN_GENE = (); | |
foreach $node(@nodes) | |
{ | |
if (defined($node->sequence_id())) # IF IT IS A LEAF. | |
{ | |
# FIND THE SPECIES FOR THIS TRANSCRIPT: | |
if (defined($node->get_tag_value('S'))) | |
{ | |
$species = $node->get_tag_value('S'); | |
} | |
else | |
{ | |
print "WARNING: AC $AC: do not know species of node\n"; | |
$species = "none"; | |
} | |
$species =~ tr/[a-z]/[A-Z]/; | |
# SEE IF THIS IS FROM A SPECIES THAT WE ARE INCLUDING: | |
if ($TAKE{$species}) | |
{ | |
# FIND THE GENE FOR THIS TRANSCRIPT: | |
$gh = $node->gene; | |
$gene = $gh->ID(); # eg. ENSMUSG00000022770 | |
if (!($SEEN_GENE{$gene})) | |
{ | |
$num_genes_in_tree++; | |
if ($species eq 'CAEEL') { $num_species_genes_in_tree++;} | |
$SEEN_GENE{$gene} = 1; | |
} | |
} | |
else | |
{ | |
if ($VERBOSE == 1) { print "treefam_family $treefam_family excluding $species gene\n";} | |
} | |
} | |
} | |
if ($NUM_GENES_IN_TREE{$treefam_family}) { print STDERR "ERROR: already have #genes in $treefam_family\n"; exit;} | |
if ($NUM_SPECIES_GENES_IN_TREE{$treefam_family}) { print STDERR "ERROR: already have #CAEEL genes in $treefam_family\n"; exit;} | |
$NUM_GENES_IN_TREE{$treefam_family} = $num_genes_in_tree; | |
$NUM_SPECIES_GENES_IN_TREE{$treefam_family} = $num_species_genes_in_tree; | |
#---------------------------------------------------------------------------------# | |
# COUNT HOW MANY DUPLICATION/SPECIATION NODES ARE IN THE WHOLE TREE: | |
$num_dup_nodes_in_tree = 0; | |
$num_spec_nodes_in_tree = 0; | |
foreach $node(@nodes) | |
{ | |
if (!(defined($node->sequence_id()))) # IF IT IS NOT A LEAF | |
{ | |
# FIND THE SPECIES FOR THIS NODE: | |
if (defined($node->get_tag_value('S'))) | |
{ | |
$species = $node->get_tag_value('S'); | |
} | |
else | |
{ | |
print "WARNING: treefam_family $treefam_family: do not know species of node\n"; | |
$species = "none"; | |
} | |
$species =~ tr/[a-z]/[A-Z]/; | |
# IF THIS IS A SPECIES WE WANT TO TAKE: | |
if ($TAKE{$species}) | |
{ | |
if (defined($node->get_tag_value('D'))) | |
{ | |
$node_type = $node->get_tag_value('D'); # Y FOR DUPLICATION, N FOR SPECIATION | |
# IF IT IS A DUPLICATION NODE (Y) | |
if ($node_type eq 'Y') { $num_dup_nodes_in_tree++; } | |
elsif ($node_type eq 'N') # IT IS A SPECIATION EVENT: | |
{ | |
$num_spec_nodes_in_tree++; | |
if (!($SPECIATIONS_IN_TREE{$treefam_family})) | |
{ | |
$SPECIATIONS_IN_TREE{$treefam_family} = $species; | |
} | |
else | |
{ | |
$SPECIATIONS_IN_TREE{$treefam_family} = $SPECIATIONS_IN_TREE{$treefam_family}.",".$species; | |
} | |
} | |
else { print STDERR "ERROR: node_type $node_type treefam_family $treefam_family\n"; exit;} | |
} | |
else # THIS MUST BE A LEAF NODE OR HAS NOT BEEN MARKED AS A SPECIATION BY MISTAKE. | |
{ | |
$num_spec_nodes_in_tree++; | |
if (!($SPECIATIONS_IN_TREE{$treefam_family})) | |
{ | |
$SPECIATIONS_IN_TREE{$treefam_family} = $species; | |
} | |
else | |
{ | |
$SPECIATIONS_IN_TREE{$treefam_family} = $SPECIATIONS_IN_TREE{$treefam_family}.",".$species; | |
} | |
} | |
} | |
else | |
{ | |
if ($VERBOSE == 1) { print "treefam_family $treefam_family excluding node $species\n";} | |
} | |
} | |
} | |
if ($NUM_DUP_NODES_IN_TREE{$treefam_family}) { print STDERR "ERROR: already have #dup nodes in $treefam_family\n"; exit;} | |
if ($NUM_SPEC_NODES_IN_TREE{$treefam_family}) { print STDERR "ERROR: already have #spec nodes in $treefam_family\n"; exit;} | |
$NUM_DUP_NODES_IN_TREE{$treefam_family} = $num_dup_nodes_in_tree; | |
$NUM_SPEC_NODES_IN_TREE{$treefam_family} = $num_spec_nodes_in_tree; | |
#---------------------------------------------------------------------------------# | |
# GET ALL THE TRANSCRIPTS IN THIS FAMILY CORRESPONDING TO THE PARALOGS THAT WE | |
# ARE INTERESTED IN: | |
$species_genes = $GENES{$treefam_family}; | |
@species_genes = split(/\,/,$species_genes); | |
@species_nodes = &Avril_modules::empty_array(@species_nodes); | |
@leaves = $tree->get_leaves; | |
foreach $species_gene(@species_genes) | |
{ | |
$species_node = "none"; | |
foreach $leaf(@leaves) | |
{ | |
# FIND THE GENE FOR THIS NODE: | |
$gh = $leaf->gene; | |
$gene = $gh->ID(); # eg. ENSMUSG00000022770 | |
if ($gene eq $species_gene) | |
{ | |
$species_node = $leaf; | |
goto FOUND_NODE; | |
} | |
} | |
FOUND_NODE: | |
if ($species_node eq "none") { print STDERR "ERROR: did not find node for $species_gene\n"; exit;} | |
@species_nodes = (@species_nodes,$species_node); | |
} | |
# GET THE ROOT NODE OF THE TREE: | |
$root_node = $tree->root(); | |
# FOR EACH OF THE PARALOGS OF INTEREST IN THE TREE, GATHER SOME INFORMATION: | |
for ($i = 0; $i <= $#species_nodes; $i++) | |
{ | |
$species_node_i = $species_nodes[$i]; | |
$species_gene_i = $species_genes[$i]; | |
#---------------------------------------------------------------------------# | |
# FIND THE NUMBER OF DUPLICATIONS/SPECIATIONS BETWEEN $species_node_i AND $root_node: | |
$num_dup_nodes_i_root = 0; | |
$num_spec_nodes_i_root = 0; | |
$node_i = $species_node_i; | |
FIND_NUM_DUP_NODES_I_ROOT: | |
# NOTE THAT WE WANT TO INCLUDE THE ROOT NODE OF THESE TREES IN THE COUNT. | |
if (defined($node_i->parent)) | |
{ | |
$parent = $node_i->parent; | |
# FIND THE SPECIES FOR THIS NODE: | |
if (defined($parent->get_tag_value('S'))) | |
{ | |
$species = $parent->get_tag_value('S'); | |
} | |
else | |
{ | |
print "WARNING: treefam_family $treefam_family: do not know species of node\n"; | |
$species = "none"; | |
} | |
$species =~ tr/[a-z]/[A-Z]/; | |
# IF THIS IS A SPECIES WE WANT TO TAKE: | |
if ($TAKE{$species}) | |
{ | |
if (defined($parent->get_tag_value('D'))) | |
{ | |
$type_i = $parent->get_tag_value('D'); # Y FOR DUPLICATION, N FOR SPECIATION | |
# IF IT IS A DUPLICATION NODE (Y) | |
if ($type_i eq 'Y') { $num_dup_nodes_i_root++; } | |
elsif ($type_i eq 'N') | |
{ | |
$num_spec_nodes_i_root++; | |
if (!($SPECIATIONS_I{$species_gene_i})) | |
{ | |
$SPECIATIONS_I{$species_gene_i} = $species; | |
} | |
else | |
{ | |
$SPECIATIONS_I{$species_gene_i} = $SPECIATIONS_I{$species_gene_i}.",".$species; | |
} | |
} | |
else { print STDERR "ERROR: type_i $type_i treefam_family $treefam_family\n"; exit;} | |
} | |
else # IF THE NODE TYPE IS NOT DEFINED, ASSUME IT IS A SPECIATION NODE. | |
{ | |
$num_spec_nodes_i_root++; | |
if (!($SPECIATIONS_I{$species_gene_i})) | |
{ | |
$SPECIATIONS_I{$species_gene_i} = $species; | |
} | |
else | |
{ | |
$SPECIATIONS_I{$species_gene_i} = $SPECIATIONS_I{$species_gene_i}.",".$species; | |
} | |
} | |
} | |
else | |
{ | |
if ($VERBOSE == 1) { print "treefam_family $treefam_family excluding parent node $species...\n";} | |
} | |
$node_i = $parent; | |
if ($parent eq $root_node) | |
{ | |
goto FOUND_ROOT1; | |
} | |
goto FIND_NUM_DUP_NODES_I_ROOT; | |
} | |
FOUND_ROOT1: | |
# RECORD $num_dup_nodes_i: | |
if ($NUM_DUP_NODES_I{$species_gene_i}) { print STDERR "ERROR: already have num_dup_nodes_i for $species_gene_i\n"; exit;} | |
$NUM_DUP_NODES_I{$species_gene_i} = $num_dup_nodes_i_root; | |
if ($NUM_SPEC_NODES_I{$species_gene_i}) { print STDERR "ERROR: already have num_spec_nodes_i for $species_gene_i\n"; exit;} | |
$NUM_SPEC_NODES_I{$species_gene_i} = $num_spec_nodes_i_root; | |
#---------------------------------------------------------------------------# | |
} | |
NEXT_FAMILY: | |
} | |
#------------------------------------------------------------------# | |
# PRINT OUT THE INFORMATION FOR EACH PAIR OF GENES: | |
open(LIST,"$list") || die "ERROR: cannot open $list\n"; | |
while(<LIST>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\s+/,$line); | |
$gene1 = $temp[0]; | |
$gene2 = $temp[1]; | |
# GET THE FAMILY FOR EACH GENE: | |
if (!($FAMILY{$gene1})) { print STDERR "ERROR: do not know family for $gene1\n"; exit;} | |
if (!($FAMILY{$gene2})) { print STDERR "ERROR: do not know family for $gene2\n"; exit;} | |
$AC1 = $FAMILY{$gene1}; | |
$AC2 = $FAMILY{$gene2}; | |
$AC = $AC1."/".$AC2; | |
# GET THE NUMBER OF GENES IN THE TREE FOR EACH FAMILY: | |
if (!($NUM_GENES_IN_TREE{$AC1})) { print STDERR "ERROR: do not know #genes in $AC1\n"; exit;} | |
if (!($NUM_GENES_IN_TREE{$AC2})) { print STDERR "ERROR: do not know #genes in $AC2\n"; exit;} | |
$num_genes_in_tree1 = $NUM_GENES_IN_TREE{$AC1}; | |
$num_genes_in_tree2 = $NUM_GENES_IN_TREE{$AC2}; | |
$num_genes_in_tree = $num_genes_in_tree1 + $num_genes_in_tree2; | |
# GET THE NUMBER OF C. ELEGANS GENES IN THE TREE FOR EACH FAMILY: | |
if (!($NUM_SPECIES_GENES_IN_TREE{$AC1})) { print STDERR "ERROR: do not know #CAEEL genes in $AC1\n"; exit;} | |
if (!($NUM_SPECIES_GENES_IN_TREE{$AC2})) { print STDERR "ERROR: do not know #CAEEL genes in $AC2\n"; exit;} | |
$num_species_genes_in_tree1 = $NUM_SPECIES_GENES_IN_TREE{$AC1}; | |
$num_species_genes_in_tree2 = $NUM_SPECIES_GENES_IN_TREE{$AC2}; | |
$num_species_genes_in_tree = $num_species_genes_in_tree1 + $num_species_genes_in_tree2; | |
# GET THE NUMBER OF DUPLICATION AND SPECIATION NODES IN EACH FAMILY: | |
if (!($NUM_DUP_NODES_IN_TREE{$AC1})) { $num_dup_nodes_in_tree1 = 0; } | |
else { $num_dup_nodes_in_tree1 = $NUM_DUP_NODES_IN_TREE{$AC1};} | |
if (!($NUM_DUP_NODES_IN_TREE{$AC2})) { $num_dup_nodes_in_tree2 = 0; } | |
else { $num_dup_nodes_in_tree2 = $NUM_DUP_NODES_IN_TREE{$AC2};} | |
# WE ADD ONE EXTRA DUPLICATION NODE FOR THE INFERRED DUPLICATION NODE THAT GAVE RISE TO THE TWO PARALOGS: | |
$num_dup_nodes_in_tree = $num_dup_nodes_in_tree1 + $num_dup_nodes_in_tree2 + 1; | |
if (!($NUM_SPEC_NODES_IN_TREE{$AC1})) { $num_spec_nodes_in_tree1 = 0; } | |
else { $num_spec_nodes_in_tree1 = $NUM_SPEC_NODES_IN_TREE{$AC1};} | |
if (!($NUM_SPEC_NODES_IN_TREE{$AC2})) { $num_spec_nodes_in_tree2 = 0; } | |
else { $num_spec_nodes_in_tree2 = $NUM_SPEC_NODES_IN_TREE{$AC2};} | |
$num_spec_nodes_in_tree = $num_spec_nodes_in_tree1 + $num_spec_nodes_in_tree2; | |
# GET THE NUMBER OF DUPLICATION AND SPECIATION NODES FOR THE CLADE DEFINED BY THE DUPLICATION | |
# NODE THAT GAVE RISE TO THE TWO PARALOGS (TWO FAMILIES): | |
# $num_genes_in_clade IS THE NUMBER OF GENES IN BOTH TREES TOGETHER: | |
$num_genes_in_clade = $num_genes_in_tree; | |
# $num_species_genes_in_clade IS THE NUMBER OF C. ELEGANS GENES IN BOTH TREES TOGETHER: | |
$num_species_genes_in_clade = $num_species_genes_in_tree; | |
# $num_dup_nodes_in_clade IS THE NUMBER OF DUPLICATION NODES IN BOTH TREES TOGETHER: | |
$num_dup_nodes_in_clade = $num_dup_nodes_in_tree; | |
# $num_spec_nodes_in_clade IS THE NUMBER OF SPECIATION NODES IN BOTH TREES TOGETHER: | |
$num_spec_nodes_in_clade = $num_spec_nodes_in_tree; | |
# GET THE NUMBER OF DUPLICATION NODES FROM $gene1 TO THE ROOT OF TREE 1: | |
if (!($GID{$gene1})) { print STDERR "ERROR: do not know GID for $gene1\n"; exit;} | |
$GID1 = $GID{$gene1}; | |
if (!($NUM_DUP_NODES_I{$GID1})) { $num_dup_nodes1 = 0; } | |
else { $num_dup_nodes1 = $NUM_DUP_NODES_I{$GID1}; } | |
if (!($GID{$gene2})) { print STDERR "ERROR: do not know GID for $gene2\n"; exit;} | |
$GID2 = $GID{$gene2}; | |
if (!($NUM_DUP_NODES_I{$GID2})) { $num_dup_nodes2 = 0; } | |
else { $num_dup_nodes2 = $NUM_DUP_NODES_I{$GID2}; } | |
# GET THE NUMBER OF SPECIATION NODES FROM $gene1 TO THE ROOT OF THE TREE 1: | |
if (!($NUM_SPEC_NODES_I{$GID1})) { $num_spec_nodes1 = 0; } | |
else { $num_spec_nodes1 = $NUM_SPEC_NODES_I{$GID1};} | |
if (!($NUM_SPEC_NODES_I{$GID2})) { $num_spec_nodes2 = 0; } | |
else { $num_spec_nodes2 = $NUM_SPEC_NODES_I{$GID2};} | |
# GET THE NUMBER OF SPECIATION EVENTS FROM $gene1 TO THE ROOT OF THE TREE 1: | |
if (!($SPECIATIONS_I{$GID1})) { $num_spec_events1 = 0; } | |
else | |
{ | |
$num_spec_events1 = $SPECIATIONS_I{$GID1}; | |
$num_spec_events1 = &count_speciations($num_spec_events1); | |
} | |
if (!($SPECIATIONS_I{$GID2})) { $num_spec_events2 = 0; } | |
else | |
{ | |
$num_spec_events2 = $SPECIATIONS_I{$GID2}; | |
$num_spec_events2 = &count_speciations($num_spec_events2); | |
} | |
# FIND THE NUMBER OF SPECIATIONS IN THE WHOLE TREE: | |
if (!($SPECIATIONS_IN_TREE{$AC1})) { print STDERR "ERROR: do not know speciations in $AC1 tree\n"; exit;} | |
if (!($SPECIATIONS_IN_TREE{$AC2})) { print STDERR "ERROR: do not know speciations in $AC2 tree\n"; exit;} | |
$speciations1 = $SPECIATIONS_IN_TREE{$AC1}; | |
$speciations2 = $SPECIATIONS_IN_TREE{$AC2}; | |
$speciations = $speciations1.",".$speciations2; | |
$num_speciations = &count_speciations($speciations); | |
$num_speciations1 = &count_speciations($speciations1); | |
$num_speciations2 = &count_speciations($speciations2); | |
# PRINT OUT THE INFORMATION FOR THIS PAIR OF PARALOGS: | |
print "$AC CAEEL $gene1 $gene2 $num_genes_in_tree $num_species_genes_in_tree $num_dup_nodes_in_tree $num_spec_nodes_in_tree $num_genes_in_clade $num_species_genes_in_clade $num_dup_nodes_in_clade $num_spec_nodes_in_clade $num_dup_nodes1 $num_spec_nodes1 $num_dup_nodes2 $num_spec_nodes2 $num_dup_nodes1 $num_spec_nodes1 $num_dup_nodes2 $num_spec_nodes2 $num_speciations $num_speciations1 $num_speciations2 $num_spec_events1 $num_spec_events2\n"; | |
} | |
close(LIST); | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# COUNT SPECIATION EVENTS: | |
sub count_speciations | |
{ | |
my $speciations = $_[0]; | |
my @speciations = split(/\,/,$speciations); | |
my $num = 0; | |
my %SEEN = (); | |
my $i; | |
my $speciation; | |
for ($i = 0; $i <= $#speciations; $i++) | |
{ | |
$speciation = $speciations[$i]; | |
if (!($SEEN{$speciation})) | |
{ | |
$num++; | |
$SEEN{$speciation} = 1; | |
} | |
} | |
return($num); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment