Created
March 1, 2013 13:22
-
-
Save avrilcoghlan/5064629 to your computer and use it in GitHub Desktop.
Perl script to identify gene losses in human since divergence from chimp, based on TreeFam trees
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script treefam_genelosses.pl | |
# Written by Avril Coghlan ([email protected]). | |
# 28-Aug-06. | |
# | |
# For the TreeFam project. | |
# | |
# This perl script connects to the MYSQL database of | |
# TreeFam families and prints out a list of gene losses from | |
# fully sequenced genomes. This script uses Jean-Karim's TreeFam | |
# API. Finds losses in TreeFam-A clean trees (not seed, as misses fully sequenced genomes, | |
# and not full trees, as are sometimes wrong). xxx use TreeFam-B too? | |
# | |
# The command-line format is: | |
# % perl <treefam_genelosses.pl> | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 0) | |
{ | |
print "Usage of treefam_genelosses.pl\n\n"; | |
print "perl -w treefam_genelosses.pl\n"; | |
print "For example, >perl -w treefam_genelosses.pl\n"; | |
exit; | |
} | |
# DECLARE MYSQL USERNAME AND HOST: | |
use strict; | |
use Treefam::DBConnection; | |
my $VERBOSE = 0; # IF $VERBOSE==1, PRINT OUT EXTRA DETAILS. | |
my $dbc = Treefam::DBConnection->new(); | |
#------------------------------------------------------------------# | |
# GET ALL THE TREEFAM-A FAMILIES IN THE CURRENT RELEASE: | |
my $famh = $dbc->get_FamilyHandle(); # GET A FAMILY HANDLE. | |
my @families = $famh->get_all_by_type('A'); # GET ALL TREEFAM-A FAMILIES. | |
my $no_families = 0; | |
foreach my $family(@families) | |
{ | |
my $AC = $family->ID(); # GET THE FAMILY ID. | |
$no_families++; | |
print "\n$no_families: reading family $AC...\n"; | |
my $tree = $family->tree('clean'); # GET THE TREEFAM-A CLEAN TREE. xxx look at treefam-b too? | |
# GET THE NAMES OF ALL THE GENES IN THIS TREE: | |
my @leaves = $tree->get_leaf_nodes; | |
foreach my $leaf(@leaves) | |
{ | |
my @tags = $leaf->get_all_tags(); | |
my $is_gene_loss = 0; | |
my $loss = "none"; | |
foreach my $tag(@tags) | |
{ | |
# THE TAGS ARE: | |
# G: GENE ID, eg. SPBC12D12.06 OR ENSG00000112237 | |
# O: TREEFAM ID. eg. SPBC12D12.06 OR ENST00000326298.2 | |
# S: SPECIES, eg. SCHP0 OR HUMAN | |
# E: GENE LOSS | |
# B: BOOTSTRAP VALUE | |
for my $value ($leaf->get_tag_values($tag)) | |
{ | |
if ($tag eq 'E') | |
{ | |
$loss = $value; | |
$is_gene_loss = 1; | |
my @temp = split(/\$\-/,$loss); | |
$loss = $temp[1]; | |
if ($loss ne 'HUMAN' && $loss ne 'Homo') { goto NEXT_LEAF;} # xxx JUST LOOKING AT LOSES OF HUMAN GENES WHERE THERE IS A CHIMP GENE. | |
# NOTE THAT IF CHIMP AND HUMAN WERE LOST, THEN $loss WOULD BE 'HUMAN,PANTR'. | |
print "___ $AC: ID ",$leaf->id(),"\n"; # THIS IS SYMBOL."_".SPECIES, eg. CCNC_HUMAN | |
print "___ GENE LOSS $loss (AC $AC)\n"; | |
} | |
} | |
} | |
# IF THIS NODE $leaf->id() HAS A GENE LOSS ASSOCIATED WITH IT, CHECK WHETHER | |
# THE CLADE THAT IT IS IN IS GOOD EVIDENCE FOR THE LOSS: | |
my $parent = 'none'; | |
if ($is_gene_loss == 1) | |
{ | |
#---------------------------------------------------------# | |
FIND_PARENT: | |
# FIND THE PARENT NODE OF NODE $leaf: | |
if ($parent eq 'none') | |
{ | |
if (defined($leaf->ancestor)) { $parent = $leaf->ancestor; } | |
else { goto ALL_PARENTS_FOUND; } | |
} | |
else # GET THE PARENT NODE OF NODE $parent: | |
{ | |
if (defined($parent->ancestor)){ $parent = $parent->ancestor;} | |
else { goto ALL_PARENTS_FOUND; } | |
} | |
if ($VERBOSE == 1) { print "___ PARENT OF CURRENT NODE is", $parent->internal_id(),"\n";} | |
# FIND THE DESCENDENTS OF NODE $parent, AND CHECK IF THE CLADE OF DESCENDENTS | |
# IS GOOD EVIDENCE FOR THE GENE LOSS $loss: | |
my %SPECIES = (); # HASH TABLE TO RECORD SPECIES IN THIS CLADE. | |
for my $descendent ($parent->get_all_Descendents()) | |
{ | |
if (defined($descendent->id())) # IF THIS IS A LEAF. | |
{ | |
my $id = $descendent->id(); | |
if ($VERBOSE == 1) { print "___ WHICH HAS DESCENDENT $id\n";} | |
my @temp = split(/_/,$id); | |
my $species = $temp[$#temp]; | |
$SPECIES{$species} = 1; | |
} | |
} | |
# CHECK WHETHER THE CLADE OF DESCENDENTS IS GOOD EVIDENCE FOR THE GENE LOSS $loss: | |
my $evidence_for_loss= &check_evidence_for_loss($loss,\%SPECIES); | |
if ($VERBOSE == 1) { print "___ EVIDENCE FOR LOSS IN $loss: $evidence_for_loss\n";} | |
if ($evidence_for_loss == 1) { goto ALL_PARENTS_FOUND;} | |
# FIND THE PARENT OF NODE $parent: | |
goto FIND_PARENT; | |
#---------------------------------------------------------# | |
ALL_PARENTS_FOUND: | |
if ($VERBOSE == 1) { print "___ ALL PARENTS FOUND\n\n";} | |
# IF THE CLADE OF DESCENDENTS IS GOOD EVIDENCE FOR GENE LOSS $loss, CHECK IF THERE | |
# ARE ANY OTHER LOSSES IN THAT CLADE THAT MAY MAKE $loss LESS BELIEVABLE: | |
print "___ EVIDENCE FOR LOSS IN $loss: $evidence_for_loss\n"; | |
if ($evidence_for_loss == 1) | |
{ | |
for my $descendent ($parent->get_all_Descendents()) | |
{ | |
if (defined($descendent->id())) # IF THIS IS A LEAF. | |
{ | |
my $id = $descendent->id(); | |
my ($loss2) = $descendent->get_tag_values('E'); | |
if (defined($loss2)) | |
{ | |
my @temp2 = split(/\$-/,$loss2); | |
my $species2= $temp2[1]; | |
print "descendent $id -> loss $loss2\n"; | |
if (($loss eq 'HUMAN' || $loss eq 'Homo') && ($species2 =~ /PANTR/ || $species2 =~ /Pan/)) # xxx add other cases | |
{ | |
$evidence_for_loss = 0; | |
} | |
} | |
} | |
} | |
} | |
print "___ FINAL EVIDENCE FOR LOSS IN $loss: now $evidence_for_loss\n"; | |
#---------------------------------------------------------# | |
} | |
NEXT_LEAF: # xxx | |
} | |
} | |
print "\n\n"; | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# SUBROUTINE TO CHECK WHETHER A CLADE OF GENES IS GOOD EVIDENCE FOR A GENE LOSS: | |
sub check_evidence_for_loss | |
{ | |
my $loss = $_[0]; # THE SPECIES IN WHICH THE PUTATIVE GENE LOSS OCCURRED. | |
my $SPECIES = $_[1]; # POINTER TO HASH TABLE OF SPECIES IN THE CLADE THAT SPECIES $species IS MISSING FROM. | |
my $evidence = 1; # SAYS WHETHER THE EVIDENCE FOR GENE LOSS IS STRONG. | |
my @all = ('HUMAN','PANTR','MACMU','BOVIN','CANFA','MOUSE','RAT','MONDO', | |
'CHICK','XENTR','TETNG','FUGRU','BRARE','CIOIN','DROME','DROPS', | |
'CAEEL','CAEBR','CAERE','APIME','ANOGA','YEAST','SCHPO','ARATH','SCHMA'); | |
# ALL THE FULLY SEQUENCED SPECIES. | |
my @expect; # THE SPECIES THAT WE EXPECT TO SEE IN THE CLADE. | |
my $i; # | |
my %EXPECT = (); # | |
my $expecti; # | |
my @expecti; # | |
my $j; # | |
my $found_expecti = 0; # | |
my $loss2; # | |
# CHECK WHETHER THERE IS STRONG EVIDENCE FOR A GENE LOSS: # xxx ADD MORE CASES | |
# FOR EXAMPLE, WE CAN BE SURE OF A HUMAN GENE LOSS IF THE CLADE CONTAINS AT LEAST | |
# ONE CHIMP OR ONE MACAQUE, AT LEAST ONE RAT OR ONE MOUSE, AT LEAST ONE COW, AND | |
# AT LEAST ONE DOG GENE: | |
if ($loss eq 'HUMAN') { @expect = ('PANTR','MACMU','MOUSE/RAT','BOVIN/CANFA'); } | |
# CHECK THAT WE SEE ALL THE SPECIES THAT WE EXPECT IN THE CLADE, AND NO OTHER SPECIES: | |
for ($i = 0; $i <= $#expect; $i++) | |
{ | |
$expecti = $expect[$i]; | |
@expecti = split(/\//,$expecti); | |
$found_expecti = 0; | |
# CHECK WHETHER WE SEE AT LEAST ONE OF PANTR/MACMU FOR EXAMPLE: | |
for ($j = 0; $j <= $#expecti; $j++) | |
{ | |
$EXPECT{$expecti[$j]} = 1; | |
if ($SPECIES->{$expecti[$j]}) { $found_expecti = 1;} | |
} | |
if ($found_expecti == 0) { $evidence = 0;} # WE DON'T FIND A SPECIES THAT WE EXPECT TO SEE IN THE CLADE. | |
} | |
# CHECK IF WE SEE ANY SPECIES IN THE CLADE THAT WE DON'T EXPECT TO SEE: | |
for ($i = 0; $i <= $#all; $i++) | |
{ | |
if (!($EXPECT{$all[$i]}) && $all[$i] ne $loss) | |
{ | |
if ($SPECIES->{$all[$i]}) { $evidence = 0;} # WE FIND A SPECIES THAT WE DON'T EXPECT TO SEE IN THE CLADE. | |
} | |
} | |
return($evidence); | |
} | |
#------------------------------------------------------------------# |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment