Created
March 1, 2013 14:16
-
-
Save avrilcoghlan/5064924 to your computer and use it in GitHub Desktop.
Perl script that identifies singleton genes in a species, by finding genes from that species that appear in TreeFam families that do not have any other genes from that species
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/perl | |
# Avril Coghlan | |
# 24-Feb-09 | |
# Perl script to identify singleton genes in a species, defined by genes | |
# that do not appear in a TreeFam family with any gene from the same species. | |
use DBI; | |
use lib "/home/bcri/acoghlan/perlmodulesAvril/"; # edit to point to the TreeFam perl api xxx | |
use Treefam::DBConnection; | |
$the_species = $ARGV[0]; # the species name, eg. CAEEL | |
$release = $ARGV[1]; # the treefam release to use | |
$VERBOSE = 1; | |
# get all the TreeFam families that contain C. elegans genes: | |
$dbc = Treefam::DBConnection->new(); | |
$famh = $dbc->get_FamilyHandle(); | |
my @families = $famh->get_all_by_species($the_species); | |
%SINGLETON = (); | |
for ($i = 0; $i <= $#families; $i++) | |
{ | |
$family = $families[$i]; | |
# Note: it seems to be necessary to go around a quite circular route to get the | |
# treefam family object in treefam-7 (this wasn't necessary in treefam-6): | |
if ($family eq '') { print STDERR "WARNING: family $family\n"; goto NEXT_FAMILY;} | |
if ($VERBOSE == 1) { print "1. Looking at family $family...\n";} | |
if (defined($family->AC())) { $family = $family->AC(); } | |
if ($VERBOSE == 1) { print "2. Looking at family $family...\n";} | |
# GET THE FAMILY FOR THIS TREE: | |
$famh = $dbc->get_FamilyHandle(); | |
$family = $famh->get_by_id($family); | |
# GET THE TREE FOR THE FAMILY: | |
if (!(defined($family->get_tree('clean')))) | |
{ | |
print "WARNING: AC $AC: there is no clean tree!\n"; | |
goto NEXT_FAMILY; | |
} | |
$tree = $family->get_tree('clean'); # GET THE TREEFAM CLEAN TREE. | |
if (!(defined($tree->get_leaves))) | |
{ | |
print "WARNING: AC $AC: no leaves in tree!\n"; | |
goto NEXT_FAMILY; | |
} | |
$num_species_genes = 0; | |
$species_genes = ""; | |
@nodes = $tree->get_leaves; | |
foreach $node(@nodes) | |
{ | |
# FIND THE SPECIES FOR THIS TRANSCRIPT: | |
if (defined($node->get_tag_value('S'))) | |
{ | |
$species = $node->get_tag_value('S'); | |
} | |
else | |
{ | |
print "WARNING: AC $AC: do not know species of node\n"; | |
$species = "none"; | |
} | |
$species =~ tr/[a-z]/[A-Z]/; | |
if ($species eq $the_species) | |
{ | |
$num_species_genes++; | |
# FIND THE GENE FOR THIS TRANSCRIPT: | |
if ($the_species eq 'CAEEL' && $release eq '6') | |
{ | |
$gene = $node->sequence_id(); # eg. T24D1.1b, note $node->gene gives the wbg gene name | |
$gene =~ tr/[a-z]/[A-Z]/; | |
$last_letter = substr($gene,length($gene)-1,1); | |
if ($last_letter =~ /[A-Z]/) { chop($gene);} | |
} | |
elsif ($the_species eq 'CAEEL' && $release eq '7') | |
{ | |
$gene = $node->gene(); | |
$gene = $gene->ID(); | |
} | |
elsif ($the_species eq 'CAEEL' && $release eq '5') | |
{ | |
$gene = $node->gene(); # eg. MTCE.3 or T24D1.1 | |
$gene = $gene->ID(); | |
} | |
elsif ($the_species eq 'CAEEL' && $release eq '4') | |
{ | |
$gene = $node->gene(); # eg. MTCE.3 or T24D1.1 | |
$gene = $gene->ID(); | |
} | |
elsif (($the_species eq 'HUMAN' || $the_species eq 'MOUSE') && $release eq '7') | |
{ | |
$gene = $node->gene(); # eg. ENSG00000215614 | |
$gene = $gene->ID(); | |
} | |
elsif (($the_species eq 'HUMAN' || $the_species eq 'MOUSE') && $release eq '6') | |
{ | |
$gene = $node->gene(); # eg. ENSG00000215614 | |
$gene = $gene->ID(); | |
} | |
elsif (($the_species eq 'HUMAN' || $the_species eq 'MOUSE') && $release eq '5') | |
{ | |
$gene = $node->gene(); # eg. ENSG00000215614 | |
$gene = $gene->ID(); | |
} | |
elsif (($the_species eq 'HUMAN' || $the_species eq 'MOUSE') && $release eq '4') | |
{ | |
$gene = $node->gene(); # eg. ENSG00000215614 | |
$gene = $gene->ID(); | |
} | |
elsif ($the_species eq 'DROME' && $release eq '7') | |
{ | |
$gene = $node->gene(); | |
$gene = $gene->ID(); | |
} | |
else | |
{ | |
print STDERR "ERROR: the_species $the_species release $release\n"; | |
exit; | |
} | |
$species_genes = $species_genes.",".$gene; | |
} | |
} | |
if ($num_species_genes == 1) | |
{ | |
$gene = substr($species_genes,1,length($species_genes)-1); | |
if (!($SINGLETON{$gene})) { $SINGLETON{$gene} = "yes";} | |
} | |
elsif ($num_species_genes >= 2) | |
{ | |
$species_genes = substr($species_genes,1,length($species_genes)-1); | |
@species_genes = split(/\,/,$species_genes); | |
for ($j = 0; $j <= $#species_genes; $j++) | |
{ | |
$gene = $species_genes[$j]; | |
$SINGLETON{$gene} = "no"; | |
} | |
} | |
NEXT_FAMILY: | |
} | |
# PRINT OUT ALL THE SINGLETON GENES: | |
foreach $gene (keys %SINGLETON) | |
{ | |
if ($SINGLETON{$gene} eq 'yes') | |
{ | |
print "$gene\n"; | |
} | |
} | |
print STDERR "FINISHED\n"; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment