Created
March 1, 2013 13:26
-
-
Save avrilcoghlan/5064651 to your computer and use it in GitHub Desktop.
Perl script to find genes that appear in more than one TreeFam-A seed tree
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script treefam_overlaps2.pl | |
# Written by Avril Coghlan ([email protected]). | |
# 25-Aug-06. | |
# | |
# For the TreeFam project. | |
# | |
# This perl script connects to the MYSQL database of | |
# TreeFam families and prints out a list of the genes that | |
# appear in more than one TreeFam-A seed tree. This script | |
# uses Jean-Karim's TreeFam API. | |
# | |
# The command-line format is: | |
# % perl <treefam_overlaps2.pl> | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 0) | |
{ | |
print "Usage of treefam_overlaps2.pl\n\n"; | |
print "perl -w treefam_overlaps2.pl\n"; | |
print "For example, >perl -w treefam_overlaps2.pl\n"; | |
exit; | |
} | |
# DECLARE MYSQL USERNAME AND HOST: | |
use strict; | |
use Treefam::DBConnection; | |
my $VERBOSE = 0; # IF $VERBOSE==1, PRINT OUT EXTRA DETAILS. | |
my $dbc = Treefam::DBConnection->new(); | |
#------------------------------------------------------------------# | |
my %FAMILY = (); # HASH TABLE TO KEEP A RECORD OF THE TREEFAM-A FAMILIES THAT A GENE IS IN. | |
# GET ALL THE TREEFAM-A FAMILIES IN THE CURRENT RELEASE: | |
my $famh = $dbc->get_FamilyHandle(); # GET A FAMILY HANDLE. | |
my @families = $famh->get_all_by_type('A'); # GET ALL TREEFAM-A FAMILIES. | |
my $no_families = 0; | |
foreach my $family(@families) | |
{ | |
if ($VERBOSE == 1) { print $family->ID,"\n";} | |
my $AC = $family->ID(); # GET THE FAMILY ID. | |
$no_families++; | |
print STDERR "$no_families: reading family $AC...\n"; | |
my $tree = $family->tree('seed'); # GET THE TREEFAM-A SEED TREE. | |
# GET THE NAMES OF ALL THE GENES IN THIS TREE: | |
my @leaves = $tree->get_leaf_nodes; | |
foreach my $leaf(@leaves) | |
{ | |
my @tags = $leaf->get_all_tags(); | |
if ($VERBOSE == 1) { print "ID: ",$leaf->id(),"\t"; } # THIS IS THE SYMBOL."_".SPECIES, eg. srb11_SCHP0 OR CCNC_HUMAN | |
foreach my $tag(@tags) | |
{ | |
# THE TAGS ARE: | |
# G: GENE ID, eg. SPBC12D12.06 OR ENSG00000112237 | |
# O: TREEFAM ID. eg. SPBC12D12.06 OR ENST00000326298.2 | |
# S: SPECIES, eg. SCHP0 OR HUMAN | |
for my $value ($leaf->get_tag_values($tag)) | |
{ | |
if ($tag eq 'G') | |
{ | |
my $GID = $value; | |
if ($VERBOSE == 1) { print "gene $GID (AC $AC)\n";} | |
# REMEMBER THE FAMILIES THAT THIS GENE IS IN: | |
if (!($FAMILY{$GID})) { $FAMILY{$GID} = $AC; } | |
else { $FAMILY{$GID} = $FAMILY{$GID}.",".$AC;} | |
} | |
} | |
} | |
} | |
} | |
if ($VERBOSE == 1) { print "\n\n";} | |
#------------------------------------------------------------------# | |
# PRINT OUT A LIST OF THE GENES THAT APPEAR IN TREEFAM-A SEED FAMILIES, AND THE | |
# FAMILIES THAT THEY APPEAR IN: | |
print "GENE NUMBER_OF_FAMILIES FAMILIES\n"; | |
my $no_overlapping_families = 0; | |
my $no_genes = 0; | |
my %SEEN = (); | |
foreach my $GID (keys %FAMILY) | |
{ | |
my $family = $FAMILY{$GID}; | |
my @family = split(/\,/,$family); # THIS IS A LIST OF THE FAMILIES THAT A GENE APPEARS IN. | |
my $no_families = $#family + 1; # THIS IS THE NUMBER OF FAMILIES THAT A GENE APPEARS IN. | |
if ($no_families > 1) | |
{ | |
if ($SEEN{$GID}) { print STDERR "ERROR: already counted gene $GID.\n"; exit;} | |
$SEEN{$GID} = 1; | |
$no_genes++; | |
print "$GID $no_families $family\n"; | |
# CHECK IF WE HAVE SEEN THESE FAMILIES BEFORE: | |
for (my $i = 0; $i < $no_families; $i++) | |
{ | |
my $this_family = $family[$i]; | |
if (!($SEEN{$this_family})) | |
{ | |
$no_overlapping_families++; | |
$SEEN{$this_family} = 1; | |
} | |
} | |
} | |
} | |
print "There are $no_genes genes in more than one TreeFam-A seed family.\n"; | |
print "They are in $no_overlapping_families different TreeFam-A seed families.\n"; | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment