Created
March 1, 2013 15:17
-
-
Save avrilcoghlan/5065299 to your computer and use it in GitHub Desktop.
Perl script that connects to the TreeFam mysql database, and prints out a list of Caenorhabditis elegans and Caenorhabditis briggsae genes in TreeFam families
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script list_treefam_genes3.pl | |
# Written by Avril Coghlan ([email protected]). | |
# 18-JAN-06. | |
# Updated 6-Dec-07. | |
# | |
# For the TreeFam project. | |
# | |
# This perl script connects to the MYSQL database of | |
# TreeFam families and prints out a list of the C. elegans and | |
# C. briggsae genes in TreeFam families. | |
# | |
# The output has the format: | |
# WORM_GENE NUMBER_OF_FAMILIES FAMILIES | |
# where WORM_GENE is the gene name, eg., R13F6.4 for a C. elegans | |
# gene or CBG100063 for a C. briggsae gene, | |
# NUMBER_OF_FAMILIES is the number of TreeFam families that | |
# WORM_GENE appears in, | |
# FAMILIES is a list of the families that WORM_GENE is in. | |
# | |
# The command-line format is: | |
# % perl <list_treefam_genes3.pl> | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 0) | |
{ | |
print "Usage of list_treefam_genes3.pl\n\n"; | |
print "perl -w list_treefam_genes3.pl\n"; | |
print "For example, >perl -w list_treefam_genes3.pl\n"; | |
exit; | |
} | |
# DECLARE MYSQL USERNAME AND HOST: | |
use DBI; | |
#------------------------------------------------------------------# | |
# GET THE LONG NAMES OF THE TREEFAM GENES FROM THE MYSQL DATABASE: | |
%WORM = (); # HASH TABLE TO KEEP A LIST OF WORM GENES IN TREEFAM. | |
$database = 'treefam_4'; | |
$dbh = DBI->connect("dbi:mysql:treefam_4:db.treefam.org:3308", 'anonymous', '') || return; | |
$table_w = 'genes'; | |
# THIS TABLE HAS THE ID AND DISPLAY ID. SOMETIMES THE DISPLAY ID IS | |
# THE UNIPROT NAME, SOMETIMES THE ID IS: | |
$st = "SELECT ID, TAX_ID from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) { | |
$ID = $array[0]; # eg., AH3.1 for a C. elegans gene or | |
# WBGene00024691 for a C. briggsae gene | |
$TAX_ID = $array[1]; # eg., 6239 for a C. elegans gene or | |
# 6238 for a C. briggsae gene | |
if ($TAX_ID == 6239 || $TAX_ID == 6238) # IT IS A C. ELEGANS OR C. BRIGGSAE GENE | |
{ | |
# REMEMBER THAT THIS WORM GENE IS IN TREEFAM: | |
$WORM{$ID} = $TAX_ID; | |
} | |
} | |
} | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
#------------------------------------------------------------------# | |
# GET THE NAMES OF ALL THE TREEFAM FAMILIES AND THE GENES THAT ARE IN | |
# THEM FROM THE MYSQL DATABASE: | |
$database = 'treefam'; | |
$dbh = DBI->connect("dbi:mysql:treefam_4:db.treefam.org:3308", 'anonymous', '') || return; | |
# FIRST READ IN TREEFAM-A AND THEN TREEFAM-B: | |
%FAMILY = (); # HASH TABLE TO KEEP A RECORD OF THE TREEFAM FAMILIES THAT A WORM GENE IS IN. | |
for ($i = 1; $i <= 2; $i++) | |
{ | |
# SPECIFY THE TABLE: | |
if ($i == 1) # LOOK AT TREEFAM-A: | |
{ | |
$table_w = 'famA_gene'; | |
} | |
elsif ($i == 2) # LOOK AT TREEFAM-B: | |
{ | |
$table_w = 'famB_gene'; | |
} | |
# THE FIRST THREE COLUMNS IN THE TABLE famB_gene/famA_gene ARE THE TRANSCRIPT NAME, FAMILY NAME AND WHETHER THE | |
# TRANSCRIPT IS IN THE SEED/FULL TREE: | |
# eg., ENSMUST00000049178.2 TF105085 FULL | |
$st = "SELECT ID, AC, FLAG from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) { | |
$ID = $array[0]; # eg., F40G9.2.1 for a C. elegans gene OR WBGene00027163 for a C. briggsae gene. | |
$AC = $array[1]; # eg., TF105085, NAME OF THE TREEFAM FAMILY. | |
$FLAG = $array[2]; # eg., FULL OR BOTH | |
if (($FLAG eq 'FULL' || $FLAG eq 'BOTH')) # WE ARE ONLY INTERESTED IN THE FULL TREES. | |
{ | |
# CHECK IF IT IS A C. BRIGGSAE OR C. ELEGANS GENE: | |
if ($WORM{$ID}) | |
{ | |
# REMEMBER THE FAMILIES THAT THIS WORM GENE IS IN: | |
if (!($FAMILY{$ID})) { $FAMILY{$ID} = $AC; } | |
else { $FAMILY{$ID} = $FAMILY{$ID}.",".$AC;} | |
} | |
} | |
} | |
} | |
} | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
#------------------------------------------------------------------# | |
# PRINT OUT A LIST OF THE WORM GENES THAT APPEAR IN TREEFAM, AND THE | |
# FAMILIES THAT THEY APPEAR IN: | |
print "WORM_GENE NUMBER_OF_FAMILIES FAMILIES\n"; | |
foreach $ID (keys %FAMILY) | |
{ | |
$family = $FAMILY{$ID}; | |
@family = split(/\,/,$family); # THIS IS A LIST OF THE FAMILIES THAT A WORM GENE APPEARS IN. | |
$no_families = $#family + 1; # THIS IS THE NUMBER OF FAMILIES THAT A WORM GENE APPEARS IN. | |
print "$ID $no_families $family\n"; | |
} | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment