Created
March 1, 2013 16:39
-
-
Save avrilcoghlan/5065912 to your computer and use it in GitHub Desktop.
Perl scrip tthat connects to the TreeFam mysql database, and finds Schistosoma mansoni genes that are single-copy, that have multiple orthologs in most other animals.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script treefam_flatworm.pl | |
# Written by Avril Coghlan ([email protected]). | |
# 30-Jun-06. | |
# | |
# This perl script connects to the MYSQL database of | |
# TreeFam families and finds flatworm genes that are | |
# single-copy, that have multiple orthologs in most | |
# other animals. | |
# | |
# The command-line format is: | |
# % perl <treefam_flatworm.pl> <db> | |
# where db says whether to use TreeFamA or TreeFamB. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 1) | |
{ | |
print "Usage of treefam_flatworm.pl\n\n"; | |
print "perl -w treefam_flatworm.pl <db>\n"; | |
print "where <db> says whether to use TreeFamA or TreeFamB.\n"; | |
print "For example, >perl -w treefam_flatworm.pl A\n"; | |
exit; | |
} | |
# READ IN MY PERL MODULES: | |
BEGIN { | |
unshift (@INC, '/nfs/team54/alc/perl/modules'); | |
} | |
# DECLARE MYSQL USERNAME AND HOST: | |
use Avril_modules; | |
# DECLARE MYSQL USERNAME AND HOST: | |
use DBI; | |
# CHECK WHETHER WE SHOULD USE TREEFAM A OR TREEFAM B: | |
$db = $ARGV[0]; | |
#------------------------------------------------------------------# | |
# FIND THE SPECIES OF EACH TREEFAM GENE: | |
# HASH TABLES TO REMEMBER THE SPECIES OF GENES: | |
%SPECIES = (); | |
$database = 'treefam_3'; | |
$dbh = DBI->connect("dbi:mysql:treefam_3:db.treefam.org:3308", 'anonymous', '') || return; | |
# SPECIFY THE TABLE: | |
$table_w = 'genes'; | |
# THE FIRST COLUMN IN THIS TABLE IS AN IDENTIFIER (IDX), AND | |
# THE SECOND COLUMN IS THE TRANSCRIPT NAME, AND THE LAST COLUMN | |
# IS THE TAXONOMY ID, eg. 1 ENSANGT00000032162.1 7165 | |
$st = "SELECT ID, TAX_ID from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) { | |
$ID = $array[0]; # eg., ENST00000356572.1 | |
$TAXID = $array[1]; # eg 7165 | |
if ($TAXID eq '9606') { $SPECIES{$ID} = 'HS';} # HOMO SAPIENS | |
elsif ($TAXID eq '7227') { $SPECIES{$ID} = 'DM';} # DROSOPHILA MELANOGASTER | |
elsif ($TAXID eq '6239') { $SPECIES{$ID} = 'CE';} # CAENORHABDITIS ELEGANS | |
elsif ($TAXID eq '9031') { $SPECIES{$ID} = 'GG';} # GALLUS GALLUS | |
elsif ($TAXID eq '10116'){ $SPECIES{$ID} = 'RN';} # RATTUS NORVEGICUS | |
elsif ($TAXID eq '10090'){ $SPECIES{$ID} = 'MM';} # MUS MUSCULUS | |
elsif ($TAXID eq '6183') { $SPECIES{$ID} = 'SM';} # SCHISTOSOMA MANSONI | |
} | |
} | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
#------------------------------------------------------------------# | |
# GET THE NAMES OF ALL THE TREEFAM FAMILIES AND THE GENES THAT ARE IN | |
# THEM FROM THE MYSQL DATABASE: | |
%NUM = (); | |
$database = 'treefam_3'; | |
$dbh = DBI->connect("dbi:mysql:treefam_3:db.treefam.org:3308", 'anonymous', '') || return; | |
# SPECIFY THE TABLE: | |
if ($db eq 'A') # LOOK AT TREEFAM-A: | |
{ | |
$table_w = 'famA_gene'; | |
} | |
elsif ($db eq 'B') # LOOK AT TREEFAM-B: | |
{ | |
$table_w = 'famB_gene'; | |
} | |
else { print STDERR "ERROR: db is $db.\n"; exit;} | |
# THE FIRST THREE COLUMNS IN THE TABLE famB_gene/famA_gene ARE THE TRANSCRIPT NAME, FAMILY NAME AND WHETHER THE | |
# TRANSCRIPT IS IN THE SEED/FULL TREE: | |
# # eg., ENSMUST00000049178.2 TF105085 FULL | |
$st = "SELECT ID, AC, FLAG from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) { | |
$ID = $array[0]; # eg., ENSMUST00000049178.2 | |
$AC = $array[1]; # eg., TF105085 | |
$FLAG = $array[2]; # eg., FULL OR BOTH | |
if ($FLAG eq 'FULL' || $FLAG eq 'BOTH') | |
{ | |
if ($SPECIES{$ID}) | |
{ | |
$species = $SPECIES{$ID}; | |
$key = $AC."_".$species; | |
if (!($NUM{$key})) { $NUM{$key} = 1; } | |
else { $NUM{$key}++; } | |
} | |
} | |
} | |
} | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
#------------------------------------------------------------------# | |
# FIND FAMILIES THAT HAVE JUST ONE SCHISTOSOMA MANSONI GENE: | |
$database = 'treefam_3'; | |
$dbh = DBI->connect("dbi:mysql:treefam_3:db.treefam.org:3308", 'anonymous', '') || return; | |
# SPECIFY THE TABLE: | |
if ($db eq 'A') # Use TreeFam-A | |
{ | |
$table_w = 'familyA'; | |
} | |
elsif ($db eq 'B') # Use TreeFam-B | |
{ | |
$table_w = 'familyB'; | |
} | |
else { print STDERR "ERROR: db is $db.\n"; exit;} | |
# THE FIRST COLUMN IN THE TABLE IS THE FAMILY ACCESSION NUMBER eg. TF105036: | |
$st = "SELECT AC from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$AC = $array[0]; | |
$HS = 0; $DM = 0; $CE = 0; $GG = 0; $RN = 0; $MM = 0; $SM = 0; | |
$key = $AC."_HS"; if ($NUM{$key}) { $HS = $NUM{$key};} | |
$key = $AC."_DM"; if ($NUM{$key}) { $DM = $NUM{$key};} | |
$key = $AC."_CE"; if ($NUM{$key}) { $CE = $NUM{$key};} | |
$key = $AC."_GG"; if ($NUM{$key}) { $GG = $NUM{$key};} | |
$key = $AC."_RN"; if ($NUM{$key}) { $RN = $NUM{$key};} | |
$key = $AC."_MM"; if ($NUM{$key}) { $MM = $NUM{$key};} | |
$key = $AC."_SM"; if ($NUM{$key}) { $SM = $NUM{$key};} | |
# IF THERE IS ONE SCHISTOSOMA GENE AND MORE THAN TWO GENES IN MOST | |
# OF THE OTHER SPECIES: | |
if ($SM == 1 && $HS > 2 && $DM > 2 && $CE > 2 && $GG > 2 && $RN > 2 && $MM > 2) | |
{ | |
print "$AC\n"; | |
} | |
} | |
} | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment