Created
March 1, 2013 16:43
-
-
Save avrilcoghlan/5065941 to your computer and use it in GitHub Desktop.
Perl script that connects to the TreeFam mysql database, and finds Schistosoma mansoni genes that are multi-copy, but that have just one or two orthologs in most other animals.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script treefam_flatworm2.pl | |
# Written by Avril Coghlan ([email protected]). | |
# 30-Jun-06. | |
# | |
# This perl script connects to the MYSQL database of | |
# TreeFam families and finds flatworm genes that are | |
# multi-copy, that have 1 or 2 orthologs in most | |
# other animals. | |
# | |
# The command-line format is: | |
# % perl <treefam_flatworm2.pl> <db> | |
# where db says whether to use TreeFamA or TreeFamB. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 1) | |
{ | |
print "Usage of treefam_flatworm2.pl\n\n"; | |
print "perl -w treefam_flatworm2.pl <db>\n"; | |
print "where <db> says whether to use TreeFamA or TreeFamB.\n"; | |
print "For example, >perl -w treefam_flatworm2.pl A\n"; | |
exit; | |
} | |
# READ IN MY PERL MODULES: | |
BEGIN { | |
unshift (@INC, '/nfs/team54/alc/perl/modules'); | |
} | |
# DECLARE MYSQL USERNAME AND HOST: | |
use Avril_modules; | |
# DECLARE MYSQL USERNAME AND HOST: | |
use DBI; | |
# CHECK WHETHER WE SHOULD USE TREEFAM A OR TREEFAM B: | |
$db = $ARGV[0]; | |
#------------------------------------------------------------------# | |
# FIND THE SPECIES OF EACH TREEFAM GENE: | |
# HASH TABLES TO REMEMBER THE SPECIES OF GENES: | |
%SPECIES = (); | |
$database = 'treefam_3'; | |
$dbh = DBI->connect("dbi:mysql:treefam_3:db.treefam.org:3308", 'anonymous', '') || return; | |
# SPECIFY THE TABLE: | |
$table_w = 'genes'; | |
# THE FIRST COLUMN IN THIS TABLE IS AN IDENTIFIER (IDX), AND | |
# THE SECOND COLUMN IS THE TRANSCRIPT NAME, AND THE LAST COLUMN | |
# IS THE TAXONOMY ID, eg. 1 ENSANGT00000032162.1 7165 | |
$st = "SELECT ID, TAX_ID from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) { | |
$ID = $array[0]; # eg., ENST00000356572.1 | |
$TAXID = $array[1]; # eg 7165 | |
if ($TAXID eq '9606') { $SPECIES{$ID} = 'HS';} # HOMO SAPIENS | |
elsif ($TAXID eq '7227') { $SPECIES{$ID} = 'DM';} # DROSOPHILA MELANOGASTER | |
elsif ($TAXID eq '6239') { $SPECIES{$ID} = 'CE';} # CAENORHABDITIS ELEGANS | |
elsif ($TAXID eq '9031') { $SPECIES{$ID} = 'GG';} # GALLUS GALLUS | |
elsif ($TAXID eq '10116'){ $SPECIES{$ID} = 'RN';} # RATTUS NORVEGICUS | |
elsif ($TAXID eq '10090'){ $SPECIES{$ID} = 'MM';} # MUS MUSCULUS | |
elsif ($TAXID eq '6183') { $SPECIES{$ID} = 'SM';} # SCHISTOSOMA MANSONI | |
} | |
} | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
#------------------------------------------------------------------# | |
# GET THE NAMES OF ALL THE TREEFAM FAMILIES AND THE GENES THAT ARE IN | |
# THEM FROM THE MYSQL DATABASE: | |
%NUM = (); | |
$database = 'treefam_3'; | |
$dbh = DBI->connect("dbi:mysql:treefam_3:db.treefam.org:3308", 'anonymous', '') || return; | |
# SPECIFY THE TABLE: | |
if ($db eq 'A') # LOOK AT TREEFAM-A: | |
{ | |
$table_w = 'famA_gene'; | |
} | |
elsif ($db eq 'B') # LOOK AT TREEFAM-B: | |
{ | |
$table_w = 'famB_gene'; | |
} | |
else { print STDERR "ERROR: db is $db.\n"; exit;} | |
# THE FIRST THREE COLUMNS IN THE TABLE famB_gene/famA_gene ARE THE TRANSCRIPT NAME, FAMILY NAME AND WHETHER THE | |
# TRANSCRIPT IS IN THE SEED/FULL TREE: | |
# # eg., ENSMUST00000049178.2 TF105085 FULL | |
$st = "SELECT ID, AC, FLAG from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) { | |
$ID = $array[0]; # eg., ENSMUST00000049178.2 | |
$AC = $array[1]; # eg., TF105085 | |
$FLAG = $array[2]; # eg., FULL OR BOTH | |
if ($FLAG eq 'FULL' || $FLAG eq 'BOTH') | |
{ | |
if ($SPECIES{$ID}) | |
{ | |
$species = $SPECIES{$ID}; | |
$key = $AC."_".$species; | |
if (!($NUM{$key})) { $NUM{$key} = 1; } | |
else { $NUM{$key}++; } | |
} | |
} | |
} | |
} | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
#------------------------------------------------------------------# | |
# FIND FAMILIES THAT HAVE MANY SCHISTOSOMA MANSONI GENES, BUT | |
# JUST ONE OR TWO GENES IN OTHER SPECIES: | |
$database = 'treefam_3'; | |
$dbh = DBI->connect("dbi:mysql:treefam_3:db.treefam.org:3308", 'anonymous', '') || return; | |
# SPECIFY THE TABLE: | |
if ($db eq 'A') # Use TreeFam-A | |
{ | |
$table_w = 'familyA'; | |
} | |
elsif ($db eq 'B') # Use TreeFam-B | |
{ | |
$table_w = 'familyB'; | |
} | |
else { print STDERR "ERROR: db is $db.\n"; exit;} | |
# THE FIRST COLUMN IN THE TABLE IS THE FAMILY ACCESSION NUMBER eg. TF105036: | |
$st = "SELECT AC from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$AC = $array[0]; | |
$HS = 0; $DM = 0; $CE = 0; $GG = 0; $RN = 0; $MM = 0; $SM = 0; | |
$key = $AC."_HS"; if ($NUM{$key}) { $HS = $NUM{$key};} | |
$key = $AC."_DM"; if ($NUM{$key}) { $DM = $NUM{$key};} | |
$key = $AC."_CE"; if ($NUM{$key}) { $CE = $NUM{$key};} | |
$key = $AC."_GG"; if ($NUM{$key}) { $GG = $NUM{$key};} | |
$key = $AC."_RN"; if ($NUM{$key}) { $RN = $NUM{$key};} | |
$key = $AC."_MM"; if ($NUM{$key}) { $MM = $NUM{$key};} | |
$key = $AC."_SM"; if ($NUM{$key}) { $SM = $NUM{$key};} | |
# IF THERE ARE >= 3 SCHISTOSOMA GENES AND LESS THAN OR EQUAL TO TWO GENES IN MOST | |
# OF THE OTHER SPECIES: | |
if ($SM >= 3 && ($HS == 1 || $HS == 2) && ($DM == 1 || $DM == 2) && | |
($CE == 1 || $CE == 2) && ($GG == 1 || $GG == 2) && | |
($RN == 1 || $RN == 2) && ($MM == 1 || $MM == 2)) | |
{ | |
print "$AC\n"; | |
} | |
# IF THERE ARE >= 2 SCHISTOSOMA GENES AND ONE GENE IN MOST OTHER SPECIES: | |
elsif ($SM == 2 && $HS == 1 && $DM == 1 && $CE == 1 && $GG == 1 && $RN == 1 && $MM == 1) | |
{ | |
print "$AC\n"; | |
} | |
} | |
} | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment