Created
March 1, 2013 14:00
-
-
Save avrilcoghlan/5064833 to your computer and use it in GitHub Desktop.
Perl script that, given a list of Schistosoma mansoni genes, connects to the TreeFam database to find out which families they are in
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script find_treefam_for_schisto_gene2.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 21-Mar-09. | |
# | |
# This perl script reads in a list of Schistosoma genes, and connects | |
# to the TreeFam mysql database to find out which family they are in. | |
# | |
# The command-line format is: | |
# % perl <find_treeFam_for_schisto_gene2.pl> version list | |
# where version is the version of the TreeFam database to use, | |
# list is the list of Schistosoma genes. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 2) | |
{ | |
print "Usage of find_treefam_for_schisto_gene2.pl\n\n"; | |
print "perl find_treefam_for_schisto_gene2.pl <version> <list>\n"; | |
print "where <version> is the version of the TreeFam database to use,\n"; | |
print " <list> is the list of Schistosoma genes.\n"; | |
print "For example, >perl find_treefam_for_schisto_gene2.pl 7 Table20.txt\n"; | |
exit; | |
} | |
# DECLARE MYSQL USERNAME AND HOST: | |
use DBI; | |
# FIND THE VERSION OF THE TREEFAM DATABASE TO USE: | |
$version = $ARGV[0]; | |
# FIND THE NAME OF THE INPUT LIST OF SCHISTOSOMA GENES: | |
$list = $ARGV[1]; | |
#------------------------------------------------------------------# | |
# CONNECT TO THE TREEFAM DATABASE: | |
$database = "dbi:mysql:treefam_".$version.":db.treefam.org:3308"; | |
$dbh = DBI->connect("$database", 'anonymous', '') || return | |
# READ IN THE LIST OF SCHISTOSOMA GENES: | |
print STDERR "Opening file $list...\n"; | |
open(LIST,"$list") || die "ERROR: cannot open $list\n"; | |
while(<LIST>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\s+/,$line); | |
$schisto = $temp[0]; | |
# GET THE TREEFAM FAMILY FOR THIS SCHISTOSOMA TRANSCRIPT: | |
$fam = &get_fam($schisto,$dbh); | |
@fams = split(/\,/,$fam); | |
@fams = sort @fams; | |
if ($#fams > 0) { print STDERR "WARNING: $schisto: several families ($fam)\n";} | |
print "$schisto"; | |
for ($i = 0; $i <= $#fams; $i++) | |
{ | |
if ($fams[$i] ne 'none') | |
{ | |
$link = "http://www.treefam.org/cgi-bin/TFinfo.pl?ac=".$fams[$i]; | |
$hyperlink = "=HYPERLINK(\"$link\",\"$fams[$i]\")"; | |
print " $hyperlink"; | |
} | |
} | |
if ($#fams == 0 && $fams[0] eq 'none') { print " none";} | |
print "\n"; | |
} | |
close(LIST); | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# GET THE TREEFAM FAMILY FOR A SCHISTOSOMA TRANSCRIPT: | |
sub get_fam | |
{ | |
my $transcript = $_[0]; | |
my $dbh = $_[1]; | |
my $table_w; | |
my $gene; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $fam = 'none'; | |
my $fam2 = 'none'; | |
my @temp; | |
my $transcript2; | |
my $gene2; | |
my %HAVE = (); | |
@temp = split(/\./,$transcript); # eg. Smp_028440.1 | |
$gene = $temp[0]; # eg. Smp_028440 | |
$table_w = 'fam_genes'; | |
$st = "SELECT AC, ID from $table_w WHERE ID LIKE \'$gene\%\'"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$transcript2 = $array[1]; | |
@temp = split(/\./,$transcript2); | |
$gene2 = $temp[0]; | |
if ($gene2 eq $gene) | |
{ | |
# RECORD THE FAMILIES THAT THIS GENE APPEARS IN: | |
if (!($HAVE{$array[0]})) # IF WE HAVEN'T ALREADY RECORDED THIS FAMILY FOR THIS GENE: | |
{ | |
$HAVE{$array[0]} = 1; | |
# DIFFERENT TRANSCRIPTS OF A GENE CAN HAVE BEST MATCHES TO DIFFERENT FAMILIES: | |
if ($fam eq 'none') { $fam = $array[0]; } | |
else | |
{ | |
$fam = $fam.",".$array[0]; | |
} | |
} | |
# RECORD THE FAMILIES FOR THIS TRANSCRIPT: | |
if ($transcript2 eq $transcript) | |
{ | |
if ($fam2 eq 'none') { $fam2 = $array[0]; } | |
else | |
{ | |
$fam2 = $fam2.",".$array[0]; | |
} | |
} | |
} | |
} | |
} | |
# CHECK IF WE FOUND A FAMILY FOR THIS TRANSCRIPT: | |
if ($fam2 ne 'none') | |
{ | |
return($fam2); | |
} | |
else | |
{ | |
return($fam); | |
} | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment