Created
March 1, 2013 14:25
-
-
Save avrilcoghlan/5064977 to your computer and use it in GitHub Desktop.
Perl script that retrieves the full nucleotide alignments for TreeFam families, and stores them in a Perl pickle
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script store_treefam_full_nt_alns.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 3-Apr-09. | |
# | |
# For the TreeFam project. | |
# | |
# This perl script connects to the TreeFam database and stores | |
# the full nucleotide alignments in a pickle. | |
# | |
# The command-line format is: | |
# % perl <store_treefam_full_nt_alns.pl> version | |
# where version is the version of the TreeFam database to use. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 1) | |
{ | |
print "Usage of store_treefam_full_nt_alns.pl\n\n"; | |
print "perl store_treefam_full_nt_alns.pl <version>\n"; | |
print "where <version> is the version of the TreeFam database to use.\n"; | |
print "For example, >perl -w store_treefam_full_nt_alns.pl 7\n"; | |
exit; | |
} | |
# FIND THE RELEASE OF TREEFAM TO USE: | |
$version = $ARGV[0]; | |
# READ IN MY PERL MODULES: | |
use Avril_modules; | |
use Treefam::DBConnection; | |
use DBI; | |
use Storable; | |
$VERBOSE = 0; | |
#------------------------------------------------------------------# | |
# GET A LIST OF ALL FAMILIES: | |
$cnt = 0; | |
$database = "dbi:mysql:treefam_".$version.":db.treefam.org:3308"; | |
$dbh = DBI->connect("$database", 'anonymous', '') || return; | |
$table_w = 'familyA'; | |
$st = "SELECT AC from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$cnt++; | |
$AC = $array[0]; | |
@families = (@families,$AC); | |
print "$cnt: Got $AC\n"; | |
} | |
} | |
print "Read in list of TreeFam-A families...\n"; | |
# GET A LIST OF ALL TREEFAM-B FAMILIES: | |
$dbh = DBI->connect("$database", 'anonymous', '') || return; | |
$table_w = 'familyB'; | |
$st = "SELECT AC from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$cnt++; | |
$AC = $array[0]; | |
@families = (@families,$AC); | |
print "$cnt: Got $AC\n"; | |
} | |
} | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
print "Read in list of TreeFam-B families...\n"; | |
# STORE THE FULL NUCLEOTIDE ALIGNMENTS IN A PICKLE FILE: | |
$dircnt = 1; | |
$outputdir = "alignments".$dircnt; | |
system "mkdir $outputdir" || die "ERROR: cannot make directory $outputdir\n"; | |
%ALNDIR = (); | |
$dbc = Treefam::DBConnection->new(); | |
for ($i = 0; $i <= $#families; $i++) | |
{ | |
$treefam_family = $families[$i]; | |
$cnt = $i + 1; | |
print "$cnt: looking at $treefam_family\n"; | |
if ($cnt % 500 == 0) | |
{ | |
$dircnt++; | |
$outputdir = "alignments".$dircnt; | |
system "mkdir $outputdir" || die "ERROR: cannot make directory $outputdir\n"; | |
} | |
if ($treefam_family eq '') { print STDERR "ERROR: treefam_family $treefam_family\n"; exit;} | |
$famh = $dbc->get_FamilyHandle(); | |
if (!(defined($famh->get_by_id($treefam_family)))) | |
{ | |
print "WARNING: there is no family $treefam_family...\n"; | |
goto NEXT_FAMILY; | |
} | |
$family = $famh->get_by_id($treefam_family); | |
if (!(defined($family->ID()))) | |
{ | |
print "WARNING: do not have ID stored for family $treefam_family...\n"; | |
goto NEXT_FAMILY; | |
} | |
$AC = $family->ID(); # GET THE FAMILY ID. | |
if ($AC ne $treefam_family) { print STDERR "ERROR: AC $AC treefam_family $treefam_family\n"; exit;} | |
# GET THE FULL TREE FOR THE FAMILY: | |
if (!(defined($family->get_tree('full')))) | |
{ | |
print "WARNING: AC $AC: there is no full tree!\n"; | |
} | |
else | |
{ | |
$fulltree = $family->get_tree('full'); | |
# GET THE NUCLEOTIDE ALIGNMENT FOR THIS FULL TREE: | |
if (!(defined($fulltree->get_alignment('nt')))) | |
{ | |
print "WARNING: AC $AC: there is no alignment!\n"; | |
goto NEXT_FAMILY; | |
} | |
$aln = $fulltree->get_alignment('nt'); | |
# WRITE THE ALIGNMENT TO A FILE: | |
$outputfile = $outputdir."/".$AC.".aln"; | |
system "rm -f $outputfile"; | |
open(OUT,">$outputfile") || die "ERROR: cannot open $outputfile\n"; | |
print OUT "$aln\n"; | |
close(OUT); | |
# ZIP UP THE FILE: | |
system "gzip $outputfile"; | |
system "rm -f $outputfile"; | |
# RECORD THE DIRECTORY THE ALIGNMENT WAS PUT INTO: | |
if ($ALNDIR{$AC}) { print STDERR "ERROR: already have alndir for $AC\n"; exit;} | |
$ALNDIR{$AC} = $outputdir; | |
} | |
NEXT_FAMILY: | |
} | |
print STDERR "Read in trees...\n"; | |
# STORE THE HASH TABLE %ALNDIR IN A PICKLE: | |
$output1 = "treefam.".$version."_fullntalnsdir"; | |
store \%ALNDIR,$output1; | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment