Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created March 1, 2013 14:25
Show Gist options
  • Save avrilcoghlan/5064977 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/5064977 to your computer and use it in GitHub Desktop.
Perl script that retrieves the full nucleotide alignments for TreeFam families, and stores them in a Perl pickle
#!/usr/local/bin/perl
#
# Perl script store_treefam_full_nt_alns.pl
# Written by Avril Coghlan ([email protected])
# 3-Apr-09.
#
# For the TreeFam project.
#
# This perl script connects to the TreeFam database and stores
# the full nucleotide alignments in a pickle.
#
# The command-line format is:
# % perl <store_treefam_full_nt_alns.pl> version
# where version is the version of the TreeFam database to use.
#
#------------------------------------------------------------------#
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:
$num_args = $#ARGV + 1;
if ($num_args != 1)
{
print "Usage of store_treefam_full_nt_alns.pl\n\n";
print "perl store_treefam_full_nt_alns.pl <version>\n";
print "where <version> is the version of the TreeFam database to use.\n";
print "For example, >perl -w store_treefam_full_nt_alns.pl 7\n";
exit;
}
# FIND THE RELEASE OF TREEFAM TO USE:
$version = $ARGV[0];
# READ IN MY PERL MODULES:
use Avril_modules;
use Treefam::DBConnection;
use DBI;
use Storable;
$VERBOSE = 0;
#------------------------------------------------------------------#
# GET A LIST OF ALL FAMILIES:
$cnt = 0;
$database = "dbi:mysql:treefam_".$version.":db.treefam.org:3308";
$dbh = DBI->connect("$database", 'anonymous', '') || return;
$table_w = 'familyA';
$st = "SELECT AC from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$cnt++;
$AC = $array[0];
@families = (@families,$AC);
print "$cnt: Got $AC\n";
}
}
print "Read in list of TreeFam-A families...\n";
# GET A LIST OF ALL TREEFAM-B FAMILIES:
$dbh = DBI->connect("$database", 'anonymous', '') || return;
$table_w = 'familyB';
$st = "SELECT AC from $table_w";
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
if ($rv >= 1)
{
while ((@array) = $sth->fetchrow_array)
{
$cnt++;
$AC = $array[0];
@families = (@families,$AC);
print "$cnt: Got $AC\n";
}
}
$rc = $dbh->disconnect();
$rc = "";
print "Read in list of TreeFam-B families...\n";
# STORE THE FULL NUCLEOTIDE ALIGNMENTS IN A PICKLE FILE:
$dircnt = 1;
$outputdir = "alignments".$dircnt;
system "mkdir $outputdir" || die "ERROR: cannot make directory $outputdir\n";
%ALNDIR = ();
$dbc = Treefam::DBConnection->new();
for ($i = 0; $i <= $#families; $i++)
{
$treefam_family = $families[$i];
$cnt = $i + 1;
print "$cnt: looking at $treefam_family\n";
if ($cnt % 500 == 0)
{
$dircnt++;
$outputdir = "alignments".$dircnt;
system "mkdir $outputdir" || die "ERROR: cannot make directory $outputdir\n";
}
if ($treefam_family eq '') { print STDERR "ERROR: treefam_family $treefam_family\n"; exit;}
$famh = $dbc->get_FamilyHandle();
if (!(defined($famh->get_by_id($treefam_family))))
{
print "WARNING: there is no family $treefam_family...\n";
goto NEXT_FAMILY;
}
$family = $famh->get_by_id($treefam_family);
if (!(defined($family->ID())))
{
print "WARNING: do not have ID stored for family $treefam_family...\n";
goto NEXT_FAMILY;
}
$AC = $family->ID(); # GET THE FAMILY ID.
if ($AC ne $treefam_family) { print STDERR "ERROR: AC $AC treefam_family $treefam_family\n"; exit;}
# GET THE FULL TREE FOR THE FAMILY:
if (!(defined($family->get_tree('full'))))
{
print "WARNING: AC $AC: there is no full tree!\n";
}
else
{
$fulltree = $family->get_tree('full');
# GET THE NUCLEOTIDE ALIGNMENT FOR THIS FULL TREE:
if (!(defined($fulltree->get_alignment('nt'))))
{
print "WARNING: AC $AC: there is no alignment!\n";
goto NEXT_FAMILY;
}
$aln = $fulltree->get_alignment('nt');
# WRITE THE ALIGNMENT TO A FILE:
$outputfile = $outputdir."/".$AC.".aln";
system "rm -f $outputfile";
open(OUT,">$outputfile") || die "ERROR: cannot open $outputfile\n";
print OUT "$aln\n";
close(OUT);
# ZIP UP THE FILE:
system "gzip $outputfile";
system "rm -f $outputfile";
# RECORD THE DIRECTORY THE ALIGNMENT WAS PUT INTO:
if ($ALNDIR{$AC}) { print STDERR "ERROR: already have alndir for $AC\n"; exit;}
$ALNDIR{$AC} = $outputdir;
}
NEXT_FAMILY:
}
print STDERR "Read in trees...\n";
# STORE THE HASH TABLE %ALNDIR IN A PICKLE:
$output1 = "treefam.".$version."_fullntalnsdir";
store \%ALNDIR,$output1;
#------------------------------------------------------------------#
print STDERR "FINISHED.\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment