Created
March 1, 2013 16:26
-
-
Save avrilcoghlan/5065796 to your computer and use it in GitHub Desktop.
Perl script that connects to the TreeFam mysql database, and parses the trees using Bioperl.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script parse_treefam_bioperl.pl | |
# Written by Avril Coghlan ([email protected]). | |
# 23-Feb-06. | |
# | |
# For the TreeFam project. | |
# | |
# This perl script connects to the MYSQL database of TreeFam families and | |
# parses the trees using Bioperl. | |
# | |
# The command-line format is: | |
# % perl <parse_treefam_bioperl.pl> tree_type db | |
# where tree_type is the type of tree to look at (CLEAN/FULL/SEED/all), | |
# db is the TreeFam database to look at (A/B/all). | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 2) | |
{ | |
print "Usage of parse_treefam_bioperl.pl <tree_type> <db>\n\n"; | |
print "where <tree_type> is the type of tree to look at (CLEAN/FULL/SEED/all),\n"; | |
print " <db> is the TreeFam database to look at (A/B/all).\n"; | |
print "perl -w parse_treefam_bioperl.pl\n"; | |
print "For example, >perl -w parse_treefam_bioperl.pl SEED A\n"; | |
exit; | |
} | |
# DECLARE MYSQL USERNAME AND HOST: | |
use DBI; | |
use lib "/nfs/disk100/wormpub2/Avril/downloadnov05/bioperl/bioperl-1.5.1/"; | |
use Bio::TreeIO; | |
use IO::String; | |
# FIND THE TYPE OF TREE TO LOOK AT: | |
$tree_type = $ARGV[0]; | |
if ($tree_type ne 'CLEAN' && $tree_type ne 'FULL' && $tree_type ne 'SEED' && $tree_type ne 'all') | |
{ | |
print STDERR "ERROR: tree_type should be CLEAN/FULL/SEED/all.\n"; | |
exit; | |
} | |
# FIND THE TREEFAM DATABASE TO LOOK AT: | |
$db = $ARGV[1]; | |
if ($db ne 'A' && $db ne 'B') | |
{ | |
print STDERR "ERROR: db should be A/B/all.\n"; | |
exit; | |
} | |
#------------------------------------------------------------------# | |
# GET THE TREEFAM-A TREES FROM THE MYSQL DATABASE: | |
$dbh = DBI->connect("dbi:mysql:treefam_2:db.treefam.org:3308", 'anonymous', '') || return; | |
$table_w = 'trees'; | |
# THIS TABLE HAS THE ACCESSION NUMBER, TYPE OF TREE (SEED/FULL/WHOLE/CLEAN) and the TREE ITSELF (NEWICK FORMAT). | |
$st = "SELECT AC, TYPE, TREE from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) { | |
$AC = $array[0]; # eg., TF101001 | |
$TYPE = $array[1]; # eg., SEED | |
$TREE = $array[2]; # THE TREE IN NEWICK FORMAT. | |
# NOW READ IN THE NEWICK FORMAT TREE: | |
if ($AC eq '') { print STDERR "ERROR: AC is $AC.\n"; exit;} | |
if ($TYPE eq '') { print STDERR "ERROR: TYPE is $TYPE.\n"; exit;} | |
# CHECK THAT THIS IS THE TYPE OF TREE (CLEAN/FULL/SEED/all) THAT WE WANT TO LOOK AT: | |
if ($TYPE eq $tree_type || $tree_type eq 'all') | |
{ | |
# CHECK THAT THIS TREE COMES FROM THE TREEFAM DATABASE THAT WE WANT TO LOOK AT (A/B/all): | |
if ((substr($AC,0,3) eq 'TF1' && $db eq 'A') || (substr($AC,0,3) eq 'TF3' && $db eq 'B') || $db eq 'all') | |
{ | |
# CHECK THE NHX FORMAT OF THE TREE IS FINE: | |
$TREE = &check_nhx_format($TREE,$AC,$TYPE); | |
# PARSE THE TREE USING THE BIOPERL TREE PARSER: | |
if ($TREE ne 'none') # IF THE FORMAT OF THE TREE IS OK. | |
{ | |
$parsed = &read_tree($TREE,$AC,$TYPE); | |
if ($parsed == 1) | |
{ | |
print STDERR "Tree $AC $TYPE parsed correctly...\n"; | |
} | |
} | |
} | |
} | |
} | |
} | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# SUBROUTINE TO CHECK THE NHX FORMAT OF THE TREE, TO SEE IF IT WILL | |
# BE ABLE TO BE READ BY THE BIOPERL TREE PARSER. SPECIFICALLY, THIS | |
# CHECKS TO SEE IF THE TREE CONTAINS ANY INCORRECT SQUARE BRACKETS, ie. | |
# [&&NHX] (WITHOUT ANY TAG INFORMATION). | |
# 23-FEB-06. | |
sub check_nhx_format | |
{ | |
my $tree = $_[0]; #> THE PHYLOGENETIC TREE. | |
my $AC = $_[1]; #> THE TREEFAM ACCESSION NUMBER. | |
my $TYPE = $_[2]; #> THE TYPE OF TREEFAM TREE. | |
my $length = length($tree); #> LENGTH OF $tree. | |
my $i; #> | |
my $take = 1; #> | |
my $char; #> A CHARACTER IN $tree. | |
my $bracket = ""; #> | |
my $new_tree = ""; #> | |
my $seen_semicolon = 0; #> SAYS WHETHER THERE IS A ; AT THE END OF THE TREE. | |
# LOOK AT THE TREE $tree, CHARACTER-BY-CHARACTER: | |
for ($i = 0; $i < $length; $i++) | |
{ | |
$char = substr($tree,$i,1); | |
# CHECK WHETHER THIS IS A SEMICOLON: | |
if ($char eq ';') { $seen_semicolon = 1;} | |
# RECORD THE CHARACTERS WITHIN SQUARE BRACKETS: | |
if ($take == 0 || $char eq '[' || $char eq ']') { $bracket = $bracket.$char;} | |
# CHECK IF THIS IS THE START OR END OF A REGION WITHIN SQUARE BRACKETS: | |
if ($char eq '[') # THE START OF A NHX TAG. | |
{ | |
$take = 0; | |
} | |
elsif ($char eq ']') # THE END OF A NHX TAG. | |
{ | |
$take = 1; | |
# CHECK IF THERE IS TAG INFORMATION WITHIN THE BRACKETED REGION: | |
if ($bracket eq '[&&NHX]') | |
{ | |
$bracket = '[&&NHX:S=Hello]'; | |
} | |
$new_tree = $new_tree.$bracket; | |
$bracket = ""; | |
} | |
# CHECK IF THIS IS A REGION OUTSIDE SQUARE BRACKETS: | |
if ($take == 1 && $char ne ']') | |
{ | |
$new_tree = $new_tree.$char; | |
} | |
} | |
# CHECK WHETHER THE TREE ENDED EARLY: | |
if ($seen_semicolon == 0) | |
{ | |
print STDERR "ERROR: tree $AC $TYPE is truncated.\n"; | |
print "ERROR: tree $AC $TYPE is truncated.\n"; | |
return "none"; | |
} | |
return $new_tree; | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE TO READ IN THE NEWICK FORMAT TREE, AND PARSE IT USING BIOPERL. | |
# 23-FEB-06. | |
sub read_tree | |
{ | |
my $tree = $_[0]; #> THE TREE IN NEWICK FORMAT. | |
my $AC = $_[1]; #> THE ACCESSION NUMBER FOR THE TREEFAM FAMILY. | |
my $TYPE = $_[2]; #> THE TYPE OF TREE (SEED/FULL/CLEAN). | |
my $treefh; #> | |
my $input; #> | |
my $new_tree = ''; #> | |
my $rootnode; #> | |
my $parent; #> | |
my %PARENT = (); #> HASH TABLE OF THE PARENTS OF NODES. | |
my %SPLITS_INTO = (); #> HASH TABLE OF THE DAUGHTERS OF NODES. | |
my %SEQ = (); #> HASH TABLE OF SEQUENCE NAMES FOR LEAVES OF THE TREE. | |
my $node; #> | |
my @nodes; #> | |
my $seq; #> | |
my $PRINT_FORK_DETAILS = 0; #> IF 1, PRINT DETAILS OF THE BRANCHING OF THE TREE. | |
my $nodes; #> | |
my $i; #> | |
# PARSE THE TREE USING BIOPERL. THE TREE IS IN EXTENDED NEWICK FORMAT. | |
$treefh = new IO::String($tree); | |
$input = new Bio::TreeIO(-fh => $treefh, -format => "nhx"); | |
eval { $new_tree = $input->next_tree }; | |
if ($@) | |
{ | |
print STDERR "ERROR: not able to parse tree $AC $TYPE.\n"; | |
print "ERROR: not able to parse tree $AC $TYPE.\n"; | |
return 0; | |
} | |
if ($new_tree eq '') | |
{ | |
print STDERR "ERROR: not able to parse tree $AC $TYPE.\n"; | |
print "ERROR: not able to parse tree $AC $TYPE.\n"; | |
return 0; | |
} | |
# GET THE ROOT NODE OF THE TREE: | |
$rootnode = $new_tree->get_root_node; | |
# GET ALL THE INTERNAL NODES IN THE TREE, AND RECORD THEIR PARENT NODES IN THE HASH | |
# TABLE %PARENT, AND THEIR DESCENDANTS IN THE HASH TABLE %SPLITS_INTO: | |
@nodes = $new_tree->get_nodes(); | |
foreach $node(@nodes) | |
{ | |
if ($node ne $rootnode) | |
{ | |
$parent = $node->ancestor; | |
# RECORD THAT NODE $parent IS THE PARENT NODE OF $node: | |
$PARENT{$node} = $parent; | |
# RECORD THAT $parent HAS $node AS A DESCENDANT: | |
if (!($SPLITS_INTO{$parent})) { $SPLITS_INTO{$parent} = $node; } | |
else { $SPLITS_INTO{$parent} = $SPLITS_INTO{$parent}.",".$node;} | |
# IF THIS IS A LEAF, CHECK WHETHER THIS IS AN OUTGROUP SEQUENCE: | |
if ($node->is_Leaf) | |
{ | |
$seq = $node->id(); | |
$SEQ{$node} = $seq; | |
} | |
} | |
} | |
# SEE IF WE WANT TO PRINT OUT DETAILS OF THE BRANCHING OF THE TREE: | |
if ($PRINT_FORK_DETAILS == 1) | |
{ | |
foreach $parent (keys %SPLITS_INTO) | |
{ | |
print "$parent splits into "; | |
$nodes = $SPLITS_INTO{$parent}; | |
@nodes = split(/\,/,$nodes); | |
for ($i = 0; $i <= $#nodes; $i++) | |
{ | |
$node = $nodes[$i]; | |
if ($SEQ{$node}) { print "$SEQ{$node}, ";} | |
else { print "$node, "; } | |
} | |
print "\n"; | |
} | |
} | |
return 1; | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment