Skip to content

Instantly share code, notes, and snippets.

@munky69rock
Last active August 29, 2015 14:14
Show Gist options
  • Select an option

  • Save munky69rock/84c26a3e0c3f8f2cc24d to your computer and use it in GitHub Desktop.

Select an option

Save munky69rock/84c26a3e0c3f8f2cc24d to your computer and use it in GitHub Desktop.
#!/usr/bin/env perl
use strict;
use warnings;
# perser for http://www.phylotree.org/
use Encode qw(encode decode);
use HTML::TreeBuilder;
use JSON::XS;
my $NO_BREAK_SPACE = chr(194).chr(160);
my $PATTERN = qr/
\(? # unstable
(?:
# ex: G1438A
[atgcATGC] # ancestry
\d+ # position
[atgcATGC] # descendant
|
# ex: C459d
[atgcATGC]?
\d+
d # deletion
|
# ex: 960.XC
[atgcATGC]?
\d+
\.[\dX]?[atgcATGC]* # insertion
d? # deletion
|
# ex: 59-60d
\d+\-\d+d # range
|
reserved
)
!* # reversion
\)?
/x;
my $BRANCH_NAME = '__BRANCH__';
my $SELF_NODE_NAME = '__SELF__';
sub is_branch_condition {
my $s = shift;
return $s =~ /\A$PATTERN(?:\s$PATTERN)*\z/;
}
sub trim_white_space {
my $s = shift;
$s =~ s/^\s+//g;
$s =~ s/\s+$//g;
$s =~ s/$NO_BREAK_SPACE*//g;
return $s;
}
sub remove_unnecessary_elements {
my $html = shift;
$html =~ s/ //g;
return $html;
}
sub get_utf8_file_contents {
my $file = shift;
open my $in, '<', $file or die $!;
my $html = "";
while (my $l = <$in>) {
$html .= $l;
}
# encode from Windows_1252 to UTF-8
return encode('utf8', decode('cp1252', $html));
}
sub get_deep_hash {
my ($hash, $array, $itr) = @_;
my $idx = shift @$itr;
my $key = $array->[$idx];
$hash->{$key} = {} unless exists $hash->{$key};
my $next_hash = $hash->{$key};
return $next_hash unless @$itr;
return get_deep_hash($next_hash, $array, $itr);
}
sub parse_phylotree_html {
my $html = shift;
my $parser = HTML::TreeBuilder->new_from_content($html);
my $skip_flg = 1;
my @tree = ();
my %tree = ();
my $prev_depth = 0;
for my $table ($parser->find("table")) {
for my $tr ($table->find("tr")) {
if ($tr->as_text =~ /mt\-MRCA/) {
$skip_flg = 0;
next;
}
next if $skip_flg;
my $len = 0;
my @col = ();
my $branch = "";
my @example_accessions = ();
my @conditions = ();
my @haplo_types = ();
my $depth = 0;
my $i = 0;
for my $td ($tr->find("td")) {
my $text = trim_white_space($td->as_text);
my $l = length $text;
$len += $l;
push @col, $text;
if ($l > 0) {
if (is_branch_condition($text)) {
@conditions = split / /, $text;
$depth = $i - 1;
} else {
push @haplo_types, $text;
}
}
$i++;
}
# skip blank rows
next unless $len;
my %example_accessions = ();
for my $idx (-2, -1) {
my $id = $col[$idx];
if ($id) {
push @example_accessions, $id;
$example_accessions{$id} = 1;
}
}
for my $id (@haplo_types) {
if ($branch && !exists $example_accessions{$id}) {
warn join("\t", @conditions, @haplo_types);
die 'something wrong';
}
if (!exists $example_accessions{$id}) {
if ($id =~ /\A$PATTERN\z/) {
warn $id;
}
$branch = $id;
}
}
$branch = $BRANCH_NAME unless $branch;
$tree[$depth] = $branch;
my $node = get_deep_hash(\%tree, \@tree, [0 .. $depth]);
$node->{$SELF_NODE_NAME} = +{
conditions => \@conditions,
example_accessions => \@example_accessions,
};
}
}
return \%tree;
}
sub prettify_tree {
my $tree = shift;
my $pretty_tree = +{};
my $self = $tree->{$SELF_NODE_NAME};
my $descendants = +{};
for my $key (grep { $_ ne $SELF_NODE_NAME } keys %$tree ) {
$descendants->{$key} = prettify_tree($tree->{$key});
}
$pretty_tree = +{
($self ? (conditions => $self->{conditions}) : ()),
($self ? (example_accessions => $self->{example_accessions} || []) : ()),
(keys %$descendants ? (descendants => $descendants)
: ()),
};
return $pretty_tree;
}
sub prettify_tree_to_array {
my $tree = shift;
my $pretty_tree = shift || [];
my $parent = shift || [];
my $descendants = $tree->{descendants};
for my $key (keys %$descendants) {
my $self = $descendants->{$key};
my $current = [@$parent, +{ name => $key, conditions => $self->{conditions} }];
push @$pretty_tree, $current;
if ($self->{descendants}) {
prettify_tree_to_array($self, $pretty_tree, $current);
}
}
return $pretty_tree;
}
sub main {
# mtDNA\ tree\ Build\ 16.htm
my $input_file = shift @ARGV or die "no file specified.";
die "$input_file not found." unless -e $input_file;
my $html = get_utf8_file_contents($input_file);
$html = remove_unnecessary_elements($html);
my $tree = parse_phylotree_html($html);
my $pretty_tree = prettify_tree($tree);
print JSON::XS->new->pretty(1)->encode($pretty_tree);
}
main();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment