Last active
August 29, 2015 14:14
-
-
Save munky69rock/84c26a3e0c3f8f2cc24d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env perl | |
| use strict; | |
| use warnings; | |
| # perser for http://www.phylotree.org/ | |
| use Encode qw(encode decode); | |
| use HTML::TreeBuilder; | |
| use JSON::XS; | |
| my $NO_BREAK_SPACE = chr(194).chr(160); | |
| my $PATTERN = qr/ | |
| \(? # unstable | |
| (?: | |
| # ex: G1438A | |
| [atgcATGC] # ancestry | |
| \d+ # position | |
| [atgcATGC] # descendant | |
| | | |
| # ex: C459d | |
| [atgcATGC]? | |
| \d+ | |
| d # deletion | |
| | | |
| # ex: 960.XC | |
| [atgcATGC]? | |
| \d+ | |
| \.[\dX]?[atgcATGC]* # insertion | |
| d? # deletion | |
| | | |
| # ex: 59-60d | |
| \d+\-\d+d # range | |
| | | |
| reserved | |
| ) | |
| !* # reversion | |
| \)? | |
| /x; | |
| my $BRANCH_NAME = '__BRANCH__'; | |
| my $SELF_NODE_NAME = '__SELF__'; | |
| sub is_branch_condition { | |
| my $s = shift; | |
| return $s =~ /\A$PATTERN(?:\s$PATTERN)*\z/; | |
| } | |
| sub trim_white_space { | |
| my $s = shift; | |
| $s =~ s/^\s+//g; | |
| $s =~ s/\s+$//g; | |
| $s =~ s/$NO_BREAK_SPACE*//g; | |
| return $s; | |
| } | |
| sub remove_unnecessary_elements { | |
| my $html = shift; | |
| $html =~ s/ //g; | |
| return $html; | |
| } | |
| sub get_utf8_file_contents { | |
| my $file = shift; | |
| open my $in, '<', $file or die $!; | |
| my $html = ""; | |
| while (my $l = <$in>) { | |
| $html .= $l; | |
| } | |
| # encode from Windows_1252 to UTF-8 | |
| return encode('utf8', decode('cp1252', $html)); | |
| } | |
| sub get_deep_hash { | |
| my ($hash, $array, $itr) = @_; | |
| my $idx = shift @$itr; | |
| my $key = $array->[$idx]; | |
| $hash->{$key} = {} unless exists $hash->{$key}; | |
| my $next_hash = $hash->{$key}; | |
| return $next_hash unless @$itr; | |
| return get_deep_hash($next_hash, $array, $itr); | |
| } | |
| sub parse_phylotree_html { | |
| my $html = shift; | |
| my $parser = HTML::TreeBuilder->new_from_content($html); | |
| my $skip_flg = 1; | |
| my @tree = (); | |
| my %tree = (); | |
| my $prev_depth = 0; | |
| for my $table ($parser->find("table")) { | |
| for my $tr ($table->find("tr")) { | |
| if ($tr->as_text =~ /mt\-MRCA/) { | |
| $skip_flg = 0; | |
| next; | |
| } | |
| next if $skip_flg; | |
| my $len = 0; | |
| my @col = (); | |
| my $branch = ""; | |
| my @example_accessions = (); | |
| my @conditions = (); | |
| my @haplo_types = (); | |
| my $depth = 0; | |
| my $i = 0; | |
| for my $td ($tr->find("td")) { | |
| my $text = trim_white_space($td->as_text); | |
| my $l = length $text; | |
| $len += $l; | |
| push @col, $text; | |
| if ($l > 0) { | |
| if (is_branch_condition($text)) { | |
| @conditions = split / /, $text; | |
| $depth = $i - 1; | |
| } else { | |
| push @haplo_types, $text; | |
| } | |
| } | |
| $i++; | |
| } | |
| # skip blank rows | |
| next unless $len; | |
| my %example_accessions = (); | |
| for my $idx (-2, -1) { | |
| my $id = $col[$idx]; | |
| if ($id) { | |
| push @example_accessions, $id; | |
| $example_accessions{$id} = 1; | |
| } | |
| } | |
| for my $id (@haplo_types) { | |
| if ($branch && !exists $example_accessions{$id}) { | |
| warn join("\t", @conditions, @haplo_types); | |
| die 'something wrong'; | |
| } | |
| if (!exists $example_accessions{$id}) { | |
| if ($id =~ /\A$PATTERN\z/) { | |
| warn $id; | |
| } | |
| $branch = $id; | |
| } | |
| } | |
| $branch = $BRANCH_NAME unless $branch; | |
| $tree[$depth] = $branch; | |
| my $node = get_deep_hash(\%tree, \@tree, [0 .. $depth]); | |
| $node->{$SELF_NODE_NAME} = +{ | |
| conditions => \@conditions, | |
| example_accessions => \@example_accessions, | |
| }; | |
| } | |
| } | |
| return \%tree; | |
| } | |
| sub prettify_tree { | |
| my $tree = shift; | |
| my $pretty_tree = +{}; | |
| my $self = $tree->{$SELF_NODE_NAME}; | |
| my $descendants = +{}; | |
| for my $key (grep { $_ ne $SELF_NODE_NAME } keys %$tree ) { | |
| $descendants->{$key} = prettify_tree($tree->{$key}); | |
| } | |
| $pretty_tree = +{ | |
| ($self ? (conditions => $self->{conditions}) : ()), | |
| ($self ? (example_accessions => $self->{example_accessions} || []) : ()), | |
| (keys %$descendants ? (descendants => $descendants) | |
| : ()), | |
| }; | |
| return $pretty_tree; | |
| } | |
| sub prettify_tree_to_array { | |
| my $tree = shift; | |
| my $pretty_tree = shift || []; | |
| my $parent = shift || []; | |
| my $descendants = $tree->{descendants}; | |
| for my $key (keys %$descendants) { | |
| my $self = $descendants->{$key}; | |
| my $current = [@$parent, +{ name => $key, conditions => $self->{conditions} }]; | |
| push @$pretty_tree, $current; | |
| if ($self->{descendants}) { | |
| prettify_tree_to_array($self, $pretty_tree, $current); | |
| } | |
| } | |
| return $pretty_tree; | |
| } | |
| sub main { | |
| # mtDNA\ tree\ Build\ 16.htm | |
| my $input_file = shift @ARGV or die "no file specified."; | |
| die "$input_file not found." unless -e $input_file; | |
| my $html = get_utf8_file_contents($input_file); | |
| $html = remove_unnecessary_elements($html); | |
| my $tree = parse_phylotree_html($html); | |
| my $pretty_tree = prettify_tree($tree); | |
| print JSON::XS->new->pretty(1)->encode($pretty_tree); | |
| } | |
| main(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment