Last active
December 31, 2015 05:39
-
-
Save wakaba/7942028 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# XXX This is a very early prototype. | |
use strict; | |
use warnings; | |
use Path::Class; | |
use lib glob file (__FILE__)->dir->subdir('modules/*/lib'); | |
use Web::DOM::Document; | |
use Algorithm::Diff qw(sdiff); | |
sub HTML_NS () { q<http://www.w3.org/1999/xhtml> } | |
my $ItemElements = { | |
(HTML_NS) => {li => 1, dt => 1, dd => 1, figcaption => 1, summary => 1, | |
legend => 1, caption => 1, td => 1, th => 1, | |
rt => 1, rp => 1, body => 1}, | |
}; | |
my $EditIgnoredElements = { | |
(HTML_NS) => {tbody => 1, thead => 1, tfoot => 1, tr => 1, html => 1}, | |
}; | |
my $NewOnlyElements = { | |
(HTML_NS) => {colgroup => 1, col => 1, head => 1}, | |
}; | |
my $AttrIgnoredElements = { | |
(HTML_NS) => {html => 1, body => 1}, | |
}; | |
my $AttrContentIgnoredElements = { | |
(HTML_NS) => {head => 1}, | |
}; | |
my $IgnoreWSPElements = { | |
(HTML_NS) => {html => 1, head => 1, frameset => 1, | |
ul => 1, ol => 1, dl => 1, select => 1, datalist => 1, | |
optgroup => 1, table => 1, tbody => 1, thead => 1, tfoot => 1, | |
colgroup => 1}, | |
}; | |
my $OpaqueElements = { | |
(HTML_NS) => {head => 1, frameset => 1, script => 1, noscript => 1, | |
noframes => 1, noembed => 1, xmp => 1, plaintext => 1, | |
iframe => 1, object => 1, applet => 1, video => 1, audio => 1, | |
style => 1, select => 1, textarea => 1, embed => 1, | |
br => 1, img => 1, wbr => 1, menuitem => 1, hr => 1, | |
param => 1, source => 1, track => 1, area => 1, | |
dfn => 1, abbr => 1, data => 1, time => 1, | |
input => 1, button => 1, option => 1, optgroup => 1, | |
datalist => 1, keygen => 1, progress => 1, meter => 1, | |
template => 1, bgsound => 1, frame => 1}, | |
# <svg>, <math> | |
}; | |
sub html ($) { | |
my $doc = new Web::DOM::Document; | |
$doc->manakai_is_html (1); | |
$doc->inner_html ($_[0]); | |
return $doc; | |
} | |
sub nodes ($); | |
sub nodes ($) { | |
my $root = shift; | |
my $no_wsp = $IgnoreWSPElements->{$root->namespace_uri // ''}->{$root->local_name // ''}; | |
return ($root, grep { not $no_wsp or not ($_->node_type == 3 and $_->data =~ /^[\x09\x0A\x0D\x0C\x20]*$/) } grep { $_->node_type != $_->DOCUMENT_TYPE_NODE } map { $OpaqueElements->{$_->namespace_uri // ''}->{$_->local_name // ''} ? ($_) : (nodes $_) } @{$root->child_nodes}); | |
} | |
my $doc1 = html q{<p>hoge<em>hoge</em><p>aa<br><div>x<ul><li>a<li>b<li>d<li>e<li>f</li>g<li>h</ul><ruby>aa<rp>b</ruby> | |
<table><colgroup><tr><th>aa<td>bb<tfoot><tr><td>dd</table>}; | |
my $doc2 = html q{<p>fuga<span>hoge</span>aa<br><div>x<ul><li>b<li>c</ul><ruby>aa<rt>bb</ruby><table><col><tr><th>aa<td>bb<td>cc><tr><td>dd}; | |
$doc1 = html q{<p>hoge<script>foo</script><p class=x><img>}; | |
$doc2 = html q{<title></title><p>hoge<script>bar</script><p class=y><img>}; | |
use Encode; | |
$doc1 = html (decode 'utf-8', scalar file (shift)->slurp); | |
$doc2 = html (decode 'utf-8', scalar file (shift)->slurp); | |
warn $doc1->inner_html; | |
warn $doc2->inner_html; | |
my $nodes1 = [nodes $doc1]; | |
my $nodes2 = [nodes $doc2]; | |
my @diff = sdiff $nodes1, $nodes2, sub { | |
my $v = shift; | |
if ($v->node_type == $v->TEXT_NODE) { | |
return join $;, $v->node_type, $v->text_content; | |
} elsif ($v->node_type == $v->ELEMENT_NODE) { | |
if ($AttrContentIgnoredElements->{$v->namespace_uri // ''}->{$v->local_name}) { | |
return join $;, $v->node_type, $v->namespace_uri // '', $v->local_name; | |
} elsif ($AttrIgnoredElements->{$v->namespace_uri // ''}->{$v->local_name}) { | |
return join $;, $v->node_type, $v->namespace_uri // '', $v->local_name; | |
} elsif ($OpaqueElements->{$v->namespace_uri // ''}->{$v->local_name}) { | |
return join $;, $v->node_type, $v->namespace_uri // '', $v->local_name, (sort { $a cmp $b } map { join $;, $_->namespace_uri // '', $_->local_name, $_->value } @{$v->attributes}), $v->inner_html; # XXX attr order | |
} else { | |
return join $;, $v->node_type, $v->namespace_uri // '', $v->local_name, sort { $a cmp $b } map { join $;, $_->namespace_uri // '', $_->local_name, $_->value } @{$v->attributes}; | |
} | |
} elsif ($v->node_type == $v->DOCUMENT_NODE) { | |
return join $;, $v->node_type, $v->node_name; | |
} else { | |
return join $;, $v->node_type, $v->node_name, $v->text_content; | |
} | |
}; | |
sub shownode ($) { | |
my $v = shift; | |
return '' unless $v; | |
if ($v->node_type == $v->TEXT_NODE) { | |
return join ':', $v->node_name, $v->text_content; | |
} elsif ($v->node_type == $v->ELEMENT_NODE) { | |
return join ':', $v->node_name; | |
} elsif ($v->node_type == $v->DOCUMENT_NODE) { | |
return join ':', $v->node_name; | |
} else { | |
return join ':', $v->node_type, $v->node_name, $v->text_content; | |
} | |
} | |
sub clone ($) { | |
if ($OpaqueElements->{$_[0]->namespace_uri // ''}->{$_[0]->local_name // ''}) { | |
return $_[0]->clone_node (1); | |
} else { | |
return $_[0]->clone_node (0); | |
} | |
} | |
my $IsEqual = {}; | |
$IsEqual->{'', ''} = 1; | |
for my $v (@diff) { | |
$IsEqual->{$v->[1], $v->[2]} = 1 if $v->[0] eq 'u'; | |
warn join "\t", (($v->[0] eq 'u' and not $IsEqual->{$v->[1]->parent_node // '', $v->[2]->parent_node // ''}) ? 'm' : $v->[0]), | |
shownode $v->[1], | |
shownode $v->[2], | |
"\n"; | |
} | |
use Scalar::Util qw(refaddr); | |
sub create_diff_doc ($) { | |
my @diff = @{$_[0]}; | |
my $doc = new Web::DOM::Document; | |
$doc->manakai_is_html (1); | |
my $df = $doc->create_document_fragment; | |
my $to_new = {ins => {}, del => {}}; | |
my $is_equal = {}; | |
return $df unless @diff; | |
die unless $diff[0]->[0] eq 'c' or $diff[0]->[0] eq 'u'; | |
$is_equal->{refaddr $diff[0]->[1]->parent_node // '', | |
refaddr $diff[0]->[2]->parent_node // ''} = 1; | |
if ($diff[0]->[1]->node_type == $diff[0]->[1]->DOCUMENT_NODE) { | |
if ($diff[0]->[2]->node_type == $diff[0]->[2]->DOCUMENT_NODE) { | |
$to_new->{ins}->{refaddr $diff[0]->[2]} = $df; | |
$is_equal->{refaddr $diff[0]->[1], refaddr $diff[0]->[2]} = 1; | |
shift @diff; | |
} else { | |
$to_new->{del}->{refaddr $diff[0]->[1]} = $df; | |
$to_new->{ins}->{refaddr $diff[0]->[2]->parent_node // ''} = $df; | |
$is_equal->{refaddr $diff[0]->[1], | |
refaddr $diff[0]->[2]->parent_node // ''} = 1; | |
$diff[0] = ['-', $diff[0]->[1], '']; | |
} | |
} elsif ($diff[0]->[2]->node_type == $diff[0]->[2]->DOCUMENT_NODE) { | |
$to_new->{del}->{refaddr $diff[0]->[1]->parent_node // ''} = $df; | |
$to_new->{ins}->{refaddr $diff[0]->[2]} = $df; | |
$is_equal->{refaddr $diff[0]->[1]->parent_node // '', | |
refaddr $diff[0]->[2]} = 1; | |
$diff[0] = ['+', '', $diff[0]->[2]]; | |
} else { | |
$to_new->{del}->{refaddr $diff[0]->[1]->parent_node // ''} = $df; | |
$to_new->{ins}->{refaddr $diff[0]->[2]->parent_node // ''} = $df; | |
} | |
my $in_edit = {ins => {}, del => {}}; | |
my $edit = {}; | |
my $need_edit_in_children = {}; | |
my @new; ## keep elements to preserve |refaddr| of them | |
my $insert_edit = sub { | |
my ($item, $type, $new_parent) = @_; | |
if ($item->node_type == $item->ELEMENT_NODE and | |
($EditIgnoredElements->{$item->namespace_uri // ''}->{$item->local_name} or | |
($NewOnlyElements->{$item->namespace_uri // ''}->{$item->local_name} and $type eq 'ins'))) { | |
my $new = clone $item; | |
push @new, $new; | |
$to_new->{$type}->{refaddr $item} = $new; | |
$new_parent->append_child($new); | |
$need_edit_in_children->{refaddr $item} = $type; | |
} elsif ($item->node_type == $item->ELEMENT_NODE and | |
$NewOnlyElements->{$item->namespace_uri // ''}->{$item->local_name}) { | |
# | |
} elsif ($item->node_type == $item->ELEMENT_NODE and | |
$ItemElements->{$item->namespace_uri // ''}->{$item->local_name}) { | |
my $new = clone $item; | |
push @new, $new; | |
$edit->{$type} = $doc->create_element ($type); | |
$to_new->{$type}->{refaddr $item} = $edit->{$type}; | |
$new->append_child ($edit->{$type}); | |
$new_parent->append_child($new); | |
$in_edit->{$type}->{refaddr $edit->{$type}} = 1; | |
push @new, $edit->{$type}; | |
} else { | |
my $new = clone $item; | |
push @new, $new; | |
$to_new->{$type}->{refaddr $item} = $new; | |
unless ($in_edit->{$type}->{refaddr $new_parent}) { | |
if ($edit->{$type} and $new_parent->last_child and | |
$edit->{$type} eq $new_parent->last_child) { | |
$new_parent = $edit->{$type}; | |
} else { | |
$new_parent = $edit->{$type} = $new_parent->append_child | |
($doc->create_element ($type)); | |
} | |
} | |
$new_parent->append_child ($new); | |
$in_edit->{$type}->{refaddr $new} = 1; | |
} | |
}; # $insert_edit | |
for my $v (@diff) { | |
my $op = $v->[0]; | |
if ($op eq 'u') { | |
$is_equal->{refaddr $v->[1], refaddr $v->[2]} = 1; | |
$op = 'm' unless | |
$is_equal->{refaddr ($v->[1]->parent_node) // '', | |
refaddr ($v->[2]->parent_node) // ''}; | |
} | |
if ($op eq 'u') { | |
{ | |
my $new_parent = $to_new->{del}->{refaddr $v->[1]->parent_node // ''} | |
or last; | |
last if $new_parent eq $to_new->{ins}->{refaddr $v->[2]->parent_node // ''}; | |
if ($need_edit_in_children->{refaddr $v->[1]->parent_node // ''}) { | |
$insert_edit->($v->[1], 'del', $new_parent); | |
} else { | |
my $new = clone $v->[1]; | |
$to_new->{del}->{refaddr $v->[1]} = $new; | |
$new_parent->append_child ($new); | |
} | |
} | |
{ | |
my $new_parent = $to_new->{ins}->{refaddr $v->[2]->parent_node // ''} or die; | |
if ($need_edit_in_children->{refaddr $v->[2]->parent_node // ''}) { | |
$insert_edit->($v->[2], 'ins', $new_parent); | |
} else { | |
my $new = clone $v->[2]; | |
$to_new->{ins}->{refaddr $v->[2]} = $new; | |
$new_parent->append_child ($new); | |
$to_new->{del}->{refaddr $v->[1]} = $to_new->{ins}->{refaddr $v->[2]}; | |
} | |
} | |
} | |
if ($op eq '-' or $op eq 'm' or $op eq 'c') { | |
my $new_parent = $to_new->{del}->{refaddr $v->[1]->parent_node // ''}; | |
$new_parent = $to_new->{ins}->{refaddr $v->[2]->parent_node // ''} | |
if not $new_parent and $v->[2]; | |
$insert_edit->($v->[1], 'del', $new_parent); | |
} | |
if ($op eq '+' or $op eq 'm' or $op eq 'c') { | |
my $new_parent = $to_new->{ins}->{refaddr $v->[2]->parent_node // ''}; | |
$insert_edit->($v->[2], 'ins', $new_parent); | |
} | |
} | |
return $df; | |
} | |
# XXX space in table, ul | |
# XXX hgroup | |
# XXX <del><dl><dt><del> | |
my $doc3 = create_diff_doc \@diff; | |
print $doc3->inner_html; | |
## NOTES: | |
## id="" duplication | |
## map | |
## script, style form | |
## head colgroup col | |
## dfn duplication | |
## menu | |
## details, fieldset, figure content model | |
## doctype | |
## URLs | |
## comments, pis | |
## This library is free software; you can redistribute it and/or modify | |
## it under the same terms as Perl itself. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment