Skip to content

Instantly share code, notes, and snippets.

@wakaba
Last active December 31, 2015 05:39
Show Gist options
  • Save wakaba/7942028 to your computer and use it in GitHub Desktop.
Save wakaba/7942028 to your computer and use it in GitHub Desktop.
# XXX This is a very early prototype.
use strict;
use warnings;
use Path::Class;
use lib glob file (__FILE__)->dir->subdir('modules/*/lib');
use Web::DOM::Document;
use Algorithm::Diff qw(sdiff);
sub HTML_NS () { q<http://www.w3.org/1999/xhtml> }
my $ItemElements = {
(HTML_NS) => {li => 1, dt => 1, dd => 1, figcaption => 1, summary => 1,
legend => 1, caption => 1, td => 1, th => 1,
rt => 1, rp => 1, body => 1},
};
my $EditIgnoredElements = {
(HTML_NS) => {tbody => 1, thead => 1, tfoot => 1, tr => 1, html => 1},
};
my $NewOnlyElements = {
(HTML_NS) => {colgroup => 1, col => 1, head => 1},
};
my $AttrIgnoredElements = {
(HTML_NS) => {html => 1, body => 1},
};
my $AttrContentIgnoredElements = {
(HTML_NS) => {head => 1},
};
my $IgnoreWSPElements = {
(HTML_NS) => {html => 1, head => 1, frameset => 1,
ul => 1, ol => 1, dl => 1, select => 1, datalist => 1,
optgroup => 1, table => 1, tbody => 1, thead => 1, tfoot => 1,
colgroup => 1},
};
my $OpaqueElements = {
(HTML_NS) => {head => 1, frameset => 1, script => 1, noscript => 1,
noframes => 1, noembed => 1, xmp => 1, plaintext => 1,
iframe => 1, object => 1, applet => 1, video => 1, audio => 1,
style => 1, select => 1, textarea => 1, embed => 1,
br => 1, img => 1, wbr => 1, menuitem => 1, hr => 1,
param => 1, source => 1, track => 1, area => 1,
dfn => 1, abbr => 1, data => 1, time => 1,
input => 1, button => 1, option => 1, optgroup => 1,
datalist => 1, keygen => 1, progress => 1, meter => 1,
template => 1, bgsound => 1, frame => 1},
# <svg>, <math>
};
sub html ($) {
my $doc = new Web::DOM::Document;
$doc->manakai_is_html (1);
$doc->inner_html ($_[0]);
return $doc;
}
sub nodes ($);
sub nodes ($) {
my $root = shift;
my $no_wsp = $IgnoreWSPElements->{$root->namespace_uri // ''}->{$root->local_name // ''};
return ($root, grep { not $no_wsp or not ($_->node_type == 3 and $_->data =~ /^[\x09\x0A\x0D\x0C\x20]*$/) } grep { $_->node_type != $_->DOCUMENT_TYPE_NODE } map { $OpaqueElements->{$_->namespace_uri // ''}->{$_->local_name // ''} ? ($_) : (nodes $_) } @{$root->child_nodes});
}
my $doc1 = html q{<p>hoge<em>hoge</em><p>aa<br><div>x<ul><li>a<li>b<li>d<li>e<li>f</li>g<li>h</ul><ruby>aa<rp>b</ruby>
<table><colgroup><tr><th>aa<td>bb<tfoot><tr><td>dd</table>};
my $doc2 = html q{<p>fuga<span>hoge</span>aa<br><div>x<ul><li>b<li>c</ul><ruby>aa<rt>bb</ruby><table><col><tr><th>aa<td>bb<td>cc><tr><td>dd};
$doc1 = html q{<p>hoge<script>foo</script><p class=x><img>};
$doc2 = html q{<title></title><p>hoge<script>bar</script><p class=y><img>};
use Encode;
$doc1 = html (decode 'utf-8', scalar file (shift)->slurp);
$doc2 = html (decode 'utf-8', scalar file (shift)->slurp);
warn $doc1->inner_html;
warn $doc2->inner_html;
my $nodes1 = [nodes $doc1];
my $nodes2 = [nodes $doc2];
my @diff = sdiff $nodes1, $nodes2, sub {
my $v = shift;
if ($v->node_type == $v->TEXT_NODE) {
return join $;, $v->node_type, $v->text_content;
} elsif ($v->node_type == $v->ELEMENT_NODE) {
if ($AttrContentIgnoredElements->{$v->namespace_uri // ''}->{$v->local_name}) {
return join $;, $v->node_type, $v->namespace_uri // '', $v->local_name;
} elsif ($AttrIgnoredElements->{$v->namespace_uri // ''}->{$v->local_name}) {
return join $;, $v->node_type, $v->namespace_uri // '', $v->local_name;
} elsif ($OpaqueElements->{$v->namespace_uri // ''}->{$v->local_name}) {
return join $;, $v->node_type, $v->namespace_uri // '', $v->local_name, (sort { $a cmp $b } map { join $;, $_->namespace_uri // '', $_->local_name, $_->value } @{$v->attributes}), $v->inner_html; # XXX attr order
} else {
return join $;, $v->node_type, $v->namespace_uri // '', $v->local_name, sort { $a cmp $b } map { join $;, $_->namespace_uri // '', $_->local_name, $_->value } @{$v->attributes};
}
} elsif ($v->node_type == $v->DOCUMENT_NODE) {
return join $;, $v->node_type, $v->node_name;
} else {
return join $;, $v->node_type, $v->node_name, $v->text_content;
}
};
sub shownode ($) {
my $v = shift;
return '' unless $v;
if ($v->node_type == $v->TEXT_NODE) {
return join ':', $v->node_name, $v->text_content;
} elsif ($v->node_type == $v->ELEMENT_NODE) {
return join ':', $v->node_name;
} elsif ($v->node_type == $v->DOCUMENT_NODE) {
return join ':', $v->node_name;
} else {
return join ':', $v->node_type, $v->node_name, $v->text_content;
}
}
sub clone ($) {
if ($OpaqueElements->{$_[0]->namespace_uri // ''}->{$_[0]->local_name // ''}) {
return $_[0]->clone_node (1);
} else {
return $_[0]->clone_node (0);
}
}
my $IsEqual = {};
$IsEqual->{'', ''} = 1;
for my $v (@diff) {
$IsEqual->{$v->[1], $v->[2]} = 1 if $v->[0] eq 'u';
warn join "\t", (($v->[0] eq 'u' and not $IsEqual->{$v->[1]->parent_node // '', $v->[2]->parent_node // ''}) ? 'm' : $v->[0]),
shownode $v->[1],
shownode $v->[2],
"\n";
}
use Scalar::Util qw(refaddr);
sub create_diff_doc ($) {
my @diff = @{$_[0]};
my $doc = new Web::DOM::Document;
$doc->manakai_is_html (1);
my $df = $doc->create_document_fragment;
my $to_new = {ins => {}, del => {}};
my $is_equal = {};
return $df unless @diff;
die unless $diff[0]->[0] eq 'c' or $diff[0]->[0] eq 'u';
$is_equal->{refaddr $diff[0]->[1]->parent_node // '',
refaddr $diff[0]->[2]->parent_node // ''} = 1;
if ($diff[0]->[1]->node_type == $diff[0]->[1]->DOCUMENT_NODE) {
if ($diff[0]->[2]->node_type == $diff[0]->[2]->DOCUMENT_NODE) {
$to_new->{ins}->{refaddr $diff[0]->[2]} = $df;
$is_equal->{refaddr $diff[0]->[1], refaddr $diff[0]->[2]} = 1;
shift @diff;
} else {
$to_new->{del}->{refaddr $diff[0]->[1]} = $df;
$to_new->{ins}->{refaddr $diff[0]->[2]->parent_node // ''} = $df;
$is_equal->{refaddr $diff[0]->[1],
refaddr $diff[0]->[2]->parent_node // ''} = 1;
$diff[0] = ['-', $diff[0]->[1], ''];
}
} elsif ($diff[0]->[2]->node_type == $diff[0]->[2]->DOCUMENT_NODE) {
$to_new->{del}->{refaddr $diff[0]->[1]->parent_node // ''} = $df;
$to_new->{ins}->{refaddr $diff[0]->[2]} = $df;
$is_equal->{refaddr $diff[0]->[1]->parent_node // '',
refaddr $diff[0]->[2]} = 1;
$diff[0] = ['+', '', $diff[0]->[2]];
} else {
$to_new->{del}->{refaddr $diff[0]->[1]->parent_node // ''} = $df;
$to_new->{ins}->{refaddr $diff[0]->[2]->parent_node // ''} = $df;
}
my $in_edit = {ins => {}, del => {}};
my $edit = {};
my $need_edit_in_children = {};
my @new; ## keep elements to preserve |refaddr| of them
my $insert_edit = sub {
my ($item, $type, $new_parent) = @_;
if ($item->node_type == $item->ELEMENT_NODE and
($EditIgnoredElements->{$item->namespace_uri // ''}->{$item->local_name} or
($NewOnlyElements->{$item->namespace_uri // ''}->{$item->local_name} and $type eq 'ins'))) {
my $new = clone $item;
push @new, $new;
$to_new->{$type}->{refaddr $item} = $new;
$new_parent->append_child($new);
$need_edit_in_children->{refaddr $item} = $type;
} elsif ($item->node_type == $item->ELEMENT_NODE and
$NewOnlyElements->{$item->namespace_uri // ''}->{$item->local_name}) {
#
} elsif ($item->node_type == $item->ELEMENT_NODE and
$ItemElements->{$item->namespace_uri // ''}->{$item->local_name}) {
my $new = clone $item;
push @new, $new;
$edit->{$type} = $doc->create_element ($type);
$to_new->{$type}->{refaddr $item} = $edit->{$type};
$new->append_child ($edit->{$type});
$new_parent->append_child($new);
$in_edit->{$type}->{refaddr $edit->{$type}} = 1;
push @new, $edit->{$type};
} else {
my $new = clone $item;
push @new, $new;
$to_new->{$type}->{refaddr $item} = $new;
unless ($in_edit->{$type}->{refaddr $new_parent}) {
if ($edit->{$type} and $new_parent->last_child and
$edit->{$type} eq $new_parent->last_child) {
$new_parent = $edit->{$type};
} else {
$new_parent = $edit->{$type} = $new_parent->append_child
($doc->create_element ($type));
}
}
$new_parent->append_child ($new);
$in_edit->{$type}->{refaddr $new} = 1;
}
}; # $insert_edit
for my $v (@diff) {
my $op = $v->[0];
if ($op eq 'u') {
$is_equal->{refaddr $v->[1], refaddr $v->[2]} = 1;
$op = 'm' unless
$is_equal->{refaddr ($v->[1]->parent_node) // '',
refaddr ($v->[2]->parent_node) // ''};
}
if ($op eq 'u') {
{
my $new_parent = $to_new->{del}->{refaddr $v->[1]->parent_node // ''}
or last;
last if $new_parent eq $to_new->{ins}->{refaddr $v->[2]->parent_node // ''};
if ($need_edit_in_children->{refaddr $v->[1]->parent_node // ''}) {
$insert_edit->($v->[1], 'del', $new_parent);
} else {
my $new = clone $v->[1];
$to_new->{del}->{refaddr $v->[1]} = $new;
$new_parent->append_child ($new);
}
}
{
my $new_parent = $to_new->{ins}->{refaddr $v->[2]->parent_node // ''} or die;
if ($need_edit_in_children->{refaddr $v->[2]->parent_node // ''}) {
$insert_edit->($v->[2], 'ins', $new_parent);
} else {
my $new = clone $v->[2];
$to_new->{ins}->{refaddr $v->[2]} = $new;
$new_parent->append_child ($new);
$to_new->{del}->{refaddr $v->[1]} = $to_new->{ins}->{refaddr $v->[2]};
}
}
}
if ($op eq '-' or $op eq 'm' or $op eq 'c') {
my $new_parent = $to_new->{del}->{refaddr $v->[1]->parent_node // ''};
$new_parent = $to_new->{ins}->{refaddr $v->[2]->parent_node // ''}
if not $new_parent and $v->[2];
$insert_edit->($v->[1], 'del', $new_parent);
}
if ($op eq '+' or $op eq 'm' or $op eq 'c') {
my $new_parent = $to_new->{ins}->{refaddr $v->[2]->parent_node // ''};
$insert_edit->($v->[2], 'ins', $new_parent);
}
}
return $df;
}
# XXX space in table, ul
# XXX hgroup
# XXX <del><dl><dt><del>
my $doc3 = create_diff_doc \@diff;
print $doc3->inner_html;
## NOTES:
## id="" duplication
## map
## script, style form
## head colgroup col
## dfn duplication
## menu
## details, fieldset, figure content model
## doctype
## URLs
## comments, pis
## This library is free software; you can redistribute it and/or modify
## it under the same terms as Perl itself.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment