Skip to content

Instantly share code, notes, and snippets.

@basinilya
Last active June 28, 2018 10:59
Show Gist options
  • Select an option

  • Save basinilya/1885ef6dc2bbaedeea902ac6da5c6dd5 to your computer and use it in GitHub Desktop.

Select an option

Save basinilya/1885ef6dc2bbaedeea902ac6da5c6dd5 to your computer and use it in GitHub Desktop.
adapt html for use in pod =for html paragraph
#!/usr/bin/perl
use strict;
use warnings;
use 5.008;
use Getopt::Long;
use URI;
use HTML::TreeBuilder;
my $base_uri;
my $wiki_uri;
sub mkcomment
{
my $comment = shift;
[ "~comment" => { text => $comment } ];
}
sub process_node
{
my $elt = shift;
return if $elt->is_empty;
$elt->normalize_content; # Make sure text is contiguous
my $content = $elt->content_array_ref;
#print STDERR "#children: ", 0 + @$content, "\n";
# Iterate in reverse order, because we insert elements
for (my $i = @$content - 1; $i >= 0; --$i) {
#print STDERR "child: ", $i, "\n";
if (ref $content->[$i]) {
my $child = $content->[$i];
if ($child->tag eq "~comment") {
my $comment = $child->attr("text");
if ($comment =~ s/\n\s*\n/\n/g) {
$elt->splice_content(
$i, 1,
mkcomment($comment)
);
}
} else {
# It's a child element, process it recursively:
process_node($child);
if (defined $wiki_uri && $child->tag eq 'a') {
#print STDERR $uri, "\n";
my $href = $child->attr("href");
if (defined $href && !($href =~ /^[#]/)) {
my $uri = URI->new_abs( $href, $base_uri );
if ($uri ne $href) {
$uri = $uri->rel($wiki_uri);
$uri =~ s/[.]html($|[#?])/$1/;
$uri =~ s/[\/]/./g;
$uri = URI->new_abs( $uri, $wiki_uri );
$child->attr("href", $uri);
#print STDERR "href='", $uri, "'\n";
}
}
#$base_uri
}
}
} else {
# It's text:
my $text = $content->[$i];
#my $tex2 = $text;
#$tex2 =~ s/\n/|/g;
# Split the text node using "blank line" as a delimiter,
# but keep the matched delimiter as an item prefix:
# whitespaces have to be preserved within <pre>.
# Then insert a comment hash object between the items.
# Also prefix lines that start with '=' to not confuse Pod
my @pieces;
my $pos1 = 0;
# Use lookahead to find overlapping matches and keep the delimiter.
# [^\S\n] matches any whitespace except newline
while ($text =~ /\n(?=[=]|[^\S\n]*\n)/g) {
#print STDERR "", $tex2, "\n", " " x $-[0], "^\n", " " x $+[0], "^\n";
my $piece = substr($text, $pos1, $+[0] - $pos1);
$pos1 = $+[0];
#my $piec2 = $piece;
#$piec2 =~ s/\n/|/g;
#print STDERR "'", $piec2, "'\n";
push(@pieces, $piece);
push(@pieces, mkcomment(" not-end-html "));
}
#print STDERR "done\n";
if (@pieces > 0) {
$elt->splice_content(
$i, 1, # Replace this text element with...
@pieces, substr($text, $pos1)
);
}
} # end else text
} # end for $i in content index
} # end process_node
sub getopts {
GetOptions ("base-uri=s" => \$base_uri,
"wiki-uri=s" => \$wiki_uri
)
or die("Error in command line arguments\n");
}
getopts;
$base_uri //= $wiki_uri;
# \\/
my $content;
{
local *FH;
open FH, "$ARGV[0]" or die $!;
-f FH and sysread FH, $content, -s FH;
}
# Wrap the SHTML fragment so the comments don't move:
my $html = HTML::TreeBuilder->new;
$html->store_comments(1);
# cannot use no_space_compacting, because closing </li> is stripped, possibly leaving a blank line
#$html->no_space_compacting(1);
$html->parse("<html><body>$content</body></html>");
my $body = $html->look_down(qw(_tag body));
process_node($body);
# Now strip the wrapper to get the SHTML fragment back:
$content = $body->as_HTML;
$content =~ s!^<body>\n?!!;
$content =~ s!</body>\s*\z!!;
print STDOUT "=pod\n\n=encoding UTF-8\n\n=for html\n";
print STDOUT $content; # Replace STDOUT with a suitable filehandle
print STDOUT "\n";
print STDOUT "=cut\n";
#!/bin/bash
if [ ! -f overview-summary.html ]; then
>&2 echo not found overview-summary.html
exit 1
fi
find . -name "*.html" | while read p; do
baseuri=${p%/*}
baseuri=${wikiuri:?}${baseuri#.}/
p=${p#./}
outfile=~/prod-watermark.wiki/${p//\//.}
outfile=${outfile%.html}.pod
~/html4pod.pl --base-uri ${baseuri:?} --wiki-uri ${wikiuri:?}/ ${p} >${outfile:?}
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment