Last active
June 28, 2018 10:59
-
-
Save basinilya/1885ef6dc2bbaedeea902ac6da5c6dd5 to your computer and use it in GitHub Desktop.
adapt html for use in pod =for html paragraph
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/perl | |
| use strict; | |
| use warnings; | |
| use 5.008; | |
| use Getopt::Long; | |
| use URI; | |
| use HTML::TreeBuilder; | |
| my $base_uri; | |
| my $wiki_uri; | |
| sub mkcomment | |
| { | |
| my $comment = shift; | |
| [ "~comment" => { text => $comment } ]; | |
| } | |
| sub process_node | |
| { | |
| my $elt = shift; | |
| return if $elt->is_empty; | |
| $elt->normalize_content; # Make sure text is contiguous | |
| my $content = $elt->content_array_ref; | |
| #print STDERR "#children: ", 0 + @$content, "\n"; | |
| # Iterate in reverse order, because we insert elements | |
| for (my $i = @$content - 1; $i >= 0; --$i) { | |
| #print STDERR "child: ", $i, "\n"; | |
| if (ref $content->[$i]) { | |
| my $child = $content->[$i]; | |
| if ($child->tag eq "~comment") { | |
| my $comment = $child->attr("text"); | |
| if ($comment =~ s/\n\s*\n/\n/g) { | |
| $elt->splice_content( | |
| $i, 1, | |
| mkcomment($comment) | |
| ); | |
| } | |
| } else { | |
| # It's a child element, process it recursively: | |
| process_node($child); | |
| if (defined $wiki_uri && $child->tag eq 'a') { | |
| #print STDERR $uri, "\n"; | |
| my $href = $child->attr("href"); | |
| if (defined $href && !($href =~ /^[#]/)) { | |
| my $uri = URI->new_abs( $href, $base_uri ); | |
| if ($uri ne $href) { | |
| $uri = $uri->rel($wiki_uri); | |
| $uri =~ s/[.]html($|[#?])/$1/; | |
| $uri =~ s/[\/]/./g; | |
| $uri = URI->new_abs( $uri, $wiki_uri ); | |
| $child->attr("href", $uri); | |
| #print STDERR "href='", $uri, "'\n"; | |
| } | |
| } | |
| #$base_uri | |
| } | |
| } | |
| } else { | |
| # It's text: | |
| my $text = $content->[$i]; | |
| #my $tex2 = $text; | |
| #$tex2 =~ s/\n/|/g; | |
| # Split the text node using "blank line" as a delimiter, | |
| # but keep the matched delimiter as an item prefix: | |
| # whitespaces have to be preserved within <pre>. | |
| # Then insert a comment hash object between the items. | |
| # Also prefix lines that start with '=' to not confuse Pod | |
| my @pieces; | |
| my $pos1 = 0; | |
| # Use lookahead to find overlapping matches and keep the delimiter. | |
| # [^\S\n] matches any whitespace except newline | |
| while ($text =~ /\n(?=[=]|[^\S\n]*\n)/g) { | |
| #print STDERR "", $tex2, "\n", " " x $-[0], "^\n", " " x $+[0], "^\n"; | |
| my $piece = substr($text, $pos1, $+[0] - $pos1); | |
| $pos1 = $+[0]; | |
| #my $piec2 = $piece; | |
| #$piec2 =~ s/\n/|/g; | |
| #print STDERR "'", $piec2, "'\n"; | |
| push(@pieces, $piece); | |
| push(@pieces, mkcomment(" not-end-html ")); | |
| } | |
| #print STDERR "done\n"; | |
| if (@pieces > 0) { | |
| $elt->splice_content( | |
| $i, 1, # Replace this text element with... | |
| @pieces, substr($text, $pos1) | |
| ); | |
| } | |
| } # end else text | |
| } # end for $i in content index | |
| } # end process_node | |
| sub getopts { | |
| GetOptions ("base-uri=s" => \$base_uri, | |
| "wiki-uri=s" => \$wiki_uri | |
| ) | |
| or die("Error in command line arguments\n"); | |
| } | |
| getopts; | |
| $base_uri //= $wiki_uri; | |
| # \\/ | |
| my $content; | |
| { | |
| local *FH; | |
| open FH, "$ARGV[0]" or die $!; | |
| -f FH and sysread FH, $content, -s FH; | |
| } | |
| # Wrap the SHTML fragment so the comments don't move: | |
| my $html = HTML::TreeBuilder->new; | |
| $html->store_comments(1); | |
| # cannot use no_space_compacting, because closing </li> is stripped, possibly leaving a blank line | |
| #$html->no_space_compacting(1); | |
| $html->parse("<html><body>$content</body></html>"); | |
| my $body = $html->look_down(qw(_tag body)); | |
| process_node($body); | |
| # Now strip the wrapper to get the SHTML fragment back: | |
| $content = $body->as_HTML; | |
| $content =~ s!^<body>\n?!!; | |
| $content =~ s!</body>\s*\z!!; | |
| print STDOUT "=pod\n\n=encoding UTF-8\n\n=for html\n"; | |
| print STDOUT $content; # Replace STDOUT with a suitable filehandle | |
| print STDOUT "\n"; | |
| print STDOUT "=cut\n"; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/bash | |
| if [ ! -f overview-summary.html ]; then | |
| >&2 echo not found overview-summary.html | |
| exit 1 | |
| fi | |
| find . -name "*.html" | while read p; do | |
| baseuri=${p%/*} | |
| baseuri=${wikiuri:?}${baseuri#.}/ | |
| p=${p#./} | |
| outfile=~/prod-watermark.wiki/${p//\//.} | |
| outfile=${outfile%.html}.pod | |
| ~/html4pod.pl --base-uri ${baseuri:?} --wiki-uri ${wikiuri:?}/ ${p} >${outfile:?} | |
| done |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment