basinilya · June 28, 2018 10:59
diff --git a/html4pod.pl b/html4pod.pl
 #!/usr/bin/perl

 use strict;
 use warnings;
 use 5.008;

 use Getopt::Long;
 use URI;
 use HTML::TreeBuilder;

 my $base_uri;
 my $wiki_uri;

 sub mkcomment
 {
  my $comment = shift;
  [ "~comment" => { text => $comment } ];
 }

 sub process_node
 {
  my $elt = shift;

  return if $elt->is_empty;

  $elt->normalize_content;      # Make sure text is contiguous

  my $content = $elt->content_array_ref;

  #print STDERR "#children: ", 0 + @$content, "\n";

  # Iterate in reverse order, because we insert elements
  for (my $i = @$content - 1; $i >= 0; --$i) {
    #print STDERR "child: ", $i, "\n";
    if (ref $content->[$i]) {
      my $child = $content->[$i];
      if ($child->tag eq "~comment") {
        my $comment = $child->attr("text");
        if ($comment =~ s/\n\s*\n/\n/g) {
          $elt->splice_content(
            $i, 1,
            mkcomment($comment)
          );
        }
      } else {
        # It's a child element, process it recursively:
        process_node($child);

        if (defined $wiki_uri && $child->tag eq 'a') {
          #print STDERR $uri, "\n";
          my $href = $child->attr("href");
          if (defined $href && !($href =~ /^[#]/)) {
            my $uri = URI->new_abs( $href, $base_uri );
            if ($uri ne $href) {
              $uri = $uri->rel($wiki_uri);
              $uri =~ s/[.]html($|[#?])/$1/;
              $uri =~ s/[\/]/./g;
              $uri = URI->new_abs( $uri, $wiki_uri );
              $child->attr("href", $uri);
              #print STDERR "href='", $uri, "'\n";
            }
          }
          #$base_uri
        }
      }
    } else {
      # It's text:
      my $text = $content->[$i];
      #my $tex2 = $text;
      #$tex2 =~ s/\n/|/g;

      # Split the text node using "blank line" as a delimiter,
      # but keep the matched delimiter as an item prefix:
      # whitespaces have to be preserved within <pre>.

      # Then insert a comment hash object between the items.

      # Also prefix lines that start with '=' to not confuse Pod

      my @pieces;
      my $pos1 = 0;
      
      # Use lookahead to find overlapping matches and keep the delimiter.
      # [^\S\n] matches any whitespace except newline
      while ($text =~ /\n(?=[=]|[^\S\n]*\n)/g) {
        #print STDERR "", $tex2, "\n", " " x $-[0], "^\n", " " x $+[0], "^\n";
        my $piece = substr($text, $pos1, $+[0] - $pos1);
        $pos1 = $+[0];
        #my $piec2 = $piece;
        #$piec2 =~ s/\n/|/g;
        #print STDERR "'", $piec2, "'\n";
        push(@pieces, $piece);
        push(@pieces, mkcomment(" not-end-html "));
      }
      #print STDERR "done\n";
      if (@pieces > 0) {
        $elt->splice_content(
          $i, 1, # Replace this text element with...
          @pieces, substr($text, $pos1)
        );
      }
    } # end else text
  } # end for $i in content index
 } # end process_node

 sub getopts {
    GetOptions ("base-uri=s" => \$base_uri,
                "wiki-uri=s" => \$wiki_uri
                )
        or die("Error in command line arguments\n");
 }

 getopts;

 $base_uri //= $wiki_uri;
 # \\/

 my $content;
 {
  local *FH;
  open FH, "$ARGV[0]" or die $!;
  -f FH and sysread FH, $content, -s FH;
 }

 # Wrap the SHTML fragment so the comments don't move:
 my $html = HTML::TreeBuilder->new;
 $html->store_comments(1);

 # cannot use no_space_compacting, because closing </li> is stripped, possibly leaving a blank line
 #$html->no_space_compacting(1);

 $html->parse("<html><body>$content</body></html>");

 my $body = $html->look_down(qw(_tag body));
 process_node($body);

 # Now strip the wrapper to get the SHTML fragment back:
 $content = $body->as_HTML;
 $content =~ s!^<body>\n?!!;
 $content =~ s!</body>\s*\z!!;

 print STDOUT "=pod\n\n=encoding UTF-8\n\n=for html\n";
 print STDOUT $content; # Replace STDOUT with a suitable filehandle
 print STDOUT "\n";
 print STDOUT "=cut\n";
diff --git a/javadoc2pod.sh b/javadoc2pod.sh
 #!/bin/bash

 if [ ! -f overview-summary.html ]; then
  >&2 echo not found overview-summary.html
  exit 1
 fi

 find . -name "*.html" | while read p; do
  baseuri=${p%/*}
  baseuri=${wikiuri:?}${baseuri#.}/
  p=${p#./}
  outfile=~/prod-watermark.wiki/${p//\//.}
  outfile=${outfile%.html}.pod
  ~/html4pod.pl --base-uri ${baseuri:?} --wiki-uri ${wikiuri:?}/ ${p} >${outfile:?}
 done
	#!/usr/bin/perl

	use strict;
	use warnings;
	use 5.008;

	use Getopt::Long;
	use URI;
	use HTML::TreeBuilder;

	my $base_uri;
	my $wiki_uri;

	sub mkcomment
	{
	my $comment = shift;
	[ "~comment" => { text => $comment } ];
	}

	sub process_node
	{
	my $elt = shift;

	return if $elt->is_empty;

	$elt->normalize_content; # Make sure text is contiguous

	my $content = $elt->content_array_ref;

	#print STDERR "#children: ", 0 + @$content, "\n";

	# Iterate in reverse order, because we insert elements
	for (my $i = @$content - 1; $i >= 0; --$i) {
	#print STDERR "child: ", $i, "\n";
	if (ref $content->[$i]) {
	my $child = $content->[$i];
	if ($child->tag eq "~comment") {
	my $comment = $child->attr("text");
	if ($comment =~ s/\n\s*\n/\n/g) {
	$elt->splice_content(
	$i, 1,
	mkcomment($comment)
	);
	}
	} else {
	# It's a child element, process it recursively:
	process_node($child);

	if (defined $wiki_uri && $child->tag eq 'a') {
	#print STDERR $uri, "\n";
	my $href = $child->attr("href");
	if (defined $href && !($href =~ /^[#]/)) {
	my $uri = URI->new_abs( $href, $base_uri );
	if ($uri ne $href) {
	$uri = $uri->rel($wiki_uri);
	$uri =~ s/[.]html($\|[#?])/$1/;
	$uri =~ s/[\/]/./g;
	$uri = URI->new_abs( $uri, $wiki_uri );
	$child->attr("href", $uri);
	#print STDERR "href='", $uri, "'\n";
	}
	}
	#$base_uri
	}
	}
	} else {
	# It's text:
	my $text = $content->[$i];
	#my $tex2 = $text;
	#$tex2 =~ s/\n/\|/g;

	# Split the text node using "blank line" as a delimiter,
	# but keep the matched delimiter as an item prefix:
	# whitespaces have to be preserved within <pre>.

	# Then insert a comment hash object between the items.

	# Also prefix lines that start with '=' to not confuse Pod

	my @pieces;
	my $pos1 = 0;

	# Use lookahead to find overlapping matches and keep the delimiter.
	# [^\S\n] matches any whitespace except newline
	while ($text =~ /\n(?=[=]\|[^\S\n]*\n)/g) {
	#print STDERR "", $tex2, "\n", " " x $-[0], "^\n", " " x $+[0], "^\n";
	my $piece = substr($text, $pos1, $+[0] - $pos1);
	$pos1 = $+[0];
	#my $piec2 = $piece;
	#$piec2 =~ s/\n/\|/g;
	#print STDERR "'", $piec2, "'\n";
	push(@pieces, $piece);
	push(@pieces, mkcomment(" not-end-html "));
	}
	#print STDERR "done\n";
	if (@pieces > 0) {
	$elt->splice_content(
	$i, 1, # Replace this text element with...
	@pieces, substr($text, $pos1)
	);
	}
	} # end else text
	} # end for $i in content index
	} # end process_node

	sub getopts {
	GetOptions ("base-uri=s" => \$base_uri,
	"wiki-uri=s" => \$wiki_uri
	)
	or die("Error in command line arguments\n");
	}

	getopts;

	$base_uri //= $wiki_uri;
	# \\/

	my $content;
	{
	local *FH;
	open FH, "$ARGV[0]" or die $!;
	-f FH and sysread FH, $content, -s FH;
	}

	# Wrap the SHTML fragment so the comments don't move:
	my $html = HTML::TreeBuilder->new;
	$html->store_comments(1);

	# cannot use no_space_compacting, because closing </li> is stripped, possibly leaving a blank line
	#$html->no_space_compacting(1);

	$html->parse("<html><body>$content</body></html>");

	my $body = $html->look_down(qw(_tag body));
	process_node($body);

	# Now strip the wrapper to get the SHTML fragment back:
	$content = $body->as_HTML;
	$content =~ s!^<body>\n?!!;
	$content =~ s!</body>\s*\z!!;

	print STDOUT "=pod\n\n=encoding UTF-8\n\n=for html\n";
	print STDOUT $content; # Replace STDOUT with a suitable filehandle
	print STDOUT "\n";
	print STDOUT "=cut\n";
	#!/bin/bash

	if [ ! -f overview-summary.html ]; then
	>&2 echo not found overview-summary.html
	exit 1
	fi

	find . -name "*.html" \| while read p; do
	baseuri=${p%/*}
	baseuri=${wikiuri:?}${baseuri#.}/
	p=${p#./}
	outfile=~/prod-watermark.wiki/${p//\//.}
	outfile=${outfile%.html}.pod
	~/html4pod.pl --base-uri ${baseuri:?} --wiki-uri ${wikiuri:?}/ ${p} >${outfile:?}
	done