-
-
Save jmbarbier/38224087bdab5cfdb1ff94ad27528e40 to your computer and use it in GitHub Desktop.
Wrapper to add page numbers to TOC and links for wkhtmltopdf processing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# wrapper for wkhtmltopdf and pdftk to update links with actual page numbers
#   TOC entries get leader dots and page number  . . . NN
#   internal links (#id) get [page NN]
# written by Phil M Perry
# (c) copyright 2015, Phil M Perry
# license: GNU Lesser General Public License (LGPL) v3
#
use strict;
use warnings;
# it's a quick & dirty job that doesn't have many safeguards. use at your own
# risk. anyone is welcome to improve upon it!
#
# input: HTML file to be converted (parameter 1)
#        WARNING: input file will be overwritten several times! save a backup first.
# output: PDF file (parameter 2)
# requires: toPDF.bat, wkhtmltopdf.exe, pdftk.exe
# limitations: each link (<a>...</a>) must be on one line
#             *TOC links (to receive leader dots) include class="toc"
#              any nbsp's used to indent TOC entries must be within link label
#                or else the leader dots and page numbers won't align
#                (and use only 0 or more &nbsp;'s to left pad link label)
#              assumes all links have href= (no name= anchor "links")
#             *link with class="external" do not get [page NN]
#              <A>, </A>, CLASS=, HREF=, etc. not recognized (must be l/c)
#                TBD: could use regex to do case-insensitive matches?
#             * see configuration
#
# configuration ==========================================================
# The settings are package globals ("our") because initialUpdateHTML() and
# updateHTML(), defined later in this file, read several of them by name.
#
# haven't figured out yet why replacing separate .bat file with inline isn't
# working... get very cryptic errors. for now, using toPDF.bat
# NOTE(review): the sample below originally lacked a concatenation operator
# and the blanks between options; that may be the cause -- unverified.
# where to find wkhtmltopdf
##$wkhtmltopdf = "\"C:\\Program Files\\wkhtmltopdf\\bin\\wkhtmltopdf.exe\"";
# parameters for wkhtmltopdf
##$WKparms = "-s Letter " .  # paper size
##           "-B .5in -T .5in -L .5in -R .5in " .  # margins
##           "--header-right \"page [page]\" " .  # page number at upper right
##           "--footer-center \"Confidential, property of someone\" " .
##           "--header-spacing 4 --footer-spacing 4";
# default class="toc" to mark a TOC entry line
our $tocClass = "toc";
# default class="external" to mark a link that is NOT to get [page NN]
# TBD: look at href= and if it starts with #, it's internal (no extClass needed)
our $extClass = "external";
# when the (TOC) page number is updated with the actual number, it is left
# justified (i.e., does not overwrite any leader dots), so be sure to leave
# room for the largest page number! for narrower pages, reduce size.
our $TOCtemplate = ". . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 9999";
# default page template for internal links, added to end of link label.
# the number must sit directly before the closing ']' and directly after a
# "&nbsp;": updateHTML() locates the old number by searching backwards for
# that 6-character entity.
our $page99 = "&nbsp;[page&nbsp;99]";
# name of scratch file for HTML rewrite
our $tempHTML = "TEMP.HTML";
# name of scratch file for PDF unpack and analyze (a big file!)
our $tempPDF = "TEMP.PDF";
# minimum 2 passes needed. it's rare to go more than 3
our $maxLoops = 10;
# 0: leave (modified) inFile, 1: erase it when done
our $eraseInFile = 1;
# end configuration ======================================================
#
if (@ARGV != 2) {
    print "require input and output file names\n";
    exit 1;    # a usage error must not look like success to the caller
}
my ($inFile, $outFile) = @ARGV;
my $tempFile = $tempPDF;
#
# multipass state: one row per link target found in the PDF,
#   [ id, object number, generation number, page number (-1 = not yet known) ]
my @objList = ();
#
# initial update of input HTML file is:
#   TOC <a> (with class="toc") add leader dots and 99 page number
#   <a> links without class="external" add [page 99]
#   other <a> (external sites) leave alone
# note that this section will need to be updated for any other formats
# of HTML file being processed
print "adding initial page numbers to input HTML file\n";
initialUpdateHTML($inFile);

PASS:
for my $loop (1 .. $maxLoops) {
    print "pass $loop: create PDF and unpack it\n";
    my $pageChange = 0;    # no page change seen yet in this pass

    # make PDF. the converter's exit status is only reported, not fatal:
    # a converter may return non-zero although a usable PDF was written,
    # and a truly missing PDF is caught by the open() below.
    system("toPDF \"$inFile\" \"$outFile\"") == 0
        or warn "toPDF returned status $?\n";
    ##system("$wkhtmltopdf $WKparms $inFile $outFile");

    # uncompress the PDF file (list form: no shell, so blanks in names are safe)
    system("pdftk", $outFile, "output", $tempFile, "uncompress") == 0
        or warn "pdftk returned status $?\n";

    # warning! $tempFile can be huge!
    # tempFile is the readable, uncompressed PDF produced by pdftk
    open(my $pdf, '<', $tempFile) or die "can't open temp file $tempFile\n";

    # go through tempFile and get the lines starting with /file
    # (in the PDF name, "#23" is the escaped '#' that precedes the link id)
    #   save  #id objid objid2  at end
    # we may not need to do this more than once, but if an id moves from
    # one page to another, it is conceivable that the objid may change from
    # pass to pass
    print " build list of targeted objects\n";
    while (my $line = <$pdf>) {
        next unless $line =~ m{^/file:.*\#23(.*) (\d+) (\d+) R};
        my ($id, $obj, $gen) = ($1, $2, $3);
        if ($loop == 1) {
            # first time, initialize (build objList)
            # page -1 (uninitialized page number)
            push @objList, [$id, $obj, $gen, -1];
            next;
        }
        # subsequent pass, update object id if necessary
        for my $row (@objList) {
            next unless $row->[0] eq $id;
            next if $row->[1] == $obj && $row->[2] == $gen;
            print "#$id has changed object id from $row->[1] $row->[2] to $obj $gen\n";
            @{$row}[1, 2] = ($obj, $gen);
        }
    } # end of loop through lines in tempFile

    seek($pdf, 0, 0);  # we'll read tempFile again
    # now read through to find each object id that's in objList, and record
    # its page number. set flag if changed.
    print " update list of targeted objects with their page numbers\n";
    my $page = 0;
    while (my $line = <$pdf>) {
        # \d+ (not .*) so a stray CR at end of line never reaches the
        # numeric comparisons below
        if ($line =~ m{^/pdftk_PageNum (\d+)}) {
            $page = $1;
            next;
        }
        next unless $line =~ m{^(\d+) (\d+) obj };
        my ($obj, $gen) = ($1, $2);
        # found an obj... is it in objList? every row is checked, so
        # several ids that share one object are all updated in this pass
        for my $row (@objList) {
            next unless $row->[1] == $obj && $row->[2] == $gen;
            next if $row->[3] == $page;
            if ($loop > 1) {
                print "#$row->[0] has changed page number from $row->[3] to $page\n";
            }
            $row->[3] = $page;
            $pageChange = 1;
        }
    } # end of loop through lines in tempFile
    close($pdf);
    unlink($tempFile);

    last PASS if !$pageChange;  # no page numbers changed? done!

    # note: we are not checking if an id disappeared or was new in later passes...
    # assume that set of ids is constant, and that object id and page may change

    # final step in loop, if page numbers have changed, is to update the input
    # HTML file's page numbers.
    print "update page numbers in links and TOC\n";
    updateHTML($inFile, \@objList);

    # too many loops? (the HTML was just updated but the PDF was not rebuilt)
    if ($loop == $maxLoops) {
        print "Page numbering did not settle down within $maxLoops passes.\n";
        print "Use output PDF file with caution.\n";
        last PASS;
    }
} # 2+ loops to create PDF/update HTML and repeat

if ($eraseInFile) {
    unlink($inFile);  # since inFile has been modified, OK to erase
}
# end of main program
# ==============================================================
# initialUpdateHTML($inFile)
#
# First rewrite of the input HTML file, done once before any PDF is made:
#   * every <a ...> containing class="$tocClass" gets its label padded with
#     the tail of $TOCtemplate (leader dots and a placeholder page number)
#   * every other <a ...> not containing class="$extClass" gets $page99
#     appended to its label
#   * links containing class="$extClass" are left alone
# The result is written to $tempHTML, which then replaces $inFile.
#
# parameter: $inFile  name of the HTML file to rewrite in place
# reads globals: $tempHTML, $tocClass, $extClass, $TOCtemplate, $page99
# dies if: a file cannot be opened, written or renamed, or an <a ...> has no
#          </a> on the same line
#
# note: <A> and </A> are not matched!
#
# could use a page number other than 99, such as 999 for large documents
# fixed length of TOC entry (number of leader dots) could be varied, depending
# on desired page width (TOC will be printed with fixed pitch font to ensure
# that dots align horizontally)
sub initialUpdateHTML {
    my $inFile = shift;
    my $tempFile = $tempHTML;
    my $me = "initialUpdateHTML()";
    my $nbsp = '&nbsp;';               # 6 characters of source, 1 on the page
    my $extra = length($nbsp) - 1;     # source characters that take no room

    open(my $in, '<', $inFile) or die "$me can't open input file $inFile\n";
    open(my $out, '>', $tempFile) or die "$me can't open output file $tempFile\n";

    while (my $line = <$in>) {
        chomp $line;
        # assume there might be more than one <a> in a line
        my $pos = 0;
        while ((my $start = index($line, "<a ", $pos)) >= 0) {
            # test the index() result itself: adding an offset first would
            # hide a missing </a> for a link at the very start of the line
            my $close = index($line, "</a>", $start);
            if ($close < 0) {
                print "line =>$line<=\n seems to be missing its </a>!\n";
                die "This program cannot continue until HTML source is fixed.\n";
            }
            my $element = substr($line, $start, $close + 4 - $start);
            my $labelStart = index($line, ">", $start) + 1;
            $pos = $close + 4;         # default: resume after this link
            # opening tag never closed before </a>: nothing safe to edit
            next if $labelStart > $close;
            my $labelLen = $close - $labelStart;

            if (index($element, "class=\"$tocClass\"") >= 0) {
                # TOC entry: label + one nbsp + rest of the leader template.
                # the nbsp keeps "label . ." from collapsing on the page.
                my $label = substr($line, $labelStart, $labelLen);
                # only nbsp's at the very start of the label are indentation
                my $indent = 0;
                if ($label =~ m/^((?:&nbsp;)+)/) {
                    $indent = length($1) / length($nbsp);
                }
                $label .= $nbsp;
                # width on the page: each nbsp (leading ones and the trailing
                # one) counts as a single character
                my $visible = length($label) - $extra * ($indent + 1);
                # never cut into the placeholder number: an over-long label
                # still gets " 9999", which updateHTML() needs to find
                my $lastGap = rindex($TOCtemplate, ' ');
                $lastGap = 0 if $lastGap < 0;
                my $offset = $visible < $lastGap ? $visible : $lastGap;
                my $tocLine = $label . substr($TOCtemplate, $offset);
                substr($line, $labelStart, $labelLen) = $tocLine;
                $pos = $labelStart + length($tocLine);
            } elsif (index($element, "class=\"$extClass\"") < 0) {
                # internal link: put the page template just before </a>
                substr($line, $close, 0) = $page99;
                $pos = $close + length($page99);
            }
        } # done (possibly) updating $line
        print {$out} $line."\n";       # output the (sometimes) updated line
    } # reading input HTML file and writing back out modified version

    close($in);
    close($out) or die "$me can't finish writing $tempFile: $!\n";
    rename($tempFile, $inFile)
        or die "$me can't replace $inFile with $tempFile: $!\n";
}
# ==============================================================
# updateHTML($inFile, \@objList)
#
# update NN page number at end of TOC entries and [page NN] entries
# with current value in objList
#
# parameters:
#   $inFile      name of the HTML file to rewrite in place (via $tempHTML)
#   $objListRef  reference to a list of rows [ id, objid, objid2, page ];
#                only id (row element 0) and page (row element 3) are used
# reads globals: $tempHTML, $tocClass, $extClass
# dies if: a file cannot be opened, written or renamed, or an <a ...> has no
#          </a> on the same line
#
# A link is only touched when its own href is "#id" and that id is present
# in the list; anything else keeps its current text.
#
# unfortunately, there is no reasonable way to determine which page a link
# is on. if there was, we could use "[previous page]" (target's page = link's
# page - 1), "[this page]" (target's page = link's page), and "[next page]"
# (target's page = links's page + 1) instead of always using "[page NN]".
#
sub updateHTML {
    my $inFile = shift;
    my $objListRef = shift;
    my $tempFile = $tempHTML;
    my $me = "updateHTML()";

    # id -> page. the first row for an id wins, as a front-to-back scan would
    my %pageOf;
    for my $row (@$objListRef) {
        $pageOf{$row->[0]} = $row->[3] unless exists $pageOf{$row->[0]};
    }

    # read in HTML, line by line. requires that full <a...>text</a> is all on
    # one line
    open(my $in, '<', $inFile) or die "$me can't open input file $inFile\n";
    open(my $out, '>', $tempFile) or die "$me can't open output file $tempFile\n";

    while (my $line = <$in>) {
        chomp $line;
        # assume there might be more than one <a> in a line
        my $pos = 0;
        while ((my $start = index($line, "<a ", $pos)) >= 0) {
            my $close = index($line, "</a>", $start);
            if ($close < 0) {
                print "line =>$line<=\n seems to be missing its </a>!\n";
                die "This program cannot continue until HTML source is fixed.\n";
            }
            my $element = substr($line, $start, $close + 4 - $start);
            $pos = $close + 4;         # default: resume after this link

            my $isToc = index($element, "class=\"$tocClass\"") >= 0;
            # external links never received a page number
            next if !$isToc && index($element, "class=\"$extClass\"") >= 0;

            # look for href="#id" inside this link only, and use the capture
            # only when the match succeeded (no stale value from another link)
            my ($id) = $element =~ m/href="#([^"]+)"/;
            next unless defined $id;
            if (!exists $pageOf{$id}) {
                warn "$me no page known for #$id, link left unchanged\n";
                next;
            }
            my $page = $pageOf{$id};

            my $labelStart = index($line, ">", $start) + 1;
            next if $labelStart > $close;   # opening tag never closed
            my $label = substr($line, $labelStart, $close - $labelStart);

            if ($isToc) {
                # the old page number is whatever follows the label's last
                # blank (the end of the leader dots)
                next unless $label =~ s/(?<= )[^ ]*\z/$page/;
            } else {
                # label ends "...<separator>NN]": replace what lies between
                # the last separator and the final character. the separator
                # is normally "&nbsp;"; a plain blank is accepted as well.
                my $numStart = -1;
                for my $sep ('&nbsp;', ' ') {
                    my $at = rindex($label, $sep);
                    next if $at < 0;
                    my $after = $at + length($sep);
                    $numStart = $after if $after > $numStart;
                }
                my $numLen = length($label) - 1 - $numStart;
                next if $numStart < 0 || $numLen < 0;
                substr($label, $numStart, $numLen) = $page;
            }

            substr($line, $labelStart, $close - $labelStart) = $label;
            $pos = $labelStart + length($label) + 4;   # label length may differ
        } # done (possibly) updating $line
        print {$out} $line."\n";       # output the (sometimes) updated line
    } # reading input HTML file and writing back out modified version

    close($in);
    close($out) or die "$me can't finish writing $tempFile: $!\n";
    rename($tempFile, $inFile)
        or die "$me can't replace $inFile with $tempFile: $!\n";
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@echo off
rem toPDF.bat -- convert one HTML file to PDF with wkhtmltopdf.
rem   %1  input HTML file
rem   %2  output PDF file
rem Letter paper, half-inch margins, "page N" at the upper right of each
rem page and a centered confidentiality footer.
rem setlocal keeps the two variables out of the caller's environment.
setlocal
rem %~1 / %~2 strip any quotes the caller supplied; the names are quoted
rem again below so that paths containing blanks stay a single argument.
set "input=%~1"
set "output=%~2"
"c:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe" -s Letter -B .5in -T .5in -L .5in -R .5in --header-right "page [page]" --footer-center "Confidential, property of someone" --header-spacing 4 --footer-spacing 4 "%input%" "%output%"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment