Last active
June 12, 2025 23:13
-
-
Save bsidhom/dae50ecc0062a7a1202469860c8eea89 to your computer and use it in GitHub Desktop.
Write an outline specified in JSON format into a PDF document using qpdf
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# TODO: Add more documentation for this script. | |
from __future__ import annotations | |
import argparse | |
import collections | |
import itertools | |
import json | |
import re | |
import sys | |
def main():
    """CLI entry point: read seams and an outline, shift the outline's page
    numbers, and write the adjusted outline JSON to stdout."""
    parser = argparse.ArgumentParser(
        description=(
            "Shift page numbers in a JSON outline based on \"seams\" in a book "
            "(gaps in page numbers). Note that these gaps are expected to be "
            "monotonically increasing."))
    parser.add_argument(
        "--outline",
        required=True,
        help="Outline file with nominal pages (in JSON format).")
    parser.add_argument(
        "--seams",
        required=True,
        help="Seams as JSON array of arrays of the form [nominal, physical] "
        "page numbers. Note that \"physical\" here means the output PDF index. "
        "\"Nominal\" is the page number that appears on a page or in the book "
        "index/TOC.")
    args = parser.parse_args()
    page_map = read_page_map(args.seams)
    outline = read_outline(args.outline)
    adjust_outline(page_map, outline)
    write_outline(outline)
def read_page_map(seams_file) -> PageMap:
    """Parse the seams JSON file and build a PageMap from its entries."""
    with open(seams_file) as handle:
        return PageMap(json.load(handle))
def read_outline(fname):
    """Load and return the outline structure from the JSON file at fname."""
    with open(fname) as handle:
        return json.load(handle)
def adjust_outline(page_map, outline):
    """Apply the page map, in place, to every top-level outline entry (and,
    through recursion, all of their descendants)."""
    for top_level_entry in outline:
        adjust_entry(page_map, top_level_entry)
def adjust_entry(page_map, entry):
    """Rewrite one entry's "dest" page in place via the page map, then recurse
    into any nested children."""
    entry["dest"] = page_map.resolve(entry["dest"])
    for child in entry.get("children", ()):
        adjust_entry(page_map, child)
def write_outline(outline):
    """Serialize the adjusted outline to stdout as JSON."""
    sys.stdout.write(json.dumps(outline))
class PageMap:
    """Maps nominal (printed) page numbers to 0-indexed physical PDF pages.

    Built from "seams": [nominal, physical] pairs marking the points where the
    nominal-to-physical offset changes (e.g. unnumbered plates or gaps in the
    printed numbering). Seams are sorted by nominal page on construction, so
    callers may supply them in any order.
    """

    def __init__(self, seams):
        """seams: non-empty iterable of [nominal, physical] pairs.

        Raises ValueError when no seams are given; at least one entry is
        needed to establish the initial offset.
        """
        if not seams:
            # ValueError is more precise than a bare Exception and stays
            # backward compatible for callers catching Exception.
            raise ValueError("must have at least one page map entry")
        ordered = sorted(seams, key=lambda seam: seam[0])
        self._seams = tuple(tuple(seam) for seam in ordered)

    def resolve(self, page: int) -> int:
        """Return the 0-indexed physical page for nominal page number `page`.

        We assume that everything up to the _first_ marked "seam" uses the
        same nominal-to-physical offset. After that point, seams are marked in
        order of increasing nominal page number.
        """
        first_nominal, first_physical = self._seams[0]
        result = page + first_physical - first_nominal
        # There are typically few seams (missing pages) in a book, so a linear
        # scan is clearer than binary search and fast enough.
        for nominal, physical in self._seams[1:]:
            if nominal > page:
                break
            result = page + physical - nominal
        # The physical number in seams is 1-indexed (PDF readers number pages
        # that way), but PDF outline destinations are 0-indexed.
        return result - 1
# Run as a script; importing this module has no side effects.
if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from __future__ import annotations | |
import argparse | |
import json | |
import sys | |
def main():
    """CLI entry point: read a qpdf JSON dump plus an outline description and
    write the merged JSON (with fresh /Outlines objects) to stdout."""
    # NOTE: the first positional argument of ArgumentParser is `prog`, not
    # `description`; pass it by keyword so --help reads correctly.
    parser = argparse.ArgumentParser(description="Rewrite PDF outlines")
    parser.add_argument("--json",
                        help="JSON file created by qpdf",
                        required=True)
    parser.add_argument("--outline",
                        help="Your new outline file, in JSON format",
                        required=True)
    parser.add_argument("--input",
                        help="Original input PDF file to update",
                        required=True)
    parser.add_argument(
        "--offset",
        help="Page offset to add to each target in the outline JSON",
        # BUG FIX: without type=int a user-supplied --offset arrived as a
        # string and broke the page arithmetic in build_output_json.
        type=int,
        default=0)
    args = parser.parse_args()
    j = build_output_json(args.json, args.outline, args.offset)
    json.dump(j, sys.stdout)
    sys.stdout.flush()
def build_output_json(json_fname: str, outline_fname: str, offset: int):
    """Merge outline entries into a qpdf JSON dump.

    json_fname: path to qpdf's --json dump of the input PDF.
    outline_fname: path to the outline description, a JSON list of
        {"title", "dest", "children"?} dicts with 0-indexed page dests.
    offset: page offset added to every destination.

    Returns the mutated dump, ready for qpdf --update-from-json.
    """
    with open(json_fname) as f:
        j = json.load(f)
    with open(outline_fname) as f:
        outline = json.load(f)
    # qpdf lists pages in document order, so the list index is the physical
    # page number.
    pages = [page["object"] for page in j["pages"]]
    next_object_id = j["qpdf"][0]["maxobjectid"] + 1
    ids = ObjectIdAllocator(next_object_id)
    catalog = get_catalog(j)
    outlines_id = ids.next_id()
    outlines = insert_new_object(j, outlines_id)
    outlines["/Type"] = "/Outlines"
    bookmarks = []
    for item in outline:
        bookmark = add_outline_item(j, pages, item, outlines_id, offset, ids)
        bookmarks.append(bookmark)
    # Link sibling bookmarks into the doubly-linked list PDF outlines require.
    for ((id, bookmark), (next_id,
                          next_bookmark)) in zip(bookmarks, bookmarks[1:]):
        bookmark["/Next"] = f"{next_id} 0 R"
        next_bookmark["/Prev"] = f"{id} 0 R"
    catalog["/Outlines"] = f"{outlines_id} 0 R"
    # BUG FIX: an empty outline previously raised IndexError on bookmarks[0];
    # only emit /First and /Last when there is at least one bookmark.
    if bookmarks:
        first_id = bookmarks[0][0]
        outlines["/First"] = f"{first_id} 0 R"
        last_id = bookmarks[-1][0]
        outlines["/Last"] = f"{last_id} 0 R"
    outlines["/Count"] = len(bookmarks)
    return j
def get_catalog(j):
    """Locate the /Catalog object inside a qpdf JSON dump and return its
    value dict.

    Raises Exception when the dump contains no catalog object.
    """
    for key, obj in j["qpdf"][1].items():
        # Only "obj:<n> <gen> R" entries describe indirect objects.
        if not key.startswith("obj:"):
            continue
        if "value" not in obj:
            continue
        value = obj["value"]
        if "/Type" in value and value["/Type"] == "/Catalog":
            return value
    raise Exception("could not find a PDF /Catalog")
def add_outline_item(j, pages, item, parent_id, offset: int,
                     ids: ObjectIdAllocator):
    """Create a bookmark object for `item` (recursing into its children) and
    register it in the qpdf JSON dump.

    j: qpdf JSON dump to mutate.
    pages: page object references indexed by physical page number.
    item: {"title", "dest", "children"?} outline entry; "dest" is 0-indexed.
    parent_id: object id of the parent outline/bookmark node.
    offset: page offset added to the destination.

    Returns (object_id, bookmark_dict) so the caller can link siblings.
    """
    id = ids.next_id()
    title = item["title"]
    page_num = item["dest"]
    page_ref = pages[page_num + offset]
    bookmark = insert_new_object(j, id)
    # /XYZ with null coordinates keeps the viewer's current position/zoom.
    bookmark["/Dest"] = [page_ref, "/XYZ", None, None, None]
    bookmark["/Parent"] = f"{parent_id} 0 R"
    # qpdf's JSON convention: "u:" prefix marks a UTF-8 string value.
    bookmark["/Title"] = f"u:{title}"
    if "children" in item:
        children = []
        for child in item["children"]:
            bm = add_outline_item(j, pages, child, id, offset, ids)
            children.append(bm)
        # Link the children into the doubly-linked sibling list.
        for ((child_id, bm), (next_child_id,
                              next_bm)) in zip(children, children[1:]):
            bm["/Next"] = f"{next_child_id} 0 R"
            next_bm["/Prev"] = f"{child_id} 0 R"
        # BUG FIX: an explicit empty "children" list previously raised
        # IndexError on children[0]; skip /First and /Last in that case.
        if children:
            first_id = children[0][0]
            bookmark["/First"] = f"{first_id} 0 R"
            last_id = children[-1][0]
            bookmark["/Last"] = f"{last_id} 0 R"
        bookmark["/Count"] = len(children)
    return (id, bookmark)
def insert_new_object(j, id):
    """Register a fresh, empty indirect object under "obj:<id> 0 R" in the
    qpdf JSON dump and return its (mutable) value dict."""
    value = {}
    j["qpdf"][1][f"obj:{id} 0 R"] = {"value": value}
    return value
class ObjectIdAllocator():
    """Hands out consecutive, unused PDF object ids, starting at next_id."""

    def __init__(self, next_id: int):
        # The id the next call to next_id() will return.
        self._next_id = next_id

    def next_id(self):
        """Return the next free object id and advance the counter."""
        allocated = self._next_id
        self._next_id += 1
        return allocated
# Run as a script; importing this module has no side effects.
if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Clean up PDF for ingestion: decrypt and disable object streams so that
# individual objects can be dumped and rewritten.
qpdf --decrypt --object-streams=disable original.pdf in.pdf
# Create JSON dump of relevant metadata
qpdf --json in.pdf in.json
# Create outline JSON
vim outline.json
# Or, alternatively, create the outline as an indented text file and convert it to JSON.
# BUG FIX: sponge the converted JSON into outline.json (the file read by the
# next step), not back over the text source outline.txt.
vim outline.txt && ./text_to_json.py --increasing-page-numbers <outline.txt | jq --slurp '.' | sponge outline.json
# Write outline data into JSON dump, overwriting old outline if any.
./rewrite-pdf-outline.py --json in.json --outline outline.json --input in.pdf >out.json
# Write output JSON data into final PDF.
qpdf in.pdf out.pdf --update-from-json=out.json
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import argparse | |
import collections | |
import itertools | |
import json | |
import re | |
import sys | |
def main():
    """Read an indented text outline from stdin and emit JSON entries."""
    parser = argparse.ArgumentParser(
        description="Convert a text outline to JSON")
    parser.add_argument("--increasing-page-numbers", action="store_true")
    args = parser.parse_args()
    entries = read_entries(args.increasing_page_numbers)
    render_entries(entries)
def read_entries(validate_increasing_page_numbers):
    """Yield (depth, title, page) tuples parsed from stdin, one per line.

    depth is 0 for unindented lines and grows with each nesting level. When
    validate_increasing_page_numbers is set, raise if a page number is smaller
    than the one on the previous line.
    """
    prev_page = None
    indent_stack = initialize_indent_stack()
    # BUG FIX: number lines from 1 so error messages match what an editor
    # shows (enumerate defaults to 0-based numbering).
    for (line_number, line) in enumerate(sys.stdin, start=1):
        space, title, page = parse_line(line)
        if validate_increasing_page_numbers and prev_page is not None and page < prev_page:
            raise Exception(f"decreasing page number at line {line_number}")
        update_indent_stack(space, indent_stack, line_number)
        depth = len(indent_stack)
        yield (depth, title, page)
        prev_page = page
# Matches a run of whitespace; used to pull leading indentation off a title.
WHITESPACE = re.compile(r"\s+")

def parse_line(line):
    """Split one outline line into (leading_space, title, page_number).

    The page number is the final whitespace-separated token; everything
    before it, minus the leading indentation, is the title.
    """
    trimmed = line.rstrip()
    raw_title, page_text = trimmed.rsplit(maxsplit=1)
    match = WHITESPACE.match(raw_title)
    indent = match.group(0) if match else ""
    return (indent, raw_title[len(indent):], int(page_text))
def initialize_indent_stack():
    """Return a fresh, empty indentation stack (no indentation seen yet)."""
    return []
def update_indent_stack(space, indent_stack, line_number):
    """Update indent_stack in place to reflect the current line's leading
    whitespace.

    We don't care _which_ characters are used to indent as long as they are
    consistent at each level. The stack holds the full indentation string of
    each open level; its length is the current depth.

    Raises Exception when the indentation is inconsistent with prior levels.
    """
    if len(space) == 0:
        # Zero out any existing indentation.
        indent_stack.clear()
    elif len(indent_stack) == 0:
        # We have non-empty leading space but an empty stack, so this is the
        # first level of indentation.
        indent_stack.append(space)
    else:
        # We have a non-empty indentation stack _and_ non-empty leading space.
        # We need to confirm that there's some level of shared prefix with
        # existing indentation.
        last_indent = indent_stack[-1]
        if len(space) > len(last_indent):
            # Deeper level of indentation than before. This is only valid if
            # it starts with the previous indentation characters.
            if space.startswith(last_indent):
                indent_stack.append(space)
            else:
                raise Exception(f"invalid indentation at line {line_number}")
        else:
            # This must correspond to a _shallower_ level of indentation than
            # before and, moreover, must _exactly_ match one of our previous
            # indentation levels. (It's not valid to de-indent to some
            # previous depth and then re-indent to a deeper level with new
            # space characters).
            if not last_indent.startswith(space):
                raise Exception(
                    f"invalid de-indentation at line {line_number}")
            while len(last_indent) != len(space):
                indent_stack.pop()
                # BUG FIX: previously this crashed with IndexError when the
                # indentation matched no recorded level exactly (the stack
                # emptied out); report it as invalid de-indentation instead.
                if not indent_stack:
                    raise Exception(
                        f"invalid de-indentation at line {line_number}")
                last_indent = indent_stack[-1]
def render_entries(entries):
    """Nest the flat entry stream and serialize each top-level entry to
    stdout."""
    paired = pair_with_next_depth(entries)
    for built in build_entries(paired):
        render_entry(built)
def pair_with_next_depth(entries):
    """Pair each entry with the depth of the entry that follows it (None for
    the final entry)."""
    padded = itertools.chain(entries, (None, ))
    for (entry, successor) in sliding_window(padded, 2):
        next_depth = None if successor is None else successor[0]
        yield (entry, next_depth)
def sliding_window(iterator, n):
    """Yield successive overlapping n-tuples from the input.

    BUG FIX / generalization: take a single shared iterator over the input.
    The original consumed `islice` and then iterated the argument a second
    time, which silently produced wrong windows when given a sequence (e.g. a
    list) rather than an iterator; behavior for iterator inputs is unchanged.
    """
    it = iter(iterator)
    # Pre-fill the window with the first n-1 items; each subsequent item
    # completes one window (deque with maxlen drops the oldest entry).
    window = collections.deque(itertools.islice(it, n - 1), maxlen=n)
    for x in it:
        window.append(x)
        yield tuple(window)
def build_entries(entries):
    """Assemble a stream of ((depth, title, page), next_depth) records into
    nested outline dictionaries.

    Yields top-level entry dicts; descendants are attached under "children".
    """
    # Explicit stack of (depth, entry_dict) pairs for partially-built
    # ancestors; it lets us unwind several levels at once when the input
    # jumps back to a shallower depth.
    pending = []

    def close_entries(up_to_depth=None):
        # Pop every frame at depth >= up_to_depth, attaching each popped
        # entry to its parent (the next-shallower frame), or yielding it
        # when it is top-level. up_to_depth=None closes everything.
        while pending and (up_to_depth is None
                           or pending[-1][0] >= up_to_depth):
            _, finished = pending.pop()
            if pending:
                parent = pending[-1][1]
                parent.setdefault("children", []).append(finished)
            else:
                yield finished

    for ((depth, title, page), _next_depth) in entries:
        # Close out any siblings and deeper subtrees before opening this
        # entry at its own depth.
        yield from close_entries(depth)
        pending.append((depth, {"title": title, "dest": page}))
    # Close whatever is still open once the input is exhausted.
    yield from close_entries()
def render_entry(entry):
    """Serialize a single top-level outline entry to stdout as JSON."""
    sys.stdout.write(json.dumps(entry))
# Run as a script; importing this module has no side effects.
if __name__ == "__main__":
    main()
@Nighel123 I believe it's fixed now. The issue was indeed that partially-completed ancestors weren't being finalized correctly. I've switched to an explicit stack which makes this a bit more transparent and easier to manage.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I spoke too soon. It comes out as valid JSON, but I'm guessing the unnesting doesn't happen when there's more than one step. Looking into it now.