Skip to content

Instantly share code, notes, and snippets.

@bsidhom
Last active June 12, 2025 23:13
Show Gist options
  • Save bsidhom/dae50ecc0062a7a1202469860c8eea89 to your computer and use it in GitHub Desktop.
Save bsidhom/dae50ecc0062a7a1202469860c8eea89 to your computer and use it in GitHub Desktop.
Write an outline specified in JSON format into a PDF document using qpdf
#!/usr/bin/env python3
# TODO: Add more documentation for this script.
from __future__ import annotations
import argparse
import collections
import itertools
import json
import re
import sys
def main():
    """Parse CLI arguments, shift outline pages per the seam map, and emit JSON."""
    arg_parser = argparse.ArgumentParser(
        description=
        "Shift page numbers in a JSON outline based on \"seams\" in a book "
        "(gaps in page numbers). Note that these gaps are expected to be "
        "monotonically increasing.")
    arg_parser.add_argument(
        "--outline",
        required=True,
        help="Outline file with nominal pages (in JSON format).")
    arg_parser.add_argument(
        "--seams",
        required=True,
        help="Seams as JSON array of arrays of the form [nominal, physical] "
        "page numbers. Note that \"physical\" here means the output PDF index. "
        "\"Nominal\" is the page number that appears on a page or in the book "
        "index/TOC.")
    args = arg_parser.parse_args()
    outline = read_outline(args.outline)
    adjust_outline(read_page_map(args.seams), outline)
    write_outline(outline)
def read_page_map(seams_file) -> PageMap:
    """Load seam pairs from a JSON file and wrap them in a PageMap."""
    with open(seams_file) as f:
        return PageMap(json.load(f))
def read_outline(fname):
    """Parse and return the JSON outline stored in the given file."""
    with open(fname) as outline_file:
        return json.load(outline_file)
def adjust_outline(page_map, outline):
    """Remap every top-level entry (and, recursively, its children) in place."""
    for top_level_entry in outline:
        adjust_entry(page_map, top_level_entry)
def adjust_entry(page_map, entry):
    """Replace entry["dest"] with its resolved physical page; recurse into children."""
    entry["dest"] = page_map.resolve(entry["dest"])
    for child in entry.get("children", ()):
        adjust_entry(page_map, child)
def write_outline(outline):
    """Serialize the adjusted outline to stdout as JSON."""
    sys.stdout.write(json.dumps(outline))
class PageMap:
    """Maps nominal (printed) page numbers to 0-indexed physical PDF pages.

    Seams are (nominal, physical) pairs marking points where the
    nominal-to-physical offset changes (e.g., due to pages missing from the
    scan or front matter).
    """

    def __init__(self, seams):
        if not seams:
            # Without at least one seam there is no base offset to apply.
            raise ValueError("must have at least one page map entry")
        seams = sorted(seams, key=lambda x: x[0])
        self._seams = tuple(tuple(seam) for seam in seams)

    def resolve(self, page: int) -> int:
        """Return the 0-indexed physical page for nominal page *page*."""
        # We assume that everything up to the _first_ marked "seam" uses the
        # same nominal-to-physical offset. After that point, seams are marked in
        # order of increasing nominal page number.
        result = page + self._seams[0][1] - self._seams[0][0]
        # There are typically few seams (missing pages) in a book, so we don't
        # bother with binary search.
        for nominal, physical in self._seams[1:]:
            if nominal > page:
                break
            result = physical - nominal + page
        # Finally, the physical number reported is 1-indexed (since PDF readers
        # number pages this way). However, when writing PDF outlines, we need to
        # apply 0-indexed page numbers.
        return result - 1
# Run the CLI only when executed as a script (not when imported).
if __name__ == "__main__":
    main()
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import sys
def main():
    """CLI entry point: merge a new outline into a qpdf JSON dump on stdout."""
    # The original passed the text as the positional `prog` argument; it is
    # actually the program description.
    parser = argparse.ArgumentParser(description="Rewrite PDF outlines")
    parser.add_argument("--json",
                        help="JSON file created by qpdf",
                        required=True)
    parser.add_argument("--outline",
                        help="Your new outline file, in JSON format",
                        required=True)
    # NOTE(review): --input is accepted but never read by this script; the PDF
    # itself is only consumed by qpdf in a later step. Kept for CLI
    # compatibility.
    parser.add_argument("--input",
                        help="Original input PDF file to update",
                        required=True)
    parser.add_argument(
        "--offset",
        # Without type=int an explicit command-line value arrives as a str and
        # crashes the page arithmetic downstream.
        type=int,
        default=0,
        help="Page offset to add to each target in the outline JSON")
    args = parser.parse_args()
    j = build_output_json(args.json, args.outline, args.offset)
    json.dump(j, sys.stdout)
    sys.stdout.flush()
def build_output_json(json_fname: str, outline_fname: str, offset: int):
    """Return the qpdf JSON dump with a freshly-built /Outlines tree inserted.

    json_fname: path to a `qpdf --json` dump of the input PDF.
    outline_fname: path to a JSON outline (list of {"title", "dest",
        "children"?} dicts).
    offset: page offset added to every destination index.
    """
    with open(json_fname) as f:
        j = json.load(f)
    with open(outline_fname) as f:
        outline = json.load(f)
    # Page-object references in document order; outline "dest" values index
    # into this list.
    pages = [page["object"] for page in j["pages"]]
    next_object_id = j["qpdf"][0]["maxobjectid"] + 1
    ids = ObjectIdAllocator(next_object_id)
    catalog = get_catalog(j)
    outlines_id = ids.next_id()
    outlines = insert_new_object(j, outlines_id)
    outlines["/Type"] = "/Outlines"
    bookmarks = []
    for item in outline:
        bookmark = add_outline_item(j, pages, item, outlines_id, offset, ids)
        bookmarks.append(bookmark)
    catalog["/Outlines"] = f"{outlines_id} 0 R"
    outlines["/Count"] = len(bookmarks)
    # Guard against an empty outline: indexing bookmarks[0] below would raise
    # IndexError. An /Outlines dict without /First and /Last is still valid.
    if not bookmarks:
        return j
    # Chain sibling bookmarks into the doubly-linked list the PDF format
    # requires (/Prev and /Next).
    for ((id, bookmark), (next_id,
                          next_bookmark)) in zip(bookmarks, bookmarks[1:]):
        bookmark["/Next"] = f"{next_id} 0 R"
        next_bookmark["/Prev"] = f"{id} 0 R"
    outlines["/First"] = f"{bookmarks[0][0]} 0 R"
    outlines["/Last"] = f"{bookmarks[-1][0]} 0 R"
    return j
def get_catalog(j):
    """Locate and return the /Catalog object's value dict in a qpdf dump."""
    objects = j["qpdf"][1]
    for key, obj in objects.items():
        # Only entries of the form "obj:N G R" describe indirect objects.
        if not key.startswith("obj:"):
            continue
        if "value" not in obj:
            continue
        value = obj["value"]
        if value.get("/Type") == "/Catalog":
            return value
    raise Exception("could not find a PDF /Catalog")
def add_outline_item(j, pages, item, parent_id, offset: int,
                     ids: ObjectIdAllocator):
    """Insert one bookmark object (and, recursively, its children) into j.

    Returns (object_id, bookmark_dict) so the caller can link siblings
    together via /Prev and /Next.
    """
    id = ids.next_id()
    bookmark = insert_new_object(j, id)
    # /XYZ with null coordinates means "jump to the page, keep current view".
    bookmark["/Dest"] = [pages[item["dest"] + offset], "/XYZ", None, None, None]
    bookmark["/Parent"] = f"{parent_id} 0 R"
    # qpdf's JSON convention: "u:" prefixes a UTF-8 string value.
    bookmark["/Title"] = "u:{}".format(item["title"])
    children = [
        add_outline_item(j, pages, child, id, offset, ids)
        for child in item.get("children", [])
    ]
    # Guard on the list being non-empty (not merely the key being present):
    # the original crashed on children[0] when "children" was an empty list.
    if children:
        for ((child_id, bm), (next_child_id,
                              next_bm)) in zip(children, children[1:]):
            bm["/Next"] = f"{next_child_id} 0 R"
            next_bm["/Prev"] = f"{child_id} 0 R"
        bookmark["/First"] = f"{children[0][0]} 0 R"
        bookmark["/Last"] = f"{children[-1][0]} 0 R"
        bookmark["/Count"] = len(children)
    return (id, bookmark)
def insert_new_object(j, id):
    """Register a fresh, empty indirect object under "obj:<id> 0 R".

    Returns the new object's (mutable) value dict so callers can fill it in.
    """
    value = {}
    j["qpdf"][1][f"obj:{id} 0 R"] = {"value": value}
    return value
class ObjectIdAllocator:
    """Hands out consecutive PDF object ids starting from a given id."""

    def __init__(self, next_id: int):
        # The next unused object id.
        self._next_id = next_id

    def next_id(self):
        """Return the next free id and advance the counter."""
        allocated = self._next_id
        self._next_id += 1
        return allocated
# Run the CLI only when executed as a script (not when imported).
if __name__ == "__main__":
    main()
# Clean up PDF for ingestion
qpdf --decrypt --object-streams=disable original.pdf in.pdf
# Create JSON dump of relevant metadata
qpdf --json in.pdf in.json
# Create outline JSON
vim outline.json
# Or, alternatively, create the outline as an indented text file and convert it to JSON.
# NOTE: the converted JSON must land in outline.json (the original wrote it back
# over outline.txt, but the rewrite step below reads outline.json).
vim outline.txt && ./text_to_json.py --increasing-page-numbers <outline.txt | jq --slurp '.' | sponge outline.json
# Write outline data into JSON dump, overwriting old outline if any.
./rewrite-pdf-outline.py --json in.json --outline outline.json --input in.pdf >out.json
# Write output JSON data into final PDF.
qpdf in.pdf out.pdf --update-from-json=out.json
#!/usr/bin/env python3
import argparse
import collections
import itertools
import json
import re
import sys
def main():
    """Read an indented text outline from stdin and print it as JSON."""
    parser = argparse.ArgumentParser(
        description="Convert a text outline to JSON")
    parser.add_argument("--increasing-page-numbers", action="store_true")
    options = parser.parse_args()
    render_entries(read_entries(options.increasing_page_numbers))
def read_entries(validate_increasing_page_numbers):
    """Yield (depth, title, page) tuples parsed from stdin, one per line.

    If validate_increasing_page_numbers is set, raise when a page number is
    smaller than the previous line's.
    """
    prev_page = None
    indent_stack = initialize_indent_stack()
    # Count lines from 1 so diagnostics report human-friendly line numbers
    # (the original 0-based enumerate() reported every error one line early).
    for (line_number, line) in enumerate(sys.stdin, start=1):
        space, title, page = parse_line(line)
        if validate_increasing_page_numbers and prev_page is not None and page < prev_page:
            raise Exception(f"decreasing page number at line {line_number}")
        update_indent_stack(space, indent_stack, line_number)
        # Top-level entries have depth 0; each indentation level adds 1.
        depth = len(indent_stack)
        yield (depth, title, page)
        prev_page = page
# Matches a run of leading whitespace (the line's indentation).
WHITESPACE = re.compile(r"\s+")


def parse_line(line):
    """Split an outline line into (leading_space, title, page_number).

    The last whitespace-separated token is the (integer) destination page;
    everything before it, minus indentation, is the title.
    """
    stripped = line.rstrip()
    text, page_token = stripped.rsplit(maxsplit=1)
    match = WHITESPACE.match(text)
    space = match.group(0) if match else ""
    # Drop the indentation prefix so the title starts at its first character.
    return (space, text[len(space):], int(page_token))
def initialize_indent_stack():
    """Return a fresh, empty indentation stack."""
    return list()
def update_indent_stack(space, indent_stack, line_number):
    """Update indent_stack in place to reflect the current line's indentation.

    We don't care _which_ characters are used to indent as long as they are
    consistent at each level. Raises Exception on inconsistent indentation.
    """
    if len(space) == 0:
        # Zero out any existing indentation.
        indent_stack.clear()
    elif len(indent_stack) == 0:
        # We have non-empty leading space but an empty stack, so this is the
        # first level of indentation.
        indent_stack.append(space)
    else:
        # We have a non-empty indentation stack _and_ non-empty leading space.
        # We need to confirm that there's some level of shared prefix with
        # existing indentation.
        last_indent = indent_stack[-1]
        if len(space) > len(last_indent):
            # Deeper level of indentation than before. This is only valid if it
            # starts with the previous indentation characters.
            if space.startswith(last_indent):
                indent_stack.append(space)
            else:
                raise Exception(f"invalid indentation at line {line_number}")
        else:
            # This must correspond to a _shallower_ level of indentation than
            # before and, moreover, must _exactly_ match one of our previous
            # indentation levels. (It's not valid to de-indent to some previous
            # depth and then re-indent to a deeper level with new space
            # characters).
            if not last_indent.startswith(space):
                raise Exception(
                    f"invalid de-indentation at line {line_number}")
            # Pop levels deeper than the current indentation. Unlike the
            # original (which relied on preconditions and could die with a
            # bare IndexError), fail with a diagnostic if the indentation does
            # not exactly match a previously-seen level.
            while indent_stack and len(indent_stack[-1]) > len(space):
                indent_stack.pop()
            if not indent_stack or indent_stack[-1] != space:
                raise Exception(
                    f"invalid de-indentation at line {line_number}")
def render_entries(entries):
    """Assemble nested entries and write each top-level one to stdout."""
    paired = pair_with_next_depth(entries)
    for built in build_entries(paired):
        render_entry(built)
def pair_with_next_depth(entries):
    """Yield (entry, depth_of_following_entry_or_None) for each entry."""
    # A trailing None sentinel lets the final entry pair with "no successor".
    padded = itertools.chain(entries, (None, ))
    for entry, successor in sliding_window(padded, 2):
        yield (entry, None if successor is None else successor[0])
def sliding_window(iterator, n):
    """Yield overlapping n-tuples over the input, in order.

    Accepts any iterable: the original only worked on one-shot iterators,
    because calling islice() on a list-like input consumed nothing and the
    later `for` restarted iteration from the beginning.
    """
    iterator = iter(iterator)
    # Seed the window with the first n-1 items; each subsequent item completes
    # one window (deque's maxlen drops the oldest automatically).
    window = collections.deque(itertools.islice(iterator, n - 1), maxlen=n)
    for x in iterator:
        window.append(x)
        yield tuple(window)
def build_entries(entries):
    """Nest flat (entry, next_depth) pairs into a tree of outline dicts.

    Yields each completed top-level entry (with nested "children" lists) in
    input order.
    """
    # Explicit stack of (depth, entry) frames so we can pop several levels at
    # once when the input de-indents multiple steps in a single line.
    pending = []

    def drain(limit=None):
        # Finalize frames at or deeper than `limit` (all frames when None):
        # attach each completed entry to its parent, or yield it when it has
        # no parent (i.e., it is a top-level entry).
        while pending and (limit is None or limit <= pending[-1][0]):
            _depth, completed = pending.pop()
            if not pending:
                # No enclosing frame left: this is a root entry.
                yield completed
            else:
                pending[-1][1].setdefault("children", []).append(completed)

    for ((depth, title, page), _next_depth) in entries:
        # Returning to a shallower depth completes all deeper open entries.
        yield from drain(depth)
        pending.append((depth, {"title": title, "dest": page}))
    # Flush whatever is still open at end of input.
    yield from drain()
def render_entry(entry):
    """Write a single outline entry to stdout as compact JSON."""
    sys.stdout.write(json.dumps(entry))
# Run the CLI only when executed as a script (not when imported).
if __name__ == "__main__":
    main()
@foolishgrunt
Copy link

Do you have an example outline.json file you could share?

@qooxzuub
Copy link

Do you have an example outline.json file you could share?

This works for me:

[
  {
    "title": "First chapter",
    "dest": 0,
    "children": [
      {
        "title": "Subsection one point one",
        "dest": 1
      },
      {
        "title": "Subsection one point two",
        "dest": 2
      }
    ]
  },
  {
    "title": "Second chapter",
    "dest": 3
  }
]

@bsidhom
Copy link
Author

bsidhom commented Sep 20, 2024

Yes, the above outline should work. I've also just added a script to make it a bit easier to write outlines as plain text files with minimal structure. This should make it easy to directly type up outlines from TOCs, etc.

For example:

Contents 0
1. Chapter 1 1
  1.1. Subchapter 1.1. 2
  1.2 Subchapter 1.2 3
2. Chapter 2 4
  2.1 Subchapter 2.1 5
Index 6

The last token per line is interpreted as the destination page (zero-indexed, as in the standard JSON format).

Since it renders to JSON, you can also do various JSON transformations (e.g., handling conversion of front-matter pagination and main body pagination to different offsets).

@Nighel123
Copy link

There seems to be a problem, though, when the indent depth drops from 2 to 0

For Example:

Contents 0
1. Chapter 1 1
  1.1. Subchapter 1.1. 2
  1.2 Subchapter 1.2 3
2. Chapter 2 4
  2.1 Subchapter 2.1 5
    2.1.1 Subsubchapter 6
Index 7

then the depth in the JSON drops only one level. I couldn't figure out where the error is, yet.

@bsidhom
Copy link
Author

bsidhom commented Jun 12, 2025

@Nighel123 I just tested out your sample and it works for me. Note that the base script spits out concatenated JSON objects, so you need to pass the --slurp option to jq if you want it to be in a format that Python will understand. I might update/rework this at some point to make it easier to use, because the ergonomics are poor.

@bsidhom
Copy link
Author

bsidhom commented Jun 12, 2025

@bsidhom
Copy link
Author

bsidhom commented Jun 12, 2025

I spoke too soon. It comes out as valid JSON, but I'm guessing the unnesting doesn't happen when there's more than one step. Looking into it now.

@bsidhom
Copy link
Author

bsidhom commented Jun 12, 2025

@Nighel123 I believe it's fixed now. The issue was indeed that partially-completed ancestors weren't being finalized correctly. I've switched to an explicit stack which makes this a bit more transparent and easier to manage.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment