-
-
Save bsidhom/dae50ecc0062a7a1202469860c8eea89 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3 | |
# TODO: Add more documentation for this script. | |
from __future__ import annotations | |
import argparse | |
import collections | |
import itertools | |
import json | |
import re | |
import sys | |
def main():
    """Parse CLI flags, shift outline page numbers by the seam map, and print JSON."""
    arg_parser = argparse.ArgumentParser(
        description=(
            "Shift page numbers in a JSON outline based on \"seams\" in a book "
            "(gaps in page numbers). Note that these gaps are expected to be "
            "monotonically increasing."))
    arg_parser.add_argument(
        "--outline",
        required=True,
        help="Outline file with nominal pages (in JSON format).")
    arg_parser.add_argument(
        "--seams",
        required=True,
        help=("Seams as JSON array of arrays of the form [nominal, physical] "
              "page numbers. Note that \"physical\" here means the output PDF index. "
              "\"Nominal\" is the page number that appears on a page or in the book "
              "index/TOC."))
    opts = arg_parser.parse_args()
    # Resolve every destination through the nominal->physical page map.
    page_map = read_page_map(opts.seams)
    outline = read_outline(opts.outline)
    adjust_outline(page_map, outline)
    write_outline(outline)
def read_page_map(seams_file) -> PageMap:
    """Load the seams JSON file and wrap it in a PageMap."""
    with open(seams_file) as fp:
        return PageMap(json.load(fp))
def read_outline(fname):
    """Parse and return the JSON outline stored in *fname*."""
    with open(fname) as fp:
        return json.load(fp)
def adjust_outline(page_map, outline):
    """Rewrite every top-level entry's destination in place."""
    for top_entry in outline:
        adjust_entry(page_map, top_entry)
def adjust_entry(page_map, entry):
    """Resolve this entry's "dest" through *page_map*, then recurse into children."""
    entry["dest"] = page_map.resolve(entry["dest"])
    for child in entry.get("children", ()):
        adjust_entry(page_map, child)
def write_outline(outline):
    """Serialize the adjusted outline as compact JSON on stdout."""
    sys.stdout.write(json.dumps(outline))
class PageMap:
    """Maps nominal (printed) page numbers to physical (PDF) page indices.

    Built from a sequence of [nominal, physical] "seam" pairs; each seam
    marks the point at which a new nominal-to-physical offset takes effect.
    """

    def __init__(self, seams):
        """Create a map from an iterable of (nominal, physical) pairs.

        Raises:
            ValueError: if *seams* is empty.
        """
        if not seams:
            # ValueError (a subclass of Exception, so existing handlers still
            # work) is the idiomatic error for a bad argument.
            raise ValueError("must have at least one page map entry")
        # Sort by nominal page so resolve() can scan seams in order.
        ordered = sorted(seams, key=lambda seam: seam[0])
        self._seams = tuple(tuple(seam) for seam in ordered)

    def resolve(self, page: int) -> int:
        """Return the 0-indexed physical page for nominal *page*.

        We assume that everything up to the _first_ marked "seam" uses the
        same nominal-to-physical offset. After that point, seams are marked
        in order of increasing nominal page number.
        """
        result = page + self._seams[0][1] - self._seams[0][0]
        # There are typically few seams (missing pages) in a book, so we
        # don't bother with binary search.
        for nominal, physical in self._seams[1:]:
            if nominal > page:
                break
            result = physical - nominal + page
        # The physical number above is 1-indexed (PDF readers number pages
        # that way), but PDF outlines take 0-indexed page numbers.
        return result - 1
if __name__ == "__main__": | |
main() |
#!/usr/bin/env python3 | |
from __future__ import annotations | |
import argparse | |
import json | |
import sys | |
def main():
    """Parse CLI flags, splice the outline into the qpdf JSON, print the result."""
    # BUG FIX: the original passed the description as the first positional
    # argument, which argparse interprets as the program name (prog=).
    parser = argparse.ArgumentParser(description="Rewrite PDF outlines")
    parser.add_argument("--json",
                        help="JSON file created by qpdf",
                        required=True)
    parser.add_argument("--outline",
                        help="Your new outline file, in JSON format",
                        required=True)
    parser.add_argument("--input",
                        help="Original input PDF file to update",
                        required=True)
    parser.add_argument(
        "--offset",
        # BUG FIX: without type=int, a value supplied on the command line
        # arrives as a str and breaks the page arithmetic downstream; only
        # the default was an int.
        type=int,
        help="Page offset to add to each target in the outline JSON",
        default=0)
    args = parser.parse_args()
    j = build_output_json(args.json, args.outline, args.offset)
    json.dump(j, sys.stdout)
    sys.stdout.flush()
def build_output_json(json_fname: str, outline_fname: str, offset: int):
    """Return the qpdf JSON dict with a fresh /Outlines tree spliced in.

    Args:
        json_fname: path to the qpdf --json dump of the input PDF.
        outline_fname: path to the outline description (JSON list of
            {title, dest, children?} objects).
        offset: page offset added to every destination index.
    """
    with open(json_fname) as f:
        j = json.load(f)
    with open(outline_fname) as f:
        outline = json.load(f)
    # Page object references in document order; "dest" values index this list.
    pages = [page["object"] for page in j["pages"]]
    next_object_id = j["qpdf"][0]["maxobjectid"] + 1
    ids = ObjectIdAllocator(next_object_id)
    catalog = get_catalog(j)
    outlines_id = ids.next_id()
    outlines = insert_new_object(j, outlines_id)
    outlines["/Type"] = "/Outlines"
    bookmarks = []
    for item in outline:
        bookmark = add_outline_item(j, pages, item, outlines_id, offset, ids)
        bookmarks.append(bookmark)
    # Thread the sibling linked list (/Next, /Prev) between adjacent bookmarks.
    for ((bm_id, bookmark), (next_id,
                             next_bookmark)) in zip(bookmarks, bookmarks[1:]):
        bookmark["/Next"] = f"{next_id} 0 R"
        next_bookmark["/Prev"] = f"{bm_id} 0 R"
    catalog["/Outlines"] = f"{outlines_id} 0 R"
    if bookmarks:
        # ROBUSTNESS: an empty outline previously raised IndexError here.
        outlines["/First"] = f"{bookmarks[0][0]} 0 R"
        outlines["/Last"] = f"{bookmarks[-1][0]} 0 R"
    outlines["/Count"] = len(bookmarks)
    return j
def get_catalog(j):
    """Locate and return the /Catalog object's value dict in a qpdf dump.

    Raises:
        Exception: when the dump contains no /Catalog object.
    """
    objects = j["qpdf"][1]
    for key, obj in objects.items():
        # Only indirect-object entries look like "obj:<id> <gen> R".
        if not key.startswith("obj:"):
            continue
        if "value" not in obj:
            continue
        value = obj["value"]
        if "/Type" not in value:
            continue
        if value["/Type"] == "/Catalog":
            return value
    raise Exception("could not find a PDF /Catalog")
def add_outline_item(j, pages, item, parent_id, offset: int,
                     ids: ObjectIdAllocator):
    """Insert one bookmark object (and its descendants) into the qpdf JSON.

    Returns (object_id, bookmark_value_dict) so the caller can thread the
    sibling /Next and /Prev links between adjacent bookmarks.
    """
    obj_id = ids.next_id()
    # Look up the page reference before mutating the JSON tree.
    target_page = pages[item["dest"] + offset]
    bookmark = insert_new_object(j, obj_id)
    # Jump to the page top-left without changing the zoom level.
    bookmark["/Dest"] = [target_page, "/XYZ", None, None, None]
    bookmark["/Parent"] = f"{parent_id} 0 R"
    bookmark["/Title"] = f"u:{item['title']}"
    if "children" in item:
        child_pairs = [
            add_outline_item(j, pages, child, obj_id, offset, ids)
            for child in item["children"]
        ]
        # Link adjacent children into a doubly-linked sibling list.
        for ((child_id, child_bm),
             (next_id, next_bm)) in zip(child_pairs, child_pairs[1:]):
            child_bm["/Next"] = f"{next_id} 0 R"
            next_bm["/Prev"] = f"{child_id} 0 R"
        bookmark["/First"] = f"{child_pairs[0][0]} 0 R"
        bookmark["/Last"] = f"{child_pairs[-1][0]} 0 R"
        bookmark["/Count"] = len(child_pairs)
    return (obj_id, bookmark)
def insert_new_object(j, id):
    """Register an empty indirect object under "obj:<id> 0 R" and return its value dict."""
    value = {}
    j["qpdf"][1][f"obj:{id} 0 R"] = {"value": value}
    return value
class ObjectIdAllocator:
    """Hands out consecutive, previously-unused PDF object ids."""

    def __init__(self, next_id: int):
        # The next id to hand out; advances on every next_id() call.
        self._next_id = next_id

    def next_id(self):
        """Return a fresh object id and advance the counter."""
        allocated = self._next_id
        self._next_id += 1
        return allocated
if __name__ == "__main__": | |
main() |
# Clean up PDF for ingestion
qpdf --decrypt --object-streams=disable original.pdf in.pdf
# Create JSON dump of relevant metadata
qpdf --json in.pdf in.json
# Create outline JSON
vim outline.json
# Or, alternatively, create the outline as an indented text file and convert it to JSON.
# BUG FIX: write the converted JSON to outline.json (not back over outline.txt),
# so the rewrite step below finds it under the expected name.
vim outline.txt && ./text_to_json.py --increasing-page-numbers <outline.txt | jq --slurp '.' | sponge outline.json
# Write outline data into JSON dump, overwriting old outline if any.
./rewrite-pdf-outline.py --json in.json --outline outline.json --input in.pdf >out.json
# Write output JSON data into final PDF.
qpdf in.pdf out.pdf --update-from-json=out.json
#!/usr/bin/env python3 | |
import argparse | |
import collections | |
import itertools | |
import json | |
import re | |
import sys | |
def main():
    """Convert an indented text outline on stdin to concatenated JSON on stdout."""
    arg_parser = argparse.ArgumentParser(
        description="Convert a text outline to JSON")
    arg_parser.add_argument("--increasing-page-numbers", action="store_true")
    options = arg_parser.parse_args()
    entries = read_entries(options.increasing_page_numbers)
    render_entries(entries)
def read_entries(validate_increasing_page_numbers):
    """Yield (depth, title, page) tuples parsed from stdin, one per line.

    Args:
        validate_increasing_page_numbers: when true, raise on any page
            number lower than the previous entry's.

    Raises:
        Exception: on decreasing page numbers or inconsistent indentation.
    """
    prev_page = None
    indent_stack = initialize_indent_stack()
    # BUG FIX: start=1 so error messages report conventional 1-based line
    # numbers (they were previously off by one).
    for line_number, line in enumerate(sys.stdin, start=1):
        space, title, page = parse_line(line)
        if (validate_increasing_page_numbers and prev_page is not None
                and page < prev_page):
            raise Exception(f"decreasing page number at line {line_number}")
        update_indent_stack(space, indent_stack, line_number)
        # The stack holds one entry per nesting level, so its size is the depth.
        depth = len(indent_stack)
        yield (depth, title, page)
        prev_page = page
WHITESPACE = re.compile(r"\s+")


def parse_line(line):
    """Split an outline line into (leading_space, title, page).

    The last whitespace-separated token is the (integer) page number;
    everything before it, minus leading indentation, is the title.

    Raises:
        ValueError: if the line lacks a title plus trailing page number, or
            if the final token is not an integer.
    """
    line = line.rstrip()
    try:
        title, page = line.rsplit(maxsplit=1)
    except ValueError:
        # ROBUSTNESS: a blank line or one-token line used to surface as an
        # opaque tuple-unpacking error.
        raise ValueError(f"expected '<title> <page>' but got {line!r}") from None
    m = WHITESPACE.match(title)
    space = m.group(0) if m else ""
    title = title[len(space):]
    return (space, title, int(page))
def initialize_indent_stack():
    """Return a fresh, empty indentation stack (one string per nesting level)."""
    stack = []
    return stack
def update_indent_stack(space, indent_stack, line_number):
    """Mutate *indent_stack* to reflect the leading *space* of the current line.

    We don't care _which_ characters are used to indent as long as they are
    consistent at each level. After this call, len(indent_stack) is the
    nesting depth of the current line.

    Raises:
        Exception: when the indentation is inconsistent with previous lines.
    """
    if len(space) == 0:
        # Zero out any existing indentation.
        indent_stack.clear()
    elif len(indent_stack) == 0:
        # We have non-empty leading space but an empty stack, so this is the
        # first level of indentation.
        indent_stack.append(space)
    else:
        # We have a non-empty indentation stack _and_ non-empty leading space.
        # We need to confirm that there's some level of shared prefix with
        # existing indentation.
        last_indent = indent_stack[-1]
        if len(space) > len(last_indent):
            # Deeper level of indentation than before. This is only valid if
            # it starts with the previous indentation characters.
            if space.startswith(last_indent):
                indent_stack.append(space)
            else:
                raise Exception(f"invalid indentation at line {line_number}")
        else:
            # This must correspond to a _shallower_ level of indentation than
            # before and, moreover, must _exactly_ match one of our previous
            # indentation levels. (It's not valid to de-indent to some
            # previous depth and then re-indent to a deeper level with new
            # space characters).
            if not last_indent.startswith(space):
                raise Exception(
                    f"invalid de-indentation at line {line_number}")
            while indent_stack and len(indent_stack[-1]) != len(space):
                indent_stack.pop()
            # BUG FIX: a de-indent that matches no recorded level previously
            # crashed with IndexError on indent_stack[-1]; report it as bad
            # input instead.
            if not indent_stack:
                raise Exception(
                    f"invalid de-indentation at line {line_number}")
def render_entries(entries):
    """Emit one JSON object per fully-assembled top-level entry."""
    paired = pair_with_next_depth(entries)
    for built in build_entries(paired):
        render_entry(built)
def pair_with_next_depth(entries):
    """Yield (entry, depth_of_next_entry) pairs; the final pair carries None."""
    prev = None
    for entry in entries:
        if prev is not None:
            # The follower's depth is the first element of its tuple.
            yield (prev, entry[0])
        prev = entry
    if prev is not None:
        yield (prev, None)
def sliding_window(iterator, n):
    """Yield tuples of *n* consecutive items from *iterator*.

    Produces nothing when the input has fewer than *n* items.
    """
    # GENERALIZATION/BUG FIX: normalize to an iterator first. Previously,
    # passing a sequence (rather than an iterator) made islice() consume a
    # fresh iterator while the for-loop restarted from the beginning,
    # producing wrong windows.
    it = iter(iterator)
    window = collections.deque(itertools.islice(it, n - 1), maxlen=n)
    for x in it:
        window.append(x)
        yield tuple(window)
def build_entries(entries):
    """Assemble a stream of (entry, next_depth) pairs into nested entry dicts.

    *entries* yields ((depth, title, page), next_depth) pairs, as produced by
    pair_with_next_depth(). Yields each completed top-level entry dict (with
    nested "children" lists) only after all of its descendants have been
    consumed from the stream.
    """
    # Use an explicit stack frame so we can easily jump up multiple levels as
    # needed to resume building ancestors after popping out of deeper levels.
    stack = []  # (depth, entry_dict) tuples

    def finalize_completed_entries(target_depth=None):
        # Pop and finalize entries from stack that are at or deeper than
        # target_depth. If target_depth is None (we're at the outermost frame),
        # finalize all entries.
        while stack and (target_depth is None or target_depth <= stack[-1][0]):
            completed_depth, completed_entry = stack.pop()
            if stack:
                # Parent lives in the next-shallower frame.
                parent_depth, parent_entry = stack[-1]
                if "children" not in parent_entry:
                    parent_entry["children"] = []
                parent_entry["children"].append(completed_entry)
            else:
                # We're at the outer stack frame. Yield any remaining entries
                # directly.
                yield completed_entry

    for ((depth, title, page), next_depth) in entries:
        # If we've returned to a shallower depth, pop and yield any completed
        # entries before continuing to process the current entry.
        yield from finalize_completed_entries(depth)
        entry = {"title": title, "dest": page}
        stack.append((depth, entry))
    # Yield any remaining entries in stack.
    yield from finalize_completed_entries()
def render_entry(entry):
    """Write one entry as a JSON object on stdout (concatenated-JSON stream)."""
    sys.stdout.write(json.dumps(entry))
if __name__ == "__main__": | |
main() |
Do you have an example
outline.json
file you could share?
This works for me:
[
{
"title": "First chapter",
"dest": 0,
"children": [
{
"title": "Subsection one point one",
"dest": 1
},
{
"title": "Subsection one point two",
"dest": 2
}
]
},
{
"title": "Second chapter",
"dest": 3
}
]
Yes, the above outline should work. I've also just added a script to make it a bit easier to write outlines as plain text files with minimal structure. This should make it easy to directly type up outlines from TOCs, etc.
For example:
Contents 0
1. Chapter 1 1
1.1. Subchapter 1.1. 2
1.2 Subchapter 1.2 3
2. Chapter 2 4
2.1 Subchapter 2.1 5
Index 6
The last token per line is interpreted as the destination page (zero-indexed, as in the standard JSON format).
Since it renders to JSON, you can also do various JSON transformations (e.g., handling conversion of front-matter pagination and main body pagination to different offsets).
There seems to be a problem, though, when the indent depth drops from 2 to 0.
For Example:
Contents 0
1. Chapter 1 1
1.1. Subchapter 1.1. 2
1.2 Subchapter 1.2 3
2. Chapter 2 4
2.1 Subchapter 2.1 5
2.1.1 Subsubchapter 6
Index 7
then the depth in the JSON drops only one level. I couldn't figure out where the error is yet.
@Nighel123 I just tested out your sample and it works for me. Note that the base script spits out concatenated JSON objects, so you need to pass the --slurp
option to jq
if you want it to be in a format that Python will understand. I might update/rework this at some point to make it easier to use, because the ergonomics are poor.
It was missing a closing quote before, but see https://gist.github.com/bsidhom/dae50ecc0062a7a1202469860c8eea89#file-rewrite-pdf-outline-sh-L8.
I spoke too soon. It comes out as valid JSON, but I'm guessing the unnesting doesn't happen when there's more than one step. Looking into it now.
@Nighel123 I believe it's fixed now. The issue was indeed that partially-completed ancestors weren't being finalized correctly. I've switched to an explicit stack which makes this a bit more transparent and easier to manage.
Do you have an example
outline.json
file you could share?