-
-
Save bsidhom/dae50ecc0062a7a1202469860c8eea89 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3 | |
# TODO: Add more documentation for this script. | |
from __future__ import annotations | |
import argparse | |
import collections | |
import itertools | |
import json | |
import re | |
import sys | |
def main():
    """Parse CLI flags, shift outline page numbers by the seam map, and print JSON."""
    arg_parser = argparse.ArgumentParser(
        description=(
            "Shift page numbers in a JSON outline based on \"seams\" in a book "
            "(gaps in page numbers). Note that these gaps are expected to be "
            "monotonically increasing."))
    arg_parser.add_argument(
        "--outline",
        required=True,
        help="Outline file with nominal pages (in JSON format).")
    arg_parser.add_argument(
        "--seams",
        required=True,
        help=("Seams as JSON array of arrays of the form [nominal, physical] "
              "page numbers. Note that \"physical\" here means the output PDF index. "
              "\"Nominal\" is the page number that appears on a page or in the book "
              "index/TOC."))
    opts = arg_parser.parse_args()
    # Resolve every destination through the nominal->physical page map.
    page_map = read_page_map(opts.seams)
    outline = read_outline(opts.outline)
    adjust_outline(page_map, outline)
    write_outline(outline)
def read_page_map(seams_file) -> PageMap:
    """Load the seams JSON file and wrap it in a PageMap."""
    with open(seams_file) as fp:
        return PageMap(json.load(fp))
def read_outline(fname):
    """Parse and return the JSON outline stored in *fname*."""
    with open(fname) as fp:
        return json.load(fp)
def adjust_outline(page_map, outline):
    """Rewrite every top-level entry's destination in place."""
    for top_entry in outline:
        adjust_entry(page_map, top_entry)
def adjust_entry(page_map, entry):
    """Resolve this entry's "dest" through *page_map*, then recurse into children."""
    entry["dest"] = page_map.resolve(entry["dest"])
    for child in entry.get("children", ()):
        adjust_entry(page_map, child)
def write_outline(outline):
    """Serialize the adjusted outline as compact JSON on stdout."""
    sys.stdout.write(json.dumps(outline))
class PageMap:
    """Maps nominal (printed) page numbers to physical (PDF) page indices.

    Built from a sequence of [nominal, physical] "seam" pairs; each seam
    marks the point at which a new nominal-to-physical offset takes effect.
    """

    def __init__(self, seams):
        """Create a map from an iterable of (nominal, physical) pairs.

        Raises:
            ValueError: if *seams* is empty.
        """
        if not seams:
            # ValueError (a subclass of Exception, so existing handlers still
            # work) is the idiomatic error for a bad argument.
            raise ValueError("must have at least one page map entry")
        # Sort by nominal page so resolve() can scan seams in order.
        ordered = sorted(seams, key=lambda seam: seam[0])
        self._seams = tuple(tuple(seam) for seam in ordered)

    def resolve(self, page: int) -> int:
        """Return the 0-indexed physical page for nominal *page*.

        We assume that everything up to the _first_ marked "seam" uses the
        same nominal-to-physical offset. After that point, seams are marked
        in order of increasing nominal page number.
        """
        result = page + self._seams[0][1] - self._seams[0][0]
        # There are typically few seams (missing pages) in a book, so we
        # don't bother with binary search.
        for nominal, physical in self._seams[1:]:
            if nominal > page:
                break
            result = physical - nominal + page
        # The physical number above is 1-indexed (PDF readers number pages
        # that way), but PDF outlines take 0-indexed page numbers.
        return result - 1
if __name__ == "__main__": | |
main() |
#!/usr/bin/env python3 | |
from __future__ import annotations | |
import argparse | |
import json | |
import sys | |
def main():
    """Parse CLI flags, splice the outline into the qpdf JSON, print the result."""
    # BUG FIX: the original passed the description as the first positional
    # argument, which argparse interprets as the program name (prog=).
    parser = argparse.ArgumentParser(description="Rewrite PDF outlines")
    parser.add_argument("--json",
                        help="JSON file created by qpdf",
                        required=True)
    parser.add_argument("--outline",
                        help="Your new outline file, in JSON format",
                        required=True)
    parser.add_argument("--input",
                        help="Original input PDF file to update",
                        required=True)
    parser.add_argument(
        "--offset",
        # BUG FIX: without type=int, a value supplied on the command line
        # arrives as a str and breaks the page arithmetic downstream; only
        # the default was an int.
        type=int,
        help="Page offset to add to each target in the outline JSON",
        default=0)
    args = parser.parse_args()
    j = build_output_json(args.json, args.outline, args.offset)
    json.dump(j, sys.stdout)
    sys.stdout.flush()
def build_output_json(json_fname: str, outline_fname: str, offset: int):
    """Return the qpdf JSON dict with a fresh /Outlines tree spliced in.

    Args:
        json_fname: path to the qpdf --json dump of the input PDF.
        outline_fname: path to the outline description (JSON list of
            {title, dest, children?} objects).
        offset: page offset added to every destination index.
    """
    with open(json_fname) as f:
        j = json.load(f)
    with open(outline_fname) as f:
        outline = json.load(f)
    # Page object references in document order; "dest" values index this list.
    pages = [page["object"] for page in j["pages"]]
    next_object_id = j["qpdf"][0]["maxobjectid"] + 1
    ids = ObjectIdAllocator(next_object_id)
    catalog = get_catalog(j)
    outlines_id = ids.next_id()
    outlines = insert_new_object(j, outlines_id)
    outlines["/Type"] = "/Outlines"
    bookmarks = []
    for item in outline:
        bookmark = add_outline_item(j, pages, item, outlines_id, offset, ids)
        bookmarks.append(bookmark)
    # Thread the sibling linked list (/Next, /Prev) between adjacent bookmarks.
    for ((bm_id, bookmark), (next_id,
                             next_bookmark)) in zip(bookmarks, bookmarks[1:]):
        bookmark["/Next"] = f"{next_id} 0 R"
        next_bookmark["/Prev"] = f"{bm_id} 0 R"
    catalog["/Outlines"] = f"{outlines_id} 0 R"
    if bookmarks:
        # ROBUSTNESS: an empty outline previously raised IndexError here.
        outlines["/First"] = f"{bookmarks[0][0]} 0 R"
        outlines["/Last"] = f"{bookmarks[-1][0]} 0 R"
    outlines["/Count"] = len(bookmarks)
    return j
def get_catalog(j):
    """Locate and return the /Catalog object's value dict in a qpdf dump.

    Raises:
        Exception: when the dump contains no /Catalog object.
    """
    objects = j["qpdf"][1]
    for key, obj in objects.items():
        # Only indirect-object entries look like "obj:<id> <gen> R".
        if not key.startswith("obj:"):
            continue
        if "value" not in obj:
            continue
        value = obj["value"]
        if "/Type" not in value:
            continue
        if value["/Type"] == "/Catalog":
            return value
    raise Exception("could not find a PDF /Catalog")
def add_outline_item(j, pages, item, parent_id, offset: int,
                     ids: ObjectIdAllocator):
    """Insert one bookmark object (and its descendants) into the qpdf JSON.

    Returns (object_id, bookmark_value_dict) so the caller can thread the
    sibling /Next and /Prev links between adjacent bookmarks.
    """
    obj_id = ids.next_id()
    # Look up the page reference before mutating the JSON tree.
    target_page = pages[item["dest"] + offset]
    bookmark = insert_new_object(j, obj_id)
    # Jump to the page top-left without changing the zoom level.
    bookmark["/Dest"] = [target_page, "/XYZ", None, None, None]
    bookmark["/Parent"] = f"{parent_id} 0 R"
    bookmark["/Title"] = f"u:{item['title']}"
    if "children" in item:
        child_pairs = [
            add_outline_item(j, pages, child, obj_id, offset, ids)
            for child in item["children"]
        ]
        # Link adjacent children into a doubly-linked sibling list.
        for ((child_id, child_bm),
             (next_id, next_bm)) in zip(child_pairs, child_pairs[1:]):
            child_bm["/Next"] = f"{next_id} 0 R"
            next_bm["/Prev"] = f"{child_id} 0 R"
        bookmark["/First"] = f"{child_pairs[0][0]} 0 R"
        bookmark["/Last"] = f"{child_pairs[-1][0]} 0 R"
        bookmark["/Count"] = len(child_pairs)
    return (obj_id, bookmark)
def insert_new_object(j, id):
    """Register an empty indirect object under "obj:<id> 0 R" and return its value dict."""
    value = {}
    j["qpdf"][1][f"obj:{id} 0 R"] = {"value": value}
    return value
class ObjectIdAllocator:
    """Hands out consecutive, previously-unused PDF object ids."""

    def __init__(self, next_id: int):
        # The next id to hand out; advances on every next_id() call.
        self._next_id = next_id

    def next_id(self):
        """Return a fresh object id and advance the counter."""
        allocated = self._next_id
        self._next_id += 1
        return allocated
if __name__ == "__main__": | |
main() |
# Clean up PDF for ingestion
qpdf --decrypt --object-streams=disable original.pdf in.pdf
# Create JSON dump of relevant metadata
qpdf --json in.pdf in.json
# Create outline JSON
vim outline.json
# Or, alternatively, create the outline as an indented text file and convert it to JSON.
# BUG FIX: write the converted JSON to outline.json (not back over outline.txt),
# so the rewrite step below finds it under the expected name.
vim outline.txt && ./text_to_json.py --increasing-page-numbers <outline.txt | jq --slurp '.' | sponge outline.json
# Write outline data into JSON dump, overwriting old outline if any.
./rewrite-pdf-outline.py --json in.json --outline outline.json --input in.pdf >out.json
# Write output JSON data into final PDF.
qpdf in.pdf out.pdf --update-from-json=out.json
#!/usr/bin/env python3 | |
import argparse | |
import collections | |
import itertools | |
import json | |
import re | |
import sys | |
def main():
    """Convert an indented text outline on stdin to concatenated JSON on stdout."""
    arg_parser = argparse.ArgumentParser(
        description="Convert a text outline to JSON")
    arg_parser.add_argument("--increasing-page-numbers", action="store_true")
    options = arg_parser.parse_args()
    entries = read_entries(options.increasing_page_numbers)
    render_entries(entries)
def read_entries(validate_increasing_page_numbers):
    """Yield (depth, title, page) tuples parsed from stdin, one per line.

    Args:
        validate_increasing_page_numbers: when true, raise on any page
            number lower than the previous entry's.

    Raises:
        Exception: on decreasing page numbers or inconsistent indentation.
    """
    prev_page = None
    indent_stack = initialize_indent_stack()
    # BUG FIX: start=1 so error messages report conventional 1-based line
    # numbers (they were previously off by one).
    for line_number, line in enumerate(sys.stdin, start=1):
        space, title, page = parse_line(line)
        if (validate_increasing_page_numbers and prev_page is not None
                and page < prev_page):
            raise Exception(f"decreasing page number at line {line_number}")
        update_indent_stack(space, indent_stack, line_number)
        # The stack holds one entry per nesting level, so its size is the depth.
        depth = len(indent_stack)
        yield (depth, title, page)
        prev_page = page
WHITESPACE = re.compile(r"\s+")


def parse_line(line):
    """Split an outline line into (leading_space, title, page).

    The last whitespace-separated token is the (integer) page number;
    everything before it, minus leading indentation, is the title.

    Raises:
        ValueError: if the line lacks a title plus trailing page number, or
            if the final token is not an integer.
    """
    line = line.rstrip()
    try:
        title, page = line.rsplit(maxsplit=1)
    except ValueError:
        # ROBUSTNESS: a blank line or one-token line used to surface as an
        # opaque tuple-unpacking error.
        raise ValueError(f"expected '<title> <page>' but got {line!r}") from None
    m = WHITESPACE.match(title)
    space = m.group(0) if m else ""
    title = title[len(space):]
    return (space, title, int(page))
def initialize_indent_stack():
    """Return a fresh, empty indentation stack (one string per nesting level)."""
    stack = []
    return stack
def update_indent_stack(space, indent_stack, line_number):
    """Mutate *indent_stack* to reflect the leading *space* of the current line.

    We don't care _which_ characters are used to indent as long as they are
    consistent at each level. After this call, len(indent_stack) is the
    nesting depth of the current line.

    Raises:
        Exception: when the indentation is inconsistent with previous lines.
    """
    if len(space) == 0:
        # Zero out any existing indentation.
        indent_stack.clear()
    elif len(indent_stack) == 0:
        # We have non-empty leading space but an empty stack, so this is the
        # first level of indentation.
        indent_stack.append(space)
    else:
        # We have a non-empty indentation stack _and_ non-empty leading space.
        # We need to confirm that there's some level of shared prefix with
        # existing indentation.
        last_indent = indent_stack[-1]
        if len(space) > len(last_indent):
            # Deeper level of indentation than before. This is only valid if
            # it starts with the previous indentation characters.
            if space.startswith(last_indent):
                indent_stack.append(space)
            else:
                raise Exception(f"invalid indentation at line {line_number}")
        else:
            # This must correspond to a _shallower_ level of indentation than
            # before and, moreover, must _exactly_ match one of our previous
            # indentation levels. (It's not valid to de-indent to some
            # previous depth and then re-indent to a deeper level with new
            # space characters).
            if not last_indent.startswith(space):
                raise Exception(
                    f"invalid de-indentation at line {line_number}")
            while indent_stack and len(indent_stack[-1]) != len(space):
                indent_stack.pop()
            # BUG FIX: a de-indent that matches no recorded level previously
            # crashed with IndexError on indent_stack[-1]; report it as bad
            # input instead.
            if not indent_stack:
                raise Exception(
                    f"invalid de-indentation at line {line_number}")
def render_entries(entries):
    """Emit one JSON object per fully-assembled top-level entry."""
    paired = pair_with_next_depth(entries)
    for built in build_entries(paired):
        render_entry(built)
def pair_with_next_depth(entries):
    """Yield (entry, depth_of_next_entry) pairs; the final pair carries None."""
    prev = None
    for entry in entries:
        if prev is not None:
            # The follower's depth is the first element of its tuple.
            yield (prev, entry[0])
        prev = entry
    if prev is not None:
        yield (prev, None)
def sliding_window(iterator, n):
    """Yield tuples of *n* consecutive items from *iterator*.

    Produces nothing when the input has fewer than *n* items.
    """
    # GENERALIZATION/BUG FIX: normalize to an iterator first. Previously,
    # passing a sequence (rather than an iterator) made islice() consume a
    # fresh iterator while the for-loop restarted from the beginning,
    # producing wrong windows.
    it = iter(iterator)
    window = collections.deque(itertools.islice(it, n - 1), maxlen=n)
    for x in it:
        window.append(x)
        yield tuple(window)
def build_entries(entries):
    """Assemble a stream of (entry, next_depth) pairs into nested entry dicts.

    *entries* yields ((depth, title, page), next_depth) pairs, as produced by
    pair_with_next_depth(). Yields each completed top-level entry dict (with
    nested "children" lists) only after all of its descendants have been
    consumed from the stream.
    """
    # Use an explicit stack frame so we can easily jump up multiple levels as
    # needed to resume building ancestors after popping out of deeper levels.
    stack = []  # (depth, entry_dict) tuples

    def finalize_completed_entries(target_depth=None):
        # Pop and finalize entries from stack that are at or deeper than
        # target_depth. If target_depth is None (we're at the outermost frame),
        # finalize all entries.
        while stack and (target_depth is None or target_depth <= stack[-1][0]):
            completed_depth, completed_entry = stack.pop()
            if stack:
                # Parent lives in the next-shallower frame.
                parent_depth, parent_entry = stack[-1]
                if "children" not in parent_entry:
                    parent_entry["children"] = []
                parent_entry["children"].append(completed_entry)
            else:
                # We're at the outer stack frame. Yield any remaining entries
                # directly.
                yield completed_entry

    for ((depth, title, page), next_depth) in entries:
        # If we've returned to a shallower depth, pop and yield any completed
        # entries before continuing to process the current entry.
        yield from finalize_completed_entries(depth)
        entry = {"title": title, "dest": page}
        stack.append((depth, entry))
    # Yield any remaining entries in stack.
    yield from finalize_completed_entries()
def render_entry(entry):
    """Write one entry as a JSON object on stdout (concatenated-JSON stream)."""
    sys.stdout.write(json.dumps(entry))
if __name__ == "__main__": | |
main() |
Do you have an example
outline.json
file you could share?
This works for me:
[
{
"title": "First chapter",
"dest": 0,
"children": [
{
"title": "Subsection one point one",
"dest": 1
},
{
"title": "Subsection one point two",
"dest": 2
}
]
},
{
"title": "Second chapter",
"dest": 3
}
]
Yes, the above outline should work. I've also just added a script to make it a bit easier to write outlines as plain text files with minimal structure. This should make it easy to directly type up outlines from TOCs, etc.
For example:
Contents 0
1. Chapter 1 1
1.1. Subchapter 1.1. 2
1.2 Subchapter 1.2 3
2. Chapter 2 4
2.1 Subchapter 2.1 5
Index 6
The last token per line is interpreted as the destination page (zero-indexed, as in the standard JSON format).
Since it renders to JSON, you can also do various JSON transformations (e.g., handling conversion of front-matter pagination and main body pagination to different offsets).
There seems to be a problem, though, when the indent depth drops from 2 to 0.
For Example:
Contents 0
1. Chapter 1 1
1.1. Subchapter 1.1. 2
1.2 Subchapter 1.2 3
2. Chapter 2 4
2.1 Subchapter 2.1 5
2.1.1 Subsubchapter 6
Index 7
then the depth in the JSON drops only one level. I couldn't figure out where the error is yet.
@Nighel123 I just tested out your sample and it works for me. Note that the base script spits out concatenated JSON objects, so you need to pass the --slurp
option to jq
if you want it to be in a format that Python will understand. I might update/rework this at some point to make it easier to use, because the ergonomics are poor.
It was missing a closing quote before, but see https://gist.github.com/bsidhom/dae50ecc0062a7a1202469860c8eea89#file-rewrite-pdf-outline-sh-L8.
I spoke too soon. It comes out as valid JSON, but I'm guessing the unnesting doesn't happen when there's more than one step. Looking into it now.
@Nighel123 I believe it's fixed now. The issue was indeed that partially-completed ancestors weren't being finalized correctly. I've switched to an explicit stack which makes this a bit more transparent and easier to manage.
Do you have an example
outline.json
file you could share?