Last active
September 16, 2019 11:07
-
-
Save kspeeckaert/b51e029af2f92e05b4e7ca1700d341ad to your computer and use it in GitHub Desktop.
Extract the table of contents from a PDF file and save it as an OPML, e.g. for import into a mind map
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Requirements
# yattag==1.12.2
# PyPDF2==1.26.0
# Tested with Python 3.7.4 on macOS
import sys | |
from pathlib import Path | |
from PyPDF2 import PdfFileReader | |
from yattag import Doc | |
from yattag import indent | |
def dump_outline(doc, outline):
    """Emit one OPML ``outline`` element per entry of a nested PDF outline.

    The outline is a flat list in which the children of the entry at
    position ``i`` are stored as a nested list at position ``i + 1``.
    Entries with children become an opening/closing ``outline`` tag pair
    wrapping the recursively emitted children; entries without children
    become a self-closing ``outline`` tag.

    Args:
        doc: Document builder exposing ``tag(name, text=...)`` (a context
            manager) and ``stag(name, text=...)``.
        outline: List of outline entries (objects with a ``title``
            attribute) interleaved with nested lists of child entries.
    """
    idx = 0
    count = len(outline)
    while idx < count:
        node = outline[idx]

        if isinstance(node, list):
            # A nested list that does not directly follow a parent entry.
            # NOTE(review): presumably produced when the parent bookmark
            # could not be resolved by the PDF library - confirm. Its
            # entries are emitted at the current level rather than failing
            # on the missing ``title`` attribute.
            dump_outline(doc, node)
            idx += 1
            continue

        # Explicit bounds check instead of catching IndexError, so that an
        # IndexError raised while emitting children is never swallowed.
        has_children = idx + 1 < count and isinstance(outline[idx + 1], list)
        if has_children:
            with doc.tag('outline', text=node.title):
                dump_outline(doc, outline[idx + 1])
            # Jump over the parent and its list of child nodes
            idx += 2
        else:
            # Use a self-closing tag, there are no child nodes
            doc.stag('outline', text=node.title)
            idx += 1
def main(pdf_filename):
    """Convert the outline of a PDF file into an OPML document.

    The OPML file is saved in the same location as the PDF file, using the
    same name with an ``.opml`` suffix. Any failure is reported on stdout
    as ``Error: <exception repr>`` instead of being raised.

    Args:
        pdf_filename: Path of the PDF file whose outline is exported.
    """
    try:
        input_file = Path(pdf_filename)
        # The OPML file is saved in the same location as the PDF file
        output_file = input_file.with_suffix('.opml')

        doc, tag, _text, line = Doc().ttl()
        doc.asis('<?xml version="1.0" encoding="UTF-8"?>')

        # Keep the PDF stream open until everything has been read from the
        # reader (it may read from the stream lazily - TODO confirm), and
        # make sure the handle is closed afterwards.
        with open(input_file, 'rb') as pdf_stream:
            pdf_doc = PdfFileReader(pdf_stream)
            with tag('opml', version='1.0'):
                with tag('head'):
                    # If the document has no info dictionary or no (or an
                    # empty) title, use the PDF filename without extension.
                    info = pdf_doc.getDocumentInfo()
                    title = info.title if info is not None else None
                    if not title:
                        title = input_file.stem
                    line('title', title)
                with tag('body'):
                    dump_outline(doc, pdf_doc.outlines)

        # Write as UTF-8 so the bytes on disk match the XML declaration,
        # independent of the platform's default encoding.
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(indent(doc.getvalue()))
    except Exception as e:
        # Top-level boundary of the script: report the error, don't crash.
        print(f'Error: {e!r}')
if __name__ == '__main__':
    # The PDF filename (incl path) is passed as an argument on the command line
    if len(sys.argv) < 2:
        # Exit with a usage message (non-zero status) instead of an
        # IndexError traceback when the argument is missing.
        sys.exit(f'Usage: {sys.argv[0]} <pdf_file>')
    main(sys.argv[1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment