Created
April 2, 2012 03:27
-
-
Save sakti/2280363 to your computer and use it in GitHub Desktop.
Extract TOC information from pdf file using pdfminer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# parse_toc.py
from pdfminer.pdfparser import PDFParser, PDFDocument
def parse(filename, maxlevel):
    """Print the outline (table of contents) titles of a PDF file.

    Every outline entry whose level is at most *maxlevel* is printed on
    its own line, prefixed by one space per level plus a separating space.

    Args:
        filename: Path of the PDF file to open (read in binary mode).
        maxlevel: Deepest outline level to print; entries with a larger
            level are skipped.
    """
    # The with-block guarantees the file is closed, even if parsing raises.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        # Link the parser and the document to each other.
        parser.set_document(doc)
        doc.set_parser(parser)
        # Iterate while the file is still open, in case get_outlines()
        # reads from it lazily -- TODO confirm against the pdfminer version.
        for (level, title, dest, a, se) in doc.get_outlines():
            if level <= maxlevel:
                # A single formatted argument prints the same text under
                # both the Python 2 print statement and Python 3 print().
                print('%s %s' % (' ' * level, title))
if __name__ == '__main__':
    # Command-line entry point: parse_toc.py <file.pdf> <max level>
    import sys

    # Exactly two arguments are required: the PDF path and the level.
    if len(sys.argv) != 3:
        # Single formatted argument keeps the output identical under
        # Python 2 and Python 3.
        print('Usage: %s xxx.pdf level' % sys.argv[0])
        sys.exit(2)
    parse(sys.argv[1], int(sys.argv[2]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi sakti,
I adapted your gist to PDFMiner 20140328 here:
https://gist.github.com/tilusnet/407cd845a6b1cb939b34
Feel free to merge back, cheers!