aqzlpm11 · October 6, 2018 16:39
diff --git a/add_pdf_bookmarks.py b/add_pdf_bookmarks.py
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 # Origin: https://github.com/RussellLuo/pdfbookmarker
 import sys
 import os
 import re

 from PyPDF2 import PdfFileMerger, PdfFileReader

 def addBookmarks(pdf_in_filename, bookmarks_tree, pdf_out_filename=None):
    """Write a copy of a PDF whose outline is built from `bookmarks_tree`.

    The pages of `pdf_in_filename` are appended without importing their
    existing bookmarks, the document info of the input is copied when it is
    present, and one bookmark is added for every node of `bookmarks_tree`.

    Args:
        pdf_in_filename: path of the PDF to read.
        bookmarks_tree: list of `(title, pagenum, subtree)` tuples; `subtree`
            is a (possibly empty) list of the same shape.  `pagenum` is
            passed unchanged to `PdfFileMerger.addBookmark` -- presumably a
            0-based page index, since `get_bookmarks_tree` subtracts one
            from the listed page number.
        pdf_out_filename: path of the PDF to write.  When empty or None,
            '-new' is inserted before the extension of `pdf_in_filename`.

    Home:
        https://github.com/RussellLuo/pdfbookmarker
    Some useful references:
        [1] http://pybrary.net/pyPdf/
        [2] http://stackoverflow.com/questions/18855907/adding-bookmarks-using-pypdf2
        [3] http://stackoverflow.com/questions/3009935/looking-for-a-good-python-tree-data-structure
    """
    pdf_out = PdfFileMerger()
    try:
        pdf_out.append(pdf_in_filename, import_bookmarks=False)

        # copy/preserve existing metainfo
        meta_info = PdfFileReader(pdf_in_filename).getDocumentInfo()
        if meta_info:
            pdf_out.addMetadata(meta_info)

        def crawl_tree(tree, parent):
            # depth-first: each bookmark becomes the parent of its subtree
            for title, pagenum, subtree in tree:
                current = pdf_out.addBookmark(title, pagenum, parent)
                if subtree:
                    crawl_tree(subtree, current)

        # add bookmarks into `pdf_out` by crawling `bookmarks_tree`
        crawl_tree(bookmarks_tree, None)

        # get `pdf_out_filename` if it's not specified
        if not pdf_out_filename:
            base, ext = os.path.splitext(pdf_in_filename)
            pdf_out_filename = base + '-new' + ext

        pdf_out.write(pdf_out_filename)
    finally:
        # release the file handles held by the merger, even when a step
        # above failed (see PdfFileMerger.close in the PyPDF2 docs)
        pdf_out.close()

 def get_bookmarks_tree(bookmarks_text):
    """Parse indented outline lines into a nested bookmarks tree.

    Each meaningful line looks like `<indent><title> <page>` or
    `<indent><title> <first>-<last>`; only the first page of a range is
    used.  Lines that do not end in a page number (e.g. blank lines) are
    skipped.  The nesting level is one plus the number of leading spaces
    divided by four (rounded down), so a top-level entry has 0-3 leading
    spaces and every further level adds four.

    Args:
        bookmarks_text: list of text lines.

    Returns:
        A list of `(title, page_index, children)` tuples, where `page_index`
        is the listed page number minus one and `children` is a list of the
        same shape.  For example
            ['Foreword 1', 'Chapter 1 2-9', '    1.1 Python 2']
        yields
            [('Foreword', 0, []), ('Chapter 1', 1, [('1.1 Python', 1, [])])]

    Raises:
        TypeError: if `bookmarks_text` is not a list.
        ValueError: if a line is nested more than one level deeper than the
            entry before it.

    Thanks Stefan, who shared a perfect solution for Python trees.
    See http://stackoverflow.com/questions/3009935/looking-for-a-good-python-tree-data-structure
    Since dictionaries in Python are unordered, lists are used instead.
    Also thanks Caicono, who inspired the idea of recording bookmark titles and page numbers by hand.
    See here: http://www.caicono.cn/wordpress/2010/01/%E6%80%9D%E8%80%83%E5%85%85%E5%88%86%E5%86%8D%E8%A1%8C%E5%8A%A8-python%E8%AF%95%E6%B0%B4%E8%AE%B0.html
    """
    if not isinstance(bookmarks_text, list):
        # iterating a plain string would silently walk it char by char
        raise TypeError('bookmarks_text must be a list of lines')

    # groups: leading spaces, title, first page, optional "-last" suffix
    line_pattern = re.compile(r'([ ]*)(.*?)\s*(\d+)(-\d+){0,1}\s*$')

    # bookmarks tree
    tree = []

    # the latest nodes (an old node is replaced by a new one of the same level)
    # `key`: the level of the node (0 is the root)
    # `value`: the children list of the node
    latest_nodes = {0: tree}

    prev_level = 0
    for line in bookmarks_text:
        res = line_pattern.match(line)
        if not res:
            continue
        indent, title, pagenum, _ = res.groups()

        # floor division keeps the level an int on Python 3 as well, so an
        # indentation that is not a multiple of four cannot produce a
        # fractional (and therefore missing) key in `latest_nodes`
        cur_level = len(indent) // 4 + 1
        if cur_level > prev_level + 1:
            raise ValueError('indentation level is invalid here: %s' % line.strip())

        cur_node = (title, int(pagenum) - 1, [])
        # append the current node into its parent node (level `cur_level` - 1)
        latest_nodes[cur_level - 1].append(cur_node)
        latest_nodes[cur_level] = cur_node[2]
        prev_level = cur_level

    return tree

 # run as a script
 def run_script(pdf_in_filename, bookmarks_text, pdf_out_filename=None):
    """Parse `bookmarks_text`, add the bookmarks to the PDF, report on stderr.

    Progress, success and failure messages are written to `sys.stderr`.
    Any `Exception` raised while parsing or writing is reported there
    instead of being propagated.

    Args:
        pdf_in_filename: forwarded to `addBookmarks`.
        bookmarks_text: forwarded to `get_bookmarks_tree`.
        pdf_out_filename: forwarded to `addBookmarks`.
    """
    report = sys.stderr.write
    report('processing, please wait ...')
    try:
        outline = get_bookmarks_tree(bookmarks_text)
        addBookmarks(pdf_in_filename, outline, pdf_out_filename)
    except Exception as err:
        report("failed:         \n  %s\n" % str(err))
        return
    report("done!           \n")

 # Outline to add to the PDF: one entry per line, written as a title followed
 # by a page number or a page range.  `get_bookmarks_tree` derives the nesting
 # level of an entry from its number of leading spaces.
 bookmark_text = """
 摘要  5-7
 主要缩写对照表 17-19
 第一章 绪论  19-34
    1.1 数字音频盲取证研究意义 19-23
    1.2 数字音频盲取证研究现状 23-31
        1.2.1 数字音频盲取证研究机构   23
        1.2.2 数字音频盲取证研究现状   23-31
    1.3 论文主要工作与章节安排 31-34
 结论  130-134
    研究总结    130-132
    后续工作展望  132-134
 参考文献    134-145
 攻读博士学位期间取得的研究成果 145-147
 致谢  147-149
 附件  149
 """

 # the parser expects a list of lines, hence the split
 run_script(pdf_in_filename='a.pdf', bookmarks_text=bookmark_text.split('\n'))
	#!/usr/bin/env python
	# -- coding: utf-8 --
	# Origin: https://github.com/RussellLuo/pdfbookmarker
	import sys
	import os
	import re

	from PyPDF2 import PdfFileMerger, PdfFileReader

	def addBookmarks(pdf_in_filename, bookmarks_tree, pdf_out_filename=None):
	    """Write a copy of a PDF whose outline is built from `bookmarks_tree`.

	    The pages of `pdf_in_filename` are appended without importing their
	    existing bookmarks, the document info of the input is copied when it
	    is present, and one bookmark is added for every node of
	    `bookmarks_tree`.

	    Args:
	        pdf_in_filename: path of the PDF to read.
	        bookmarks_tree: list of `(title, pagenum, subtree)` tuples;
	            `subtree` is a (possibly empty) list of the same shape.
	            `pagenum` is passed unchanged to `PdfFileMerger.addBookmark`
	            -- presumably a 0-based page index, since
	            `get_bookmarks_tree` subtracts one from the listed page.
	        pdf_out_filename: path of the PDF to write.  When empty or None,
	            '-new' is inserted before the extension of `pdf_in_filename`.

	    Home:
	        https://github.com/RussellLuo/pdfbookmarker
	    Some useful references:
	        [1] http://pybrary.net/pyPdf/
	        [2] http://stackoverflow.com/questions/18855907/adding-bookmarks-using-pypdf2
	        [3] http://stackoverflow.com/questions/3009935/looking-for-a-good-python-tree-data-structure
	    """
	    pdf_out = PdfFileMerger()
	    try:
	        pdf_out.append(pdf_in_filename, import_bookmarks=False)

	        # copy/preserve existing metainfo
	        meta_info = PdfFileReader(pdf_in_filename).getDocumentInfo()
	        if meta_info:
	            pdf_out.addMetadata(meta_info)

	        def crawl_tree(tree, parent):
	            # depth-first: each bookmark becomes the parent of its subtree
	            for title, pagenum, subtree in tree:
	                current = pdf_out.addBookmark(title, pagenum, parent)
	                if subtree:
	                    crawl_tree(subtree, current)

	        # add bookmarks into `pdf_out` by crawling `bookmarks_tree`
	        crawl_tree(bookmarks_tree, None)

	        # get `pdf_out_filename` if it's not specified
	        if not pdf_out_filename:
	            base, ext = os.path.splitext(pdf_in_filename)
	            pdf_out_filename = base + '-new' + ext

	        pdf_out.write(pdf_out_filename)
	    finally:
	        # release the file handles held by the merger, even when a step
	        # above failed (see PdfFileMerger.close in the PyPDF2 docs)
	        pdf_out.close()

	def get_bookmarks_tree(bookmarks_text):
	    """Parse indented outline lines into a nested bookmarks tree.

	    Each meaningful line looks like `<indent><title> <page>` or
	    `<indent><title> <first>-<last>`; only the first page of a range is
	    used.  Lines that do not end in a page number (e.g. blank lines) are
	    skipped.  The nesting level is one plus the number of leading spaces
	    divided by four (rounded down), so a top-level entry has 0-3 leading
	    spaces and every further level adds four.

	    Args:
	        bookmarks_text: list of text lines.

	    Returns:
	        A list of `(title, page_index, children)` tuples, where
	        `page_index` is the listed page number minus one and `children`
	        is a list of the same shape.  For example
	            ['Foreword 1', 'Chapter 1 2-9', '    1.1 Python 2']
	        yields
	            [('Foreword', 0, []), ('Chapter 1', 1, [('1.1 Python', 1, [])])]

	    Raises:
	        TypeError: if `bookmarks_text` is not a list.
	        ValueError: if a line is nested more than one level deeper than
	            the entry before it.

	    Thanks Stefan, who shared a perfect solution for Python trees.
	    See http://stackoverflow.com/questions/3009935/looking-for-a-good-python-tree-data-structure
	    Since dictionaries in Python are unordered, lists are used instead.
	    Also thanks Caicono, who inspired the idea of recording bookmark titles and page numbers by hand.
	    See here: http://www.caicono.cn/wordpress/2010/01/%E6%80%9D%E8%80%83%E5%85%85%E5%88%86%E5%86%8D%E8%A1%8C%E5%8A%A8-python%E8%AF%95%E6%B0%B4%E8%AE%B0.html
	    """
	    if not isinstance(bookmarks_text, list):
	        # iterating a plain string would silently walk it char by char
	        raise TypeError('bookmarks_text must be a list of lines')

	    # groups: leading spaces, title, first page, optional "-last" suffix.
	    # The `*` / `*?` quantifiers are required: without them the pattern
	    # only matches a one-space indent followed by a one-character title.
	    line_pattern = re.compile(r'([ ]*)(.*?)\s*(\d+)(-\d+){0,1}\s*$')

	    # bookmarks tree
	    tree = []

	    # the latest nodes (an old node is replaced by a new one of the same level)
	    # `key`: the level of the node (0 is the root)
	    # `value`: the children list of the node
	    latest_nodes = {0: tree}

	    prev_level = 0
	    for line in bookmarks_text:
	        res = line_pattern.match(line)
	        if not res:
	            continue
	        indent, title, pagenum, _ = res.groups()

	        # floor division keeps the level an int on Python 3 as well, so an
	        # indentation that is not a multiple of four cannot produce a
	        # fractional (and therefore missing) key in `latest_nodes`
	        cur_level = len(indent) // 4 + 1
	        if cur_level > prev_level + 1:
	            raise ValueError('indentation level is invalid here: %s' % line.strip())

	        cur_node = (title, int(pagenum) - 1, [])
	        # append the current node into its parent node (level `cur_level` - 1)
	        latest_nodes[cur_level - 1].append(cur_node)
	        latest_nodes[cur_level] = cur_node[2]
	        prev_level = cur_level

	    return tree

	# run as a script
	def run_script(pdf_in_filename, bookmarks_text, pdf_out_filename=None):
	    """Parse `bookmarks_text`, add the bookmarks to the PDF, report on stderr.

	    Progress, success and failure messages are written to `sys.stderr`.
	    Any `Exception` raised while parsing or writing is reported there
	    instead of being propagated.

	    Args:
	        pdf_in_filename: forwarded to `addBookmarks`.
	        bookmarks_text: forwarded to `get_bookmarks_tree`.
	        pdf_out_filename: forwarded to `addBookmarks`.
	    """
	    sys.stderr.write('processing, please wait ...')
	    try:
	        bookmarks_tree = get_bookmarks_tree(bookmarks_text)
	        addBookmarks(pdf_in_filename, bookmarks_tree, pdf_out_filename)
	    except Exception as e:
	        # top-level boundary of the script: report instead of crashing
	        sys.stderr.write("failed: \n %s\n" % str(e))
	    else:
	        sys.stderr.write("done! \n")

	# Outline to add to the PDF: one entry per line, written as a title
	# followed by a page number or a page range.
	# NOTE(review): the entries below carry no nesting indentation -- it looks
	# like it was lost when this copy was pasted; confirm the intended levels.
	bookmark_text = """
	摘要 5-7
	主要缩写对照表 17-19
	第一章绪论 19-34
	1.1 数字音频盲取证研究意义 19-23
	1.2 数字音频盲取证研究现状 23-31
	1.2.1 数字音频盲取证研究机构 23
	1.2.2 数字音频盲取证研究现状 23-31
	1.3 论文主要工作与章节安排 31-34
	结论 130-134
	研究总结 130-132
	后续工作展望 132-134
	参考文献 134-145
	攻读博士学位期间取得的研究成果 145-147
	致谢 147-149
	附件 149
	"""

	# the parser expects a list of lines, hence the split
	run_script(pdf_in_filename='a.pdf', bookmarks_text=bookmark_text.split('\n'))