Skip to content

Instantly share code, notes, and snippets.

@aqzlpm11
Last active October 6, 2018 16:39
Show Gist options
  • Save aqzlpm11/1f93585d9604763e9197fd686c5f59e2 to your computer and use it in GitHub Desktop.
Save aqzlpm11/1f93585d9604763e9197fd686c5f59e2 to your computer and use it in GitHub Desktop.
cnki add pdf bookmarks
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Origin: https://github.com/RussellLuo/pdfbookmarker
import sys
import os
import re
from PyPDF2 import PdfFileMerger, PdfFileReader
def addBookmarks(pdf_in_filename, bookmarks_tree, pdf_out_filename=None):
    """Add bookmarks to an existing PDF file and write the result to a new file.

    The input PDF is copied (without its existing bookmarks), its document
    metadata is preserved when present, and one bookmark is added for every
    node of `bookmarks_tree`.

    Args:
        pdf_in_filename: path of the PDF file to read.
        bookmarks_tree: list of `(title, pagenum, subtree)` tuples, where
            `subtree` is a (possibly empty) list of the same shape.
            `pagenum` is handed to `PdfFileMerger.addBookmark` unchanged
            (get_bookmarks_tree produces 0-based page indexes).
        pdf_out_filename: path of the PDF file to write. When omitted or
            empty, '-new' is inserted before the extension of
            `pdf_in_filename` (e.g. 'a.pdf' -> 'a-new.pdf').

    Home:
        https://github.com/RussellLuo/pdfbookmarker

    Some useful references:
        [1] http://pybrary.net/pyPdf/
        [2] http://stackoverflow.com/questions/18855907/adding-bookmarks-using-pypdf2
        [3] http://stackoverflow.com/questions/3009935/looking-for-a-good-python-tree-data-structure
    """
    pdf_out = PdfFileMerger()
    try:
        pdf_out.append(pdf_in_filename, import_bookmarks=False)

        # copy/preserve existing metainfo
        pdf_in = PdfFileReader(pdf_in_filename)
        metaInfo = pdf_in.getDocumentInfo()
        if metaInfo:
            pdf_out.addMetadata(metaInfo)

        def crawl_tree(tree, parent):
            # depth-first walk: each bookmark becomes the parent of its subtree
            for title, pagenum, subtree in tree:
                current = pdf_out.addBookmark(title, pagenum, parent)
                if subtree:
                    crawl_tree(subtree, current)

        # add bookmarks into `pdf_out` by crawling `bookmarks_tree`
        crawl_tree(bookmarks_tree, None)

        # derive `pdf_out_filename` if it's not specified
        if not pdf_out_filename:
            root, ext = os.path.splitext(pdf_in_filename)
            pdf_out_filename = root + '-new' + ext

        pdf_out.write(pdf_out_filename)
    finally:
        # The merger keeps the appended input file open until it is closed;
        # always release it, even when reading or writing fails.
        pdf_out.close()
def get_bookmarks_tree(bookmarks_text):
    """Build a nested bookmarks tree from a list of text lines.

    Each line is expected to look like::

        <indent><title> <page>[-<last page>]

    where `<indent>` is made of leading spaces, four spaces per nesting
    level (a partial group of spaces is rounded down). The optional
    `-<last page>` part is accepted but ignored. Lines that do not match
    this shape (for example blank lines) are skipped.

    Args:
        bookmarks_text: list of text lines, one bookmark per line.

    Returns:
        A list of `(title, pagenum, children)` tuples, where `pagenum` is
        the page written on the line minus one and `children` is a list of
        the same shape.

    Raises:
        Exception: if a line is nested more than one level deeper than the
            previous bookmark.

    Example:
        >>> get_bookmarks_tree(['Foreword 1', 'Chapter 1 2-5', '    1.1 Python 2'])
        [('Foreword', 0, []), ('Chapter 1', 1, [('1.1 Python', 1, [])])]

    Thanks Stefan, who shared a perfect solution for Python trees.
    See http://stackoverflow.com/questions/3009935/looking-for-a-good-python-tree-data-structure
    A list is used instead of a dictionary to keep the bookmarks ordered.
    Also thanks Caicono, who inspired the idea of recording bookmark titles
    and page numbers by hand.
    See here: http://www.caicono.cn/wordpress/2010/01/%E6%80%9D%E8%80%83%E5%85%85%E5%88%86%E5%86%8D%E8%A1%8C%E5%8A%A8-python%E8%AF%95%E6%B0%B4%E8%AE%B0.html
    """
    # A plain string would be iterated character by character, so insist on a list.
    assert isinstance(bookmarks_text, list)

    # bookmarks tree
    tree = []

    # The latest node seen at each level (an older node is replaced by a
    # newer one of the same level).
    #   key:   the level of the node (0 is the virtual root)
    #   value: the children list of the node
    latest_nodes = {0: tree}
    prev_level = 0

    # compiled once, outside the loop
    line_pattern = re.compile(r'([ ]*)(.*?)\s*(\d+)(-\d+){0,1}\s*$')

    for line in bookmarks_text:
        res = line_pattern.match(line)
        if not res:
            continue

        indent, title, pagenum, _ = res.groups()
        # Floor division keeps the level an int on Python 3 as well; with `/`
        # the level became a float and odd indents raised KeyError below.
        cur_level = len(indent) // 4 + 1

        # a bookmark may be at most one level deeper than the previous one
        if not (0 < cur_level <= prev_level + 1):
            raise Exception('indentation level is invalid here: %s' % line.strip())

        cur_node = (title, int(pagenum) - 1, [])
        # append the current node to its parent (the latest node one level up)
        latest_nodes[cur_level - 1].append(cur_node)
        latest_nodes[cur_level] = cur_node[2]
        prev_level = cur_level

    return tree
# run as a script
def run_script(pdf_in_filename, bookmarks_text, pdf_out_filename=None):
    """Parse `bookmarks_text` and add the resulting bookmarks to a PDF.

    Progress and the final outcome are reported on stderr. Any exception
    raised while parsing or writing is caught and reported there instead of
    being propagated.

    Args:
        pdf_in_filename: path of the PDF file to read.
        bookmarks_text: list of bookmark lines, as accepted by
            get_bookmarks_tree.
        pdf_out_filename: optional output path, passed on to addBookmarks.
    """
    sys.stderr.write('processing, please wait ...')
    try:
        parsed_tree = get_bookmarks_tree(bookmarks_text)
        addBookmarks(pdf_in_filename, parsed_tree, pdf_out_filename)
    except Exception as error:
        outcome = "failed: \n %s\n" % str(error)
    else:
        outcome = "done! \n"
    # a single write covers both the failure and the success path
    sys.stderr.write(outcome)
# Table of contents to turn into bookmarks, one entry per line:
# "<title> <first page>[-<last page>]". Leading spaces (four per level)
# nest an entry under the previous shallower one.
bookmark_text = """
摘要 5-7
主要缩写对照表 17-19
第一章 绪论 19-34
1.1 数字音频盲取证研究意义 19-23
1.2 数字音频盲取证研究现状 23-31
1.2.1 数字音频盲取证研究机构 23
1.2.2 数字音频盲取证研究现状 23-31
1.3 论文主要工作与章节安排 31-34
结论 130-134
研究总结 130-132
后续工作展望 132-134
参考文献 134-145
攻读博士学位期间取得的研究成果 145-147
致谢 147-149
附件 149
"""
# Runs on import/execution: bookmarks 'a.pdf' from the current directory;
# no output name is given, so run_script's default applies.
run_script(pdf_in_filename='a.pdf', bookmarks_text=bookmark_text.split('\n'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment