Last active
October 6, 2018 16:39
-
-
Save aqzlpm11/1f93585d9604763e9197fd686c5f59e2 to your computer and use it in GitHub Desktop.
cnki add pdf bookmarks
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# Origin: https://github.com/RussellLuo/pdfbookmarker | |
import sys | |
import os | |
import re | |
from PyPDF2 import PdfFileMerger, PdfFileReader | |
def addBookmarks(pdf_in_filename, bookmarks_tree, pdf_out_filename=None):
    """Add a tree of bookmarks to an existing PDF and write the result.

    Home:
        https://github.com/RussellLuo/pdfbookmarker

    Some useful references:
        [1] http://pybrary.net/pyPdf/
        [2] http://stackoverflow.com/questions/18855907/adding-bookmarks-using-pypdf2
        [3] http://stackoverflow.com/questions/3009935/looking-for-a-good-python-tree-data-structure

    Args:
        pdf_in_filename: Path of the source PDF. Its pages and document
            info are copied; its existing bookmarks are not imported.
        bookmarks_tree: List of ``(title, pagenum, subtree)`` tuples, where
            ``subtree`` is a (possibly empty) list of the same shape.
            ``pagenum`` is handed unchanged to ``PdfFileMerger.addBookmark``
            (``get_bookmarks_tree`` produces page numbers minus one).
        pdf_out_filename: Path of the PDF to write. When falsy, ``-new`` is
            inserted before the extension of ``pdf_in_filename``.

    Returns:
        The path the output PDF was written to.
    """
    pdf_out = PdfFileMerger()
    try:
        pdf_out.append(pdf_in_filename, import_bookmarks=False)

        # copy/preserve existing metainfo
        metaInfo = PdfFileReader(pdf_in_filename).getDocumentInfo()
        if metaInfo:
            pdf_out.addMetadata(metaInfo)

        def crawl_tree(tree, parent):
            # Depth-first walk: each bookmark is created under `parent`, and
            # the object returned for it becomes the parent of its subtree.
            for title, pagenum, subtree in tree:
                current = pdf_out.addBookmark(title, pagenum, parent)
                if subtree:
                    crawl_tree(subtree, current)

        # add bookmarks into `pdf_out` by crawling `bookmarks_tree`
        crawl_tree(bookmarks_tree, None)

        # get `pdf_out_filename` if it's not specified
        if not pdf_out_filename:
            root, ext = os.path.splitext(pdf_in_filename)
            pdf_out_filename = root + '-new' + ext

        pdf_out.write(pdf_out_filename)
    finally:
        # The merger keeps the input file it opened in `append` alive until
        # it is closed; release it even when something above failed.
        pdf_out.close()

    return pdf_out_filename
def get_bookmarks_tree(bookmarks_text, indent_width=4):
    """Build a bookmarks tree from indentation-structured text lines.

    Every usable line looks like ``<indent><title> <page>`` or
    ``<indent><title> <page>-<last page>``. The number of leading spaces,
    floor-divided by ``indent_width``, gives the nesting depth (no indent
    means top level). Only the first page of a range is used, and it is
    stored minus one. Lines that do not end with a page number (for
    example blank lines) are skipped.

    Bookmarks tree structure:

        >>> get_bookmarks_tree(['Foreword 1', 'Chapter 1 2-4', '    1.1 Python 2', 'Chapter 2 5'])
        [('Foreword', 0, []), ('Chapter 1', 1, [('1.1 Python', 1, [])]), ('Chapter 2', 4, [])]

    The above result may be more readable in the following format:

        [
            ('Foreword', 0, []),
            ('Chapter 1', 1,
                [
                    ('1.1 Python', 1, [])
                ]
            ),
            ('Chapter 2', 4, [])
        ]

    Thanks to Stefan, who shared a perfect solution for a Python tree.
    See http://stackoverflow.com/questions/3009935/looking-for-a-good-python-tree-data-structure
    Since a dictionary in Python is unordered, a list is used instead.

    Also thanks to Caicono, who inspired the idea that it is not a bad idea
    to record bookmark titles and page numbers by hand.
    See here: http://www.caicono.cn/wordpress/2010/01/%E6%80%9D%E8%80%83%E5%85%85%E5%88%86%E5%86%8D%E8%A1%8C%E5%8A%A8-python%E8%AF%95%E6%B0%B4%E8%AE%B0.html
    And I think it's the only solution for scanned PDFs to be processed
    automatically.

    Args:
        bookmarks_text: An iterable of text lines, or a single string that
            is split into lines.
        indent_width: Number of leading spaces that make up one nesting
            level. Defaults to 4.

    Returns:
        A list of ``(title, pagenum, children)`` tuples, where ``children``
        is a list of the same shape.

    Raises:
        ValueError: If ``indent_width`` is smaller than 1, or a line is
            nested more than one level deeper than the line before it.
    """
    if indent_width < 1:
        raise ValueError('indent_width must be at least 1: %r' % (indent_width,))

    # Accept a whole block of text as well as an iterable of lines;
    # iterating a string directly would yield single characters.
    if hasattr(bookmarks_text, 'splitlines'):
        bookmarks_text = bookmarks_text.splitlines()

    # groups: leading spaces, title, first page, optional "-last page"
    line_pattern = re.compile(r'([ ]*)(.*?)\s*(\d+)(-\d+){0,1}\s*$')

    # bookmarks tree
    tree = []

    # the latest nodes (the old node will be replaced by a new one if they
    # have the same level)
    #
    # each item (key, value) in the dictionary represents a node
    #     `key`: the level of the node
    #     `value`: the children list of the node
    latest_nodes = {0: tree}
    prev_level = 0

    for line in bookmarks_text:
        res = line_pattern.match(line)
        if not res:
            continue

        indent, title, pagenum, _ = res.groups()
        # Floor division keeps the level an int (usable as a dict key) even
        # when the indent is not an exact multiple of `indent_width`.
        cur_level = len(indent) // indent_width + 1

        # A node may be at most one level deeper than its predecessor,
        # otherwise it would have no parent to attach to.
        if cur_level > prev_level + 1:
            raise ValueError('indentation level is invalid here: %s' % line.strip())

        cur_node = (title, int(pagenum) - 1, [])
        # append the current node into its parent node (level `cur_level` - 1)
        latest_nodes[cur_level - 1].append(cur_node)
        latest_nodes[cur_level] = cur_node[2]
        prev_level = cur_level

    return tree
# run as a script | |
def run_script(pdf_in_filename, bookmarks_text, pdf_out_filename=None):
    """Parse `bookmarks_text` and write a bookmarked copy of the PDF.

    Progress and outcome are reported on stderr. Any exception raised while
    parsing the bookmarks or writing the PDF is reported there as a
    "failed" message instead of propagating to the caller.

    Args:
        pdf_in_filename: Passed through to `addBookmarks`.
        bookmarks_text: Passed to `get_bookmarks_tree`.
        pdf_out_filename: Passed through to `addBookmarks`.
    """
    report = sys.stderr.write
    # No trailing newline: the outcome message continues on the same line.
    report('processing, please wait ...')
    try:
        addBookmarks(
            pdf_in_filename,
            get_bookmarks_tree(bookmarks_text),
            pdf_out_filename,
        )
    except Exception as error:
        report("failed: \n %s\n" % str(error))
        return
    report("done! \n")
# Table-of-contents lines, one bookmark per line, in the
# "<title> <page>" / "<title> <first page>-<last page>" form that
# get_bookmarks_tree parses. The leading and trailing blank lines produced
# by the triple-quoted literal carry no page number and are skipped.
bookmark_text = """
摘要 5-7
主要缩写对照表 17-19
第一章 绪论 19-34
1.1 数字音频盲取证研究意义 19-23
1.2 数字音频盲取证研究现状 23-31
1.2.1 数字音频盲取证研究机构 23
1.2.2 数字音频盲取证研究现状 23-31
1.3 论文主要工作与章节安排 31-34
结论 130-134
研究总结 130-132
后续工作展望 132-134
参考文献 134-145
攻读博士学位期间取得的研究成果 145-147
致谢 147-149
附件 149
"""

# Reads 'a.pdf'; no output name is given, so the callee picks one.
run_script(
    pdf_in_filename='a.pdf',
    bookmarks_text=bookmark_text.split('\n'),
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment