Add a table of contents to the《凤凰架构》(The Fenix Project) PDF
import json
import toml

from pdftocgen.recipe import extract_toc, Recipe
from fitzutils import ToCEntry, dump_toc, open_pdf
from pdftocio.tocio import write_toc
# requirements
# ----------------------------
# requires `pdf.tocgen`
# https://github.com/Krasjet/pdf.tocgen
# pip install -U pdf.tocgen

# usage
# ----------------------------
# 1. make sure `pdf.tocgen` is installed
# 2. put `exportPages.json`, `sidebar.json`, `the-fenix-project.pdf` into the folder containing the script
# 3. run the script
# 4. 2 files will be generated: `the-fenix-project-with-toc.pdf` and `toc.txt`
#    to apply the toc, use `pdftocio {path_to_pdf} < toc.txt`

# prepare `exportPages.json` & `sidebar.json`
# ----------------------------
# add the following lines after line 93 in `.vuepress/plugins/export/index.js`
# replace {path_to_folder} with the path to the folder containing the script!
#
# fs.writeFileSync("{path_to_folder}" + "/sidebar.json", JSON.stringify(sidebar));
# fs.writeFileSync("{path_to_folder}" + "/exportPages.json", JSON.stringify(exportPages));
# return   <- optional: ends the export early, skipping PDF generation

# constants & paths
# ----------------------------
pdf_path = "the-fenix-project.pdf"
toc_pdf_path = "the-fenix-project-with-toc.pdf"
final_toc_path = "toc.txt"

# pdf.tocgen recipe: level-1 headings are recognized by their font size
recipe_str = """[[heading]]
# 前端工程
level = 1
greedy = true
font.size = 26.411094665527344
font.size_tolerance = 1"""

# helpers
# -----------------------------
def remove_multiple_suffix(s, suffixes):
    """Strip any of the given suffixes from the end of `s`, repeatedly, until none match."""
    triggered = True
    while triggered:
        triggered = False
        for suffix in suffixes:
            if s.endswith(suffix):
                triggered = True
                s = s[:-len(suffix)]
                break
    return s


def normalize_path(path):
    """Lower-case a page path and drop trailing "/", ".md", ".html" so sidebar paths and page URLs compare equal."""
    path = path.lower()
    path = remove_multiple_suffix(path, ["/", ".md", ".html"])
    return path

def walk_tree(item, level, path_title_level_list):
    """Recursively flatten the VuePress sidebar into [path, title, level] rows."""
    if isinstance(item, list):
        for subitem in item:
            walk_tree(subitem, level + 1, path_title_level_list)
    elif isinstance(item, dict):
        # print(item)
        if "path" in item:
            path_title_level_list.append([item["path"], item["title"], level])
        else:
            path_title_level_list.append([None, item["title"], level])
        if "children" in item:
            for subitem in item["children"]:
                walk_tree(subitem, level + 1, path_title_level_list)
    elif isinstance(item, str):
        # bare path string: the title will be filled in later from exportPages
        path_title_level_list.append([item, None, level])

# steps
# -----------------------------
def generate_hierarchy():
    """Build the [path, title, level] hierarchy from the VuePress export data."""
    # load url & title
    with open("exportPages.json", "r", encoding="u8") as f:
        export_pages = json.load(f)
    export_pages[0]["title"] = ""
    url_to_title = {normalize_path(page["url"]): page["title"] for page in export_pages}

    # load sidebar (for hierarchy)
    with open("sidebar.json", "r", encoding="u8") as f:
        sidebar = json.load(f)
    path_title_level_list = []
    walk_tree(sidebar, 0, path_title_level_list)

    # find titles for children entries in the sidebar
    for idx, (path, title, level) in enumerate(path_title_level_list):
        if title is None:
            url = normalize_path(path)
            title = url_to_title[url]
            path_title_level_list[idx][1] = title

    print("load from website, length", len(path_title_level_list))
    return path_title_level_list

def find_title_pages():
    """Extract chapter titles and page numbers from the PDF with the pdf.tocgen recipe."""
    recipe = toml.loads(recipe_str)
    with open_pdf(pdf_path) as doc:
        toc = extract_toc(doc, Recipe(recipe))
    print("load from pdf, length", len(toc))
    return toc


def check_toc_length(path_title_level_list, toc):
    """Warn when the website lists more chapters than the PDF contains."""
    if len(toc) != len([i for i in path_title_level_list if i[0] is not None]):
        print("WARNING: missing some chapters, the PDF provided might not be the most up-to-date version.")
        print("警告：部分存在于网站中的章节不存在于 PDF 中。这可能是因为 PDF 构建后网站中增加了新章节。重新构建 PDF 可以解决这一问题。")

def build_final_toc(path_title_level_list, toc):
    """Merge the website hierarchy with the titles/pages extracted from the PDF.

    Matched chapters take the page number found in the PDF; group headings
    without a page of their own reuse the page of the next PDF entry; chapters
    that exist on the website but not in the PDF are reported and skipped.
    """
    idx2 = 0  # cursor into the PDF-extracted toc
    final_toc = []
    for path, title, level in path_title_level_list:
        # compare titles with all whitespace removed
        title_match = "".join(title.split()) == "".join(toc[idx2].title.split())
        if path is None or title_match:
            final_toc.append(ToCEntry(level, title, toc[idx2].pagenum))
        if path is not None:
            if title_match:
                idx2 += 1
            else:
                print("missing chapter: ", title)
    return final_toc

def save_toc(final_toc):
    """Write the outline into a copy of the PDF and also dump it as a reusable toc.txt."""
    with open_pdf(pdf_path) as doc:
        write_toc(doc, final_toc)
        doc.save(toc_pdf_path)
    with open(final_toc_path, "w", encoding="u8") as f:
        f.write(dump_toc(final_toc))


# main
# -----------------------------
def main():
    path_title_level_list = generate_hierarchy()
    toc = find_title_pages()
    check_toc_length(path_title_level_list, toc)
    final_toc = build_final_toc(path_title_level_list, toc)
    save_toc(final_toc)


if __name__ == "__main__":
    main()
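For orientation, `toc.txt` (step 4 in the header comments) is written by `dump_toc` in pdf.tocgen's plain-text TOC format: roughly, each entry is a quoted heading followed by its page number, indented once per nesting level. A purely illustrative sketch (these titles and page numbers are made up, not taken from the book):

```
"示例部分" 1
    "示例章节一" 2
    "示例章节二" 5
"另一个示例部分" 9
```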
Usage:

1. Create a temporary working directory (the later steps refer to it as the directory from step 1).
2. Install the `pdf.tocgen` dependency.
3. In `.vuepress/plugins/export/index.js`, insert the lines shown in the snippet after this list at line 93 (right after the `exportPages` definition ends and before the `browser` definition starts) so that the hierarchy data files get written. Note that `{path_to_folder}` must be replaced with the path of the working directory from step 1.
4. Build the PDF (it ends up under `.vuepress/dist/pdf` with the file name `the-fenix-project.pdf`) and copy it into the temporary working directory from step 1. That directory should now contain the three files `sidebar.json`, `exportPages.json` and `the-fenix-project.pdf`.
5. Put the `generate_pdf_with_toc.py` script into the working directory from step 1 and run it.
6. `the-fenix-project-with-toc.pdf` is the PDF with the table of contents; `toc.txt` is the TOC description file. Once you have the TOC description file, you can add the TOC directly to an original PDF (with `pdftocio`, see the script's header comments) without going through the steps above again (link).

If you do not want to build the PDF yourself and only want to generate the TOC, the only change needed is a `return` right after the inserted lines (see the snippet below), which ends the export function early: PDF generation is skipped, but the required hierarchy data files are still written. One small caveat: if the website is newer than the PDF ("newer" here meaning articles have been added), the TOC built from the website's structure will contain entries that do not exist in the actual PDF; the script skips them automatically, but it will still print a warning.
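For reference, the exact lines to insert into `.vuepress/plugins/export/index.js` are the ones already quoted in the script's header comments; the final `return` is only needed for the TOC-only workflow described above:

```js
// replace {path_to_folder} with the path of the working directory from step 1
fs.writeFileSync("{path_to_folder}" + "/sidebar.json", JSON.stringify(sidebar));
fs.writeFileSync("{path_to_folder}" + "/exportPages.json", JSON.stringify(exportPages));
return; // optional: end the export early, skipping PDF generation
```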
How it works (in principle this should apply to any VuePress project):

- The `sidebar` data from VuePress provides the hierarchy, and `exportPages` provides each page's title and URL (a sketch of the assumed data shapes follows below).
- `extract_toc` from `pdf.tocgen` extracts the article titles and their page numbers from the PDF file; the script then matches the two lists by title to build the final outline.
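To make the data shapes concrete, here is a minimal sketch of what the script assumes the two JSON files look like, inferred from how `walk_tree` and `generate_hierarchy` read them; the field names (`path`, `title`, `children`, `url`) come from the script, while every value below is a made-up placeholder:

```python
# Hypothetical example data; the real files are produced by the VuePress export plugin.
sidebar = [
    "/part-1/chapter-1.html",      # bare string: title is looked up later via exportPages
    {
        "title": "示例分组",        # dict entry carrying its own title
        "path": "/part-2/",
        "children": [              # children are walked one level deeper
            "/part-2/chapter-2.html",
        ],
    },
]

export_pages = [
    {"url": "/part-1/chapter-1.html", "title": "示例章节一"},
    {"url": "/part-2/chapter-2.html", "title": "示例章节二"},
]

# walk_tree flattens `sidebar` into [path, title, level] rows; rows with a missing
# title are then filled in from `export_pages` using the normalized URL as the key.
```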