这是一个简单的脚本，用于提取《人类生存发展与核科学》课件中的文字。
适用于已经有笔记，但担心会考到课件里一些边角内容，又觉得把课件全打下来太浪费的同学。
from pathlib import Path | |
import re | |
import fitz | |
# Directory containing this script; the lecture PDFs are looked up here and
# content.txt is written here.
ROOT = Path(__file__).parent
# Accumulates the extracted text of all PDFs before it is written to disk.
text = ""
def exclude(text: str) -> bool:
    """Return True when a text block should be left out of the output.

    A block is excluded when it starts with ``<`` (the caller treats such
    blocks as image placeholders) or with ``Int J Radiat Biol``
    (presumably a recurring journal citation line in the slides).

    Args:
        text: The text content of one extracted block.

    Returns:
        True if the block starts with an excluded prefix, otherwise False.
    """
    # str.startswith accepts a tuple of prefixes and always yields a real
    # bool, so non-matching input gives False rather than an implicit None.
    return text.startswith(("<", "Int J Radiat Biol"))
# Walk the lecture PDFs ("第*.pdf") next to this script, ordered by the first
# number in the file name so that e.g. 10 sorts after 2.
for pdf in sorted(
    ROOT.glob("第*.pdf"), key=lambda pdf: int(re.findall(r"(\d+)", pdf.name)[0])
):
    # The section title is whatever follows the first "-" or space in the
    # file name, up to the ".pdf" extension.
    section_name = re.findall(r"(?:-| )(.+)\.pdf$", pdf.name)[0]
    # Collect the pieces for this PDF and join once at the end.
    parts = [f"\\section{{{section_name}}}\n"]
    # Use the document as a context manager so it is closed after reading.
    with fitz.open(pdf) as doc:
        for page in doc:
            for block in page.get_text_blocks():
                # Index 4 of a block tuple holds its text.
                block_text = block[4]
                if exclude(block_text):  # image
                    continue
                parts.append(block_text)
                # Very short block: similar to an inline <span>.
                # Relatively long block with no inner newline: a single
                # line of a sentence.
                # Both are followed by a space so they flow into the next
                # block; anything else ends with a line break.
                if len(block_text) < 5 or (
                    len(block_text) > 23 and "\n" not in block_text[:-1]
                ):
                    parts.append(" ")
                else:
                    parts.append("\n")
    text += "".join(parts)

# Write everything out as one UTF-8 file in the script's directory.
with open(ROOT / "content.txt", "w", encoding="utf8") as f:
    f.write(text)
% Typeset the extracted text compactly: two columns, 10pt font, A4 paper
% with 0.5in margins. The ctexart class provides Chinese typesetting.
\documentclass[twocolumn, 10pt]{ctexart}
\usepackage[a4paper, margin=0.5in]{geometry}
\begin{document}
% content.txt is the file written by the extraction script.
\input{content.txt}
\end{document}