Skip to content

Instantly share code, notes, and snippets.

@AllanChain
Created June 13, 2021 03:59
Show Gist options
  • Save AllanChain/7f08f89470acea2d97ac30226e9638d7 to your computer and use it in GitHub Desktop.
Save AllanChain/7f08f89470acea2d97ac30226e9638d7 to your computer and use it in GitHub Desktop.
提取 人类生存发展与核科学 课件中的文字

这是简单地提取 人类生存发展与核科学 课件中文字的脚本。

适用于已经有笔记，但担心会考到课件里一些边角的内容，又觉得把课件全打下来太浪费的同学。

from pathlib import Path
import re
import fitz
# Directory that contains this script; the chapter PDFs are globbed from here
# and the output file is written here.
ROOT = Path(__file__).parent
# Accumulator for the extracted text of every PDF, written to disk at the end.
text = ""
def exclude(text: str) -> bool:
    """Return True when a text block should be left out of the output.

    A block is excluded when its text starts with ``"<"`` (the call site
    labels this case as an image block) or with ``"Int J Radiat Biol"``.

    Args:
        text: Text content of a single block.

    Returns:
        True if the block should be skipped, False otherwise.
    """
    # str.startswith accepts a tuple of prefixes and always yields a real
    # bool; the former loop fell off the end and returned None on no match.
    return text.startswith(("<", "Int J Radiat Biol"))
# Walk the chapter PDFs in numeric order of the first number in the file name
# (so chapter 10 sorts after chapter 2) and collect their text blocks.
# NOTE(review): block text is emitted verbatim; LaTeX special characters
# (%, &, #, _ ...) are not escaped before the result is \input by LaTeX.
pieces = []
for pdf in sorted(
    ROOT.glob("第*.pdf"),
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    key=lambda p: int(re.findall(r"(\d+)", p.name)[0]),
):
    # Section title: the text between the first "-" or space in the file name
    # and the ".pdf" suffix.
    section_name = re.findall(r"(?:-| )(.+)\.pdf$", pdf.name)[0]
    pieces.append(f"\\section{{{section_name}}}\n")
    # Context manager closes the document instead of leaking one handle per PDF.
    with fitz.open(pdf) as doc:
        for page in doc:
            for block in page.get_text_blocks():
                block_text = block[4]
                if exclude(block_text):  # image
                    continue
                pieces.append(block_text)
                # Very short block: similar to an inline <span>.
                is_fragment = len(block_text) < 5
                # Relatively long without an inner "\n": a single line of a sentence.
                is_single_line = len(block_text) > 23 and "\n" not in block_text[:-1]
                # Join such blocks to the following one with a space; otherwise
                # end the line.
                pieces.append(" " if is_fragment or is_single_line else "\n")
# One join instead of repeated string concatenation inside the nested loops.
text += "".join(pieces)
# Dump everything collected so far into content.txt next to the script,
# encoded as UTF-8 (this is the file the LaTeX wrapper pulls in via \input).
(ROOT / "content.txt").write_text(text, encoding="utf8")
% Wrapper document that typesets the extracted text.
% Two-column layout at 10pt using the ctexart class.
\documentclass[twocolumn, 10pt]{ctexart}
% A4 paper with 0.5in margins on every side.
\usepackage[a4paper, margin=0.5in]{geometry}
\begin{document}
% content.txt is the file written by the extraction script; it consists of
% \section{...} headings followed by the text blocks of each PDF.
\input{content.txt}
\end{document}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment