AllanChain · June 13, 2021 03:59
diff --git a/README.md b/README.md
diff --git a/pdf_transformer.py b/pdf_transformer.py
 from pathlib import Path
 import re

 import fitz

 # Directory containing this script; the input PDFs are globbed from here
 # and content.txt is written here.
 ROOT = Path(__file__).parent
 # Accumulates the generated output for all PDFs before it is written out.
 text = ""

 def exclude(text: str) -> bool:
    """Return True when a block's text starts with an excluded prefix.

    Args:
        text: Text of one extracted block.

    Returns:
        True if ``text`` begins with ``"<"`` or ``"Int J Radiat Biol"``,
        otherwise False. (Previously a non-match fell off the end of the
        function and returned None despite the ``bool`` annotation.)
    """
    # NOTE(review): "<" presumably matches image placeholder blocks and the
    # second prefix a journal running header -- confirm against the PDFs.
    # str.startswith accepts a tuple, so one call covers every prefix and
    # always yields a real bool.
    return text.startswith(("<", "Int J Radiat Biol"))


 # Build content.txt: one LaTeX \section per matching PDF, followed by the
 # text extracted from that PDF.
 #
 # PDFs are ordered by the first run of digits in the file name, compared as
 # integers so that 10 sorts after 9. The digit pattern is a raw string
 # because "\d" in a plain literal is an invalid escape sequence.
 pieces = []
 for pdf in sorted(
    ROOT.glob("第*.pdf"), key=lambda pdf: int(re.findall(r"(\d+)", pdf.name)[0])
 ):
    # Section title: everything after the first "-" or space, up to ".pdf".
    section_name = re.findall(r"(?:-| )(.+)\.pdf$", pdf.name)[0]
    pieces.append(f"\\section{{{section_name}}}\n")
    # Context manager so the document is closed once its pages are read.
    with fitz.open(pdf) as doc:
        for page in doc:
            for block in page.get_text_blocks():
                # NOTE(review): index 4 of a PyMuPDF block tuple is expected
                # to be the block's text -- confirm against the PyMuPDF docs.
                block_text = block[4]
                if exclude(block_text):  # e.g. image placeholder blocks
                    continue
                pieces.append(block_text)
                # Very short block: similar to <span>
                # Relatively long without \n: a line of sentence
                # Both cases are joined to the following text with a space;
                # every other block ends with a line break.
                is_inline = len(block_text) < 5
                is_single_line = len(block_text) > 23 and "\n" not in block_text[:-1]
                pieces.append(" " if is_inline or is_single_line else "\n")

 # Join once instead of growing the string block by block.
 text += "".join(pieces)
 with open(ROOT / "content.txt", "w", encoding="utf8") as f:
    f.write(text)
diff --git a/sheet.tex b/sheet.tex
 % Two-column, 10pt document using the ctexart class (CTeX article class).
 \documentclass[twocolumn, 10pt]{ctexart}
 % A4 paper with 0.5in margins on every side.
 \usepackage[a4paper, margin=0.5in]{geometry}
 \begin{document}
 % content.txt is generated by pdf_transformer.py and carries its own
 % \section commands, so the body consists of this single \input.
 \input{content.txt}
 \end{document}
	from pathlib import Path
	import re

	import fitz

	# Directory containing this script; the input PDFs are globbed from here
	# and content.txt is written here.
	ROOT = Path(__file__).parent
	# Accumulates the generated output for all PDFs before it is written out.
	text = ""

	def exclude(text: str) -> bool:
	    """Return True when a block's text starts with an excluded prefix.

	    Args:
	        text: Text of one extracted block.

	    Returns:
	        True if ``text`` begins with ``"<"`` or ``"Int J Radiat Biol"``,
	        otherwise False. (Previously a non-match fell off the end of the
	        function and returned None despite the ``bool`` annotation.)
	    """
	    # NOTE(review): "<" presumably matches image placeholder blocks and the
	    # second prefix a journal running header -- confirm against the PDFs.
	    # str.startswith accepts a tuple, so one call covers every prefix and
	    # always yields a real bool.
	    return text.startswith(("<", "Int J Radiat Biol"))


	# Build content.txt: one LaTeX \section per matching PDF, followed by the
	# text extracted from that PDF.
	#
	# PDFs are ordered by the first run of digits in the file name, compared as
	# integers so that 10 sorts after 9. The digit pattern is a raw string
	# because "\d" in a plain literal is an invalid escape sequence.
	pieces = []
	for pdf in sorted(
	    ROOT.glob("第*.pdf"), key=lambda pdf: int(re.findall(r"(\d+)", pdf.name)[0])
	):
	    # Section title: everything after the first "-" or space, up to ".pdf".
	    # The alternation is "-| "; the stray backslash before "|" made the
	    # pattern require a literal "-| " and so never match.
	    section_name = re.findall(r"(?:-| )(.+)\.pdf$", pdf.name)[0]
	    pieces.append(f"\\section{{{section_name}}}\n")
	    # Context manager so the document is closed once its pages are read.
	    with fitz.open(pdf) as doc:
	        for page in doc:
	            for block in page.get_text_blocks():
	                # NOTE(review): index 4 of a PyMuPDF block tuple is expected
	                # to be the block's text -- confirm against the PyMuPDF docs.
	                block_text = block[4]
	                if exclude(block_text):  # e.g. image placeholder blocks
	                    continue
	                pieces.append(block_text)
	                # Very short block: similar to <span>
	                # Relatively long without \n: a line of sentence
	                # Both cases are joined to the following text with a space;
	                # every other block ends with a line break.
	                is_inline = len(block_text) < 5
	                is_single_line = len(block_text) > 23 and "\n" not in block_text[:-1]
	                pieces.append(" " if is_inline or is_single_line else "\n")

	# Join once instead of growing the string block by block.
	text += "".join(pieces)
	with open(ROOT / "content.txt", "w", encoding="utf8") as f:
	    f.write(text)
	% Two-column, 10pt document using the ctexart class (CTeX article class).
	\documentclass[twocolumn, 10pt]{ctexart}
	% A4 paper with 0.5in margins on every side.
	\usepackage[a4paper, margin=0.5in]{geometry}
	\begin{document}
	% content.txt is generated by pdf_transformer.py and carries its own
	% \section commands, so the body consists of this single \input.
	\input{content.txt}
	\end{document}