Last active
October 30, 2025 11:08
-
-
Save wangjiezhe/9b74cf9d492a958c90360a16780a2d12 to your computer and use it in GitHub Desktop.
Format Markdown files produced by OCR
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| import re | |
| from pathlib import Path | |
| import typer | |
# Typer application object; the `main` command is registered on it via
# the `@app.command()` decorator.
app = typer.Typer()
def parse_multiline_formula(match: re.Match[str]) -> str:
    r"""Merge several display formulas found in one match into an ``aligned`` block.

    Args:
        match: A regex match whose group 1 is the text between an opening
            ``\[`` and a closing ``\]``. That text may itself contain
            ``\] \[`` separators when several display formulas sit side by
            side (separated by exactly one space).

    Returns:
        The matched text unchanged when group 1 contains no ``\] \[``
        separator. Otherwise a single ``\[\begin{aligned} ... \end{aligned}\]``
        block with one formula per line, each prefixed with ``&`` and every
        line but the last terminated by ``\\``.
    """
    pieces = re.split(r"\\\] \\\[", match.group(1))
    if len(pieces) < 2:
        # Only one formula: nothing to merge, keep the original text.
        return match.group(0)

    header = r"\[\begin{aligned}"
    footer = r"\end{aligned}\]"
    row_break = r"\\" + "\n"
    rows = row_break.join(f"&{piece}" for piece in pieces)
    return "\n".join([header, rows, footer])
def parse_parallel(match: re.Match[str]) -> str:
    r"""Replace an OCR'd ``//`` inside one math span with ``\parallel``.

    Args:
        match: A regex match with three groups: the opening delimiter
            (group 1), the formula body (group 2) and the closing
            delimiter (group 3).

    Returns:
        The delimiters unchanged around the body, in which every ``//``
        (optionally with whitespace between the two slashes) has been
        replaced by ``\parallel``. A single ``/`` is left alone.
    """
    body = match.group(2)
    # A LaTeX control word absorbs the letters that directly follow it, so
    # "AB//CD" must become "AB\parallel CD" rather than the undefined
    # command "\parallelCD". Insert the separating space only when needed.
    body = re.sub(r"/\s*/(?=[A-Za-z])", r"\\parallel ", body)
    # Remaining occurrences are followed by a non-letter (space, backslash,
    # digit, end of formula, ...) and need no extra space.
    body = re.sub(r"/\s*/", r"\\parallel", body)
    return match.group(1) + body + match.group(3)
def format_deepseek(content: str) -> str:
    r"""Format Markdown text produced by DeepSeek-OCR.

    The following rewrites are applied in order:

    1. Remove ``<--- Page Split --->`` marker lines.
    2. Replace ``\bullet`` with ``\cdot`` (misrecognised multiplication sign).
    3. Replace ``//`` with ``\parallel`` inside ``\( ... \)`` and
       ``\[ ... \]`` spans, and join two inline formulas separated by a
       bare ``//`` into one formula.
    4. Merge runs of adjacent display formulas (``\[a\] \[b\]``) on one line
       into a single ``aligned`` block.
    5. Put the ``\[`` and ``\]`` delimiters of every display formula on
       their own lines.

    Args:
        content: The full Markdown text.

    Returns:
        The reformatted text.
    """
    # Remove page-split markers.
    content = content.replace("<--- Page Split --->\n", "")
    # Replace the misrecognised multiplication sign.
    content = content.replace(r"\bullet", r"\cdot")
    # Replace the parallel symbol inside inline and display math.
    content = re.sub(r"(\\\()(.*?)(\\\))", parse_parallel, content)
    content = re.sub(r"(\\\[)(.*?)(\\\])", parse_parallel, content)
    # Two inline formulas joined by a bare "//" become one formula.
    content = re.sub(
        r"\\\((.*?)\\\) // \\\((.*?)\\\)", r"\(\1 \\parallel \2\)", content
    )
    # Merge adjacent display formulas. The match is lazy and may only stop at
    # a "\]" that is NOT followed by " \[", so each match covers exactly one
    # run of adjacent formulas. A greedy "\[(.*)\]" would instead span from
    # the first "\[" to the last "\]" on the line and pull unrelated prose
    # between separate formulas into the aligned block.
    content = re.sub(r"\\\[(.*?)\\\](?! \\\[)", parse_multiline_formula, content)
    # Put display-math delimiters on their own lines (formulas may now span
    # several lines, hence DOTALL).
    content = re.sub(r"\\\[(.*?)\\\]", r"\[\n\1\n\]", content, flags=re.DOTALL)
    return content
def format_paddle(content: str) -> str:
    r"""Format Markdown text produced by PaddleOCR.

    The following rewrites are applied in order:

    1. Replace ``\bullet`` with ``\cdot`` (misrecognised multiplication sign).
    2. Replace ``//`` with ``\parallel`` inside ``$ ... $`` and
       ``$$ ... $$`` spans.
    3. Put the ``$$`` delimiters of single-line display formulas on their
       own lines.

    Args:
        content: The full Markdown text.

    Returns:
        The reformatted text.
    """
    # Replace the misrecognised multiplication sign.
    content = content.replace(r"\bullet", r"\cdot")
    # Replace the parallel symbol. The opening delimiter is "$$" or "$" and
    # the closing one must be the same (backreference), so "$$x$$ text $$y$$"
    # is paired as two display formulas instead of treating the prose between
    # them as a "$ ... $" span.
    content = re.sub(r"(\$\$|\$)(.+?)(\1)", parse_parallel, content)
    # Put display-math delimiters on their own lines.
    content = re.sub(r"\$\$(.+?)\$\$", r"$$\n\1\n$$", content)
    return content
# CLI entry point: read `input_file` as UTF-8, run the selected formatter over
# its text and write the result to "<stem>_modified<suffix>". Only the stem
# and suffix of the input path are used, so the output path is relative to
# the current working directory. An unknown formatter name raises
# typer.BadParameter (after the input file has been read).
# NOTE: kept as comments rather than a docstring, because Typer would show a
# docstring as the command's help text.
@app.command()
def main(
    input_file: Path = typer.Argument(..., help="Input markdown file"),
    formatter: str = typer.Option(
        "deepseek", "-f", "--formatter", help="Formatter type: 'deepseek' or 'paddle'"
    ),
):
    source_text = input_file.read_text(encoding="utf-8")

    # Map each accepted formatter name to its implementation.
    available = {"deepseek": format_deepseek, "paddle": format_paddle}
    apply_format = available.get(formatter)
    if apply_format is None:
        raise typer.BadParameter("Formatter must be either 'deepseek' or 'paddle'")
    formatted_text = apply_format(source_text)

    output_name = f"{input_file.stem}_modified{input_file.suffix}"
    # newline="\n" forces LF line endings regardless of platform.
    with open(output_name, "w", encoding="utf-8", newline="\n") as out:
        out.write(formatted_text)
# Run the Typer application only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    app()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment