Created
August 27, 2022 14:16
-
-
Save ChenyangGao/458485ee187da26b0164886f6bb3403b to your computer and use it in GitHub Desktop.
删除PDF的底部文本水印 | remove bottom text watermark of PDF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# coding: utf-8 | |
# Reference: | |
# https://github.com/pymupdf/PyMuPDF | |
# https://pymupdf.readthedocs.io/en/latest/ | |
__author__ = "ChenyangGao <https://chenyanggao.github.io/>" | |
__version__ = (0, 0, 2) | |
__all__ = ["remove_text_blocks", "remove_last_text_block"] | |
try: | |
import fitz # type: ignore | |
except ImportError: | |
raise SystemExit("😄 请先安装 pymupdf,请在命令行执行\n pip install pymupdf") | |
from enum import IntEnum | |
from fnmatch import translate | |
from os import PathLike | |
from re import Pattern, compile as re_compile | |
from typing import Any, Callable, NamedTuple, Union | |
class EnumSearchType(IntEnum):
    """How a string ``search_text`` is interpreted when matching block text."""

    plain = 1    # literal text
    fnmatch = 2  # Unix shell wildcard pattern
    regexp = 3   # regular expression
class Block(NamedTuple):
    """One entry of a page's block list, in the order the fields are unpacked
    from each item yielded by the page's block extraction."""

    # Rectangle coordinates passed to ``fitz.Rect(x0, y0, x1, y1)`` when the
    # block is removed.
    x0: float
    y0: float
    x1: float
    y1: float
    # Block payload; for text blocks this is the string handed to the matcher.
    content: Any
    # Block number as reported for the page.
    block_no: int
    # Block type marker; only blocks equal to ``fitz.TEXT_OUTPUT_TEXT`` are
    # considered for removal.
    block_type: int
def ensure_enum(val, cls, /):
    """Coerce *val* into a member of the enum class *cls*.

    A value that already is an instance of *cls* is returned unchanged.
    Otherwise a lookup by member name (``cls[val]``) is tried first and,
    when that raises ``KeyError`` or ``TypeError``, a lookup by member value
    (``cls(val)``) is used; an error raised by that second lookup propagates
    to the caller.
    """
    if not isinstance(val, cls):
        try:
            val = cls[val]
        except (KeyError, TypeError):
            val = cls(val)
    return val
def fnmatch_2_regexp(pat: str) -> str:
    """Convert a Unix shell wildcard pattern into a bare regular expression.

    ``fnmatch.translate`` wraps its result so that it only matches whole
    strings (a ``(?s:...)`` group followed by an end-of-string anchor).  The
    wrapper is removed here so the result can also be used with
    ``re.search``.  Because the ``(?s:`` group is dropped, ``*`` and ``?`` do
    not match newline characters in the returned expression.

    The wrapper is stripped only when it is actually present instead of
    slicing fixed offsets, since the exact text emitted by
    ``fnmatch.translate`` is an implementation detail that has changed
    between Python versions.

    :param pat: shell-style wildcard pattern
    :return: regular expression source without anchoring or flag wrapper
    """
    regexp = translate(pat)
    # Drop the trailing end-of-string anchor, whichever spelling is used.
    for anchor in (r"\Z", r"\z"):
        if regexp.endswith(anchor):
            regexp = regexp[: -len(anchor)]
            break
    # Drop the scoped DOTALL group that surrounds the translated pattern.
    if regexp.startswith("(?s:") and regexp.endswith(")"):
        regexp = regexp[4:-1]
    return regexp
def _make_check(
    search_text: Union[str, Pattern, Callable],
    search_type: Union[int, str, EnumSearchType] = EnumSearchType.plain,
    fullmatch: bool = False,
) -> Callable[[str], bool]:
    """Build the predicate that decides whether a block's text matches.

    :param search_text: a callable (returned as is), a compiled pattern
        (used directly), or a string interpreted according to `search_type`
    :param search_type: interpretation of a string `search_text`; only
        consulted when `search_text` is a string
    :param fullmatch: when true the whole text must match, otherwise a
        substring / partial match is enough
    :return: a function taking the block text and returning a bool
    """
    # A user supplied predicate needs no wrapping.
    if callable(search_text):
        return search_text

    if isinstance(search_text, str):
        mode: EnumSearchType = ensure_enum(search_type, EnumSearchType)
        if mode is EnumSearchType.plain:

            def check_str(text):
                if fullmatch:
                    return search_text == text
                return search_text in text

            return check_str
        if mode is EnumSearchType.fnmatch:
            regexp = re_compile(fnmatch_2_regexp(search_text))
        else:
            regexp = re_compile(search_text)
    else:
        # Anything else is used as a pattern object as given.
        regexp = search_text

    def check_re(text):
        if fullmatch:
            return regexp.fullmatch(text) is not None
        return regexp.search(text) is not None

    return check_re
def _remove_rect(page, x0, y0, x1, y1):
    """Redact the rectangle ``(x0, y0, x1, y1)`` on *page*.

    A redaction annotation is added for the rectangle and the page's
    redactions are applied immediately, passing
    ``fitz.PDF_REDACT_IMAGE_NONE`` as the ``images`` option.
    """
    # Reference: https://pymupdf.readthedocs.io/en/latest/rect.html
    page.add_redact_annot(fitz.Rect(x0, y0, x1, y1))
    page.apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE)
def remove_text_blocks(
    search_text: Union[str, Pattern, Callable[[str], bool]],
    inpath: Union[bytes, str, PathLike],
    outpath: Union[bytes, str, PathLike] = "",
    search_type: Union[int, str, EnumSearchType] = EnumSearchType.plain,
    count: int = 0,
    fullmatch: bool = False,
    reverse: bool = False,
    blocks_slice: Union[int, slice, None] = None,
    verbose: bool = False,
) -> dict[int, list[Block]]:
    """Remove the text blocks of a PDF document that match a search condition.

    :param search_text: text or pattern to search for (`search_type` is
        ignored when this is a compiled regular expression or a function)
    :param inpath: path of the input document
    :param outpath: path of the output document; when empty (the default)
        the input document is updated in place with an incremental save
    :param search_type: how a string `search_text` is interpreted:
        plain: literal text (default)
        fnmatch: Unix shell wildcard pattern
        regexp: regular expression
    :param count: maximum number of text blocks examined per page; values
        less than or equal to 0 (default) mean no limit
    :param fullmatch: whether the whole block text must match, default False
    :param reverse: whether blocks are examined from last to first,
        default False
    :param blocks_slice: index or slice restricting which blocks of each
        page are examined; an out-of-range index skips the page
    :param verbose: whether to print a line for every removed block,
        default False
    :return: mapping of zero-based page index to the blocks removed from
        that page; pages without removals are absent
    """
    check = _make_check(search_text, search_type, fullmatch)
    stat: dict[int, list[Block]] = {}  # removed blocks, grouped by page index
    # The context manager closes the document even when an error occurs.
    with fitz.open(inpath) as pdf:
        for i, page in enumerate(pdf):
            # https://pymupdf.readthedocs.io/en/latest/page.html#Page.get_text
            # https://pymupdf.readthedocs.io/en/latest/textpage.html#textpage
            blocks = page.get_text("blocks")
            if not blocks:
                continue
            if isinstance(blocks_slice, int):
                try:
                    blocks = [blocks[blocks_slice]]
                except IndexError:
                    continue
            elif blocks_slice is not None:
                blocks = blocks[blocks_slice]
            if reverse:
                blocks = reversed(blocks)
            examined = 0  # text blocks checked so far on this page
            for x0, y0, x1, y1, content, block_no, block_type in blocks:
                # NOTE(review): TEXT_OUTPUT_TEXT is used as the marker for
                # text blocks -- confirm it equals the text block type value.
                if block_type != fitz.TEXT_OUTPUT_TEXT:
                    continue
                examined += 1
                if check(content):
                    _remove_rect(page, x0, y0, x1, y1)
                    block = Block(x0, y0, x1, y1, content, block_no, block_type)
                    if verbose:
                        print("[\x1b[38;5;2m\x1b[48;5;1m\x1b[1mDELETED\x1b[0m]", page, "=>", block)
                    stat.setdefault(i, []).append(block)
                # `count` limits examined blocks, whether or not they matched.
                if count > 0 and examined == count:
                    break
        # Nothing is written when no block was removed.
        if stat:
            if outpath:
                pdf.save(outpath)
            else:
                pdf.saveIncr()
    return stat
def remove_last_text_block(
    search_text: Union[str, Pattern, Callable[[str], bool]],
    inpath: Union[bytes, str, PathLike],
    outpath: Union[bytes, str, PathLike] = "",
    search_type: Union[int, str, EnumSearchType] = EnumSearchType.plain,
    fullmatch: bool = False,
    verbose: bool = False,
) -> dict[int, list[Block]]:
    """Check the last block of every page and remove it when it matches.

    Delegates to `remove_text_blocks` with ``blocks_slice=-1``, so only the
    final entry of each page's block list is examined.

    :param search_text: text or pattern to search for (`search_type` is
        ignored when this is a compiled regular expression or a function)
    :param inpath: path of the input document
    :param outpath: path of the output document; when empty (the default)
        the input document is overwritten
    :param search_type: how a string `search_text` is interpreted:
        plain: literal text (default)
        fnmatch: Unix shell wildcard pattern
        regexp: regular expression
    :param fullmatch: whether the whole block text must match, default False
    :param verbose: whether to print a line for every removed block
    :return: statistics about the removed text blocks
    """
    options = {
        "search_type": search_type,
        "fullmatch": fullmatch,
        "blocks_slice": -1,  # only the last block of each page
        "verbose": verbose,
    }
    return remove_text_blocks(search_text, inpath, outpath, **options)
if __name__ == "__main__":
    # Command line entry point: remove the matching last text block of every
    # page for each given PDF file, or for every PDF below a given directory.
    from argparse import ArgumentParser, RawTextHelpFormatter
    from glob import escape as glob_escape, glob
    from os import makedirs
    from os.path import abspath, basename, dirname, isabs, isdir, isfile, join, relpath
    from shutil import get_terminal_size
    from sys import argv

    # The `short` shortcut is only advertised in the epilog.  Registering it
    # as a real sub-command on this parser would make argparse treat the
    # second path of a multi-path invocation as the sub-command name.
    parser = ArgumentParser(
        description="PDF 页面最底部文本水印去除工具",
        formatter_class=RawTextHelpFormatter,
        epilog="快捷工具:\n  short  图灵社区/异步社区 PDF 页面最底部文本水印去除工具(详见:%(prog)s short -h)",
    )
    parser.add_argument("list", nargs="+", help="文件路径列表,可以是文件或文件夹")
    parser.add_argument("-o", "--outdir", default="", help="输出的目录路径,如果不填,则全部覆盖原始文件")
    parser.add_argument("-t", "--search_text", required=True, help="搜索的文本或模式")
    parser.add_argument("-s", "--search_type", choices=("plain", "fnmatch", "regexp"), default="plain", help="""\
搜索类型,可取值:
plain: 纯文本(默认)
fnmatch: Unix Shell 通配符模式
regexp: 正则表达式""")
    parser.add_argument("-f", "--fullmatch", action="store_true", help="是否整块匹配")
    parser.add_argument("-v", "--verbose", action="store_true", help="是否打印文本,说明删除了哪些页面里的文本块")

    # Separate parser used when the first argument is the `short` shortcut.
    parser2 = ArgumentParser(
        description="图灵社区/异步社区 PDF 页面最底部文本水印去除工具",
        formatter_class=RawTextHelpFormatter,
    )
    subparsers = parser2.add_subparsers(help="图灵社区/异步社区 PDF 页面最底部文本水印去除工具")
    parser_short = subparsers.add_parser("short", help="图灵社区/异步社区 PDF 页面最底部文本水印去除工具")
    parser_short.add_argument("list", nargs="+", help="文件路径列表,可以是文件或文件夹")
    parser_short.add_argument("-o", "--outdir", default="", help="输出的目录路径,如果不填,则全部覆盖原始文件")
    parser_short.add_argument("-v", "--verbose", action="store_true", help="是否打印文本,说明删除了哪些页面里的文本块")

    if len(argv) < 2:
        # No arguments at all: show the help text and stop.
        parser.print_help()
        parser.exit()
    if argv[1] == "short":
        args = parser2.parse_args()
        # Fixed search settings of the shortcut.
        search_text = "社区*尊重版权"
        search_type = "fnmatch"
        fullmatch = False
    else:
        args = parser.parse_args()
        search_text = args.search_text
        search_type = args.search_type
        fullmatch = args.fullmatch
    pathlist = args.list
    outdir = args.outdir
    verbose = args.verbose

    def process(pdfpath, outname):
        """Process one PDF; `outname` is its path relative to `outdir`."""
        if outdir:
            outpath = join(outdir, outname)
            makedirs(dirname(outpath), exist_ok=True)
        else:
            outpath = ""
        # shutil's variant falls back to a default width when the output is
        # not a terminal instead of raising OSError.
        print("-" * get_terminal_size().columns)
        print("处理文件:", pdfpath)
        stat = remove_last_text_block(
            search_text,
            pdfpath,
            outpath,
            search_type=search_type,
            fullmatch=fullmatch,
            verbose=verbose,
        )
        count = sum(map(len, stat.values()))
        print("删除文本块数:", count)
        if count:
            print("输出文件:", outpath or pdfpath)
        else:
            print("文件不做改动")

    if outdir:
        makedirs(outdir, exist_ok=True)
        outdir = abspath(outdir)
    for p in pathlist:
        if isdir(p):
            # Search below `p` without changing the working directory, so
            # that later relative paths in `pathlist` still resolve.
            pattern = join(glob_escape(p), "**", "*.pdf")
            for pdfpath in glob(pattern, recursive=True):
                # Keep the directory layout below `p` inside `outdir`.
                process(pdfpath, relpath(pdfpath, p))
        elif isfile(p):
            process(p, basename(p) if isabs(p) else p)
        else:
            print("!!!跳过:", p)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment