Skip to content

Instantly share code, notes, and snippets.

@ChenyangGao
Created August 27, 2022 14:16
Show Gist options
  • Save ChenyangGao/458485ee187da26b0164886f6bb3403b to your computer and use it in GitHub Desktop.
Save ChenyangGao/458485ee187da26b0164886f6bb3403b to your computer and use it in GitHub Desktop.
删除 PDF 的底部文本水印 | Remove the bottom text watermark from a PDF
#!/usr/bin/env python3
# coding: utf-8
# Reference:
# https://github.com/pymupdf/PyMuPDF
# https://pymupdf.readthedocs.io/en/latest/
# Author of this module, with a link to their homepage.
__author__ = "ChenyangGao <https://chenyanggao.github.io/>"
# Module version as a tuple of integers.
__version__ = (0, 0, 2)
# Names exported by `from <module> import *`.
__all__ = ["remove_text_blocks", "remove_last_text_block"]
try:
import fitz # type: ignore
except ImportError:
raise SystemExit("😄 请先安装 pymupdf,请在命令行执行\n pip install pymupdf")
from enum import IntEnum
from fnmatch import translate
from os import PathLike
from re import Pattern, compile as re_compile
from typing import Any, Callable, NamedTuple, Union
# Search modes that apply when `search_text` is a plain string. The functional
# IntEnum API numbers the members from 1 in the order listed:
# plain=1, fnmatch=2, regexp=3.
EnumSearchType = IntEnum("EnumSearchType", "plain, fnmatch, regexp")
class Block(NamedTuple):
    """One entry of a page's block list, as unpacked by `remove_text_blocks`.

    The field order mirrors the 7-item tuples that the page's block
    extraction yields, so an entry can be rebuilt positionally.
    """

    # Rectangle of the block; these four values are handed to
    # `fitz.Rect(x0, y0, x1, y1)` when the block gets redacted.
    x0: float
    y0: float
    x1: float
    y1: float
    # Payload of the block; for text blocks this is the text that the
    # search predicate is applied to.
    content: Any
    # Number of the block as reported by the extraction.
    block_no: int
    # Kind of the block; used to tell text blocks apart from other blocks.
    block_type: int
def ensure_enum(val, cls, /):
    """Coerce `val` into a member of the enum class `cls`.

    :param val: an existing member, a member name, or a member value
    :param cls: the enum class to coerce into

    :return: `val` itself when it already is a member of `cls`; otherwise
        the member found by name, or - when the name lookup raises
        `KeyError` or `TypeError` - the member found by value
    """
    if not isinstance(val, cls):
        try:
            # First interpret `val` as a member name ...
            val = cls[val]
        except (KeyError, TypeError):
            # ... and only then as a member value.
            val = cls(val)
    return val
def fnmatch_2_regexp(pat: str) -> str:
    """Convert a Unix shell wildcard pattern into an unanchored regular expression.

    `fnmatch.translate` wraps its result as `(?s:` ... `)\\Z` so that it only
    matches whole strings. That wrapper is removed here so the result can be
    used with `search` as well as `fullmatch`.

    NOTE: removing the `(?s:` group also drops the DOTALL flag, so a `*` or
    `?` in `pat` does not match a newline in the returned expression.

    :param pat: shell wildcard pattern, e.g. ``"a*c"``

    :return: the regular expression source without the whole-string wrapper;
        if `translate` returns an unexpected layout, its result is returned
        unchanged rather than being cut at fixed offsets
    """
    translated = translate(pat)
    prefix = "(?s:"
    # Newer Python versions may spell the end-of-string anchor `\z`.
    suffixes = (r")\Z", r")\z")
    if translated.startswith(prefix) and translated.endswith(suffixes):
        return translated[len(prefix):-len(suffixes[0])]
    return translated
def _make_check(
    search_text: Union[str, Pattern, Callable],
    search_type: Union[int, str, EnumSearchType] = EnumSearchType.plain,
    fullmatch: bool = False,
) -> Callable[[str], bool]:
    """Build the predicate that decides whether a piece of text matches.

    :param search_text: a callable (returned as is), a string interpreted
        according to `search_type`, or any other object, which is used as a
        compiled regular expression
    :param search_type: how a string `search_text` is interpreted (plain text,
        shell wildcard or regular expression); ignored otherwise
    :param fullmatch: whether the whole text has to match instead of a part of it

    :return: a function taking the text and returning whether it matches
    """
    if callable(search_text):
        return search_text

    if isinstance(search_text, str):
        mode: EnumSearchType = ensure_enum(search_type, EnumSearchType)
        if mode is EnumSearchType.plain:
            if fullmatch:
                return lambda text: search_text == text
            return lambda text: search_text in text
        if mode is EnumSearchType.fnmatch:
            pattern = re_compile(fnmatch_2_regexp(search_text))
        else:
            pattern = re_compile(search_text)
    else:
        # Not a string and not callable: used as a compiled pattern.
        pattern = search_text

    # The pattern's methods are looked up only when the predicate is called.
    if fullmatch:
        return lambda text: pattern.fullmatch(text) is not None
    return lambda text: pattern.search(text) is not None
def _remove_rect(page, x0, y0, x1, y1):
    """Redact the rectangle (x0, y0, x1, y1) on `page`.

    A redaction annotation is added for the rectangle and the page's
    redactions are applied right away, passing
    `images=fitz.PDF_REDACT_IMAGE_NONE`.
    """
    # Reference: https://pymupdf.readthedocs.io/en/latest/rect.html
    page.add_redact_annot(fitz.Rect(x0, y0, x1, y1))
    page.apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE)
def remove_text_blocks(
    search_text: Union[str, Pattern, Callable[[str], bool]],
    inpath: Union[bytes, str, PathLike],
    outpath: Union[bytes, str, PathLike] = "",
    search_type: Union[int, str, EnumSearchType] = EnumSearchType.plain,
    count: int = 0,
    fullmatch: bool = False,
    reverse: bool = False,
    blocks_slice: Union[int, slice, None] = None,
    verbose: bool = False,
) -> dict[int, list[Block]]:
    """Remove the text blocks of a PDF document that match a search condition.

    :param search_text: text or pattern to search for (`search_type` is ignored
        when this is a compiled regular expression or a function)
    :param inpath: path of the input document
    :param outpath: path of the output document; if it is an empty string
        (default), the input document is updated in place by an incremental save
    :param search_type: kind of search, one of:
        plain: plain text (default)
        fnmatch: Unix shell wildcard pattern
        regexp: regular expression
    :param count: maximum number of matching blocks to remove per page;
        values less than or equal to 0 (default) mean no limit
    :param fullmatch: whether a whole block has to match, default False
    :param reverse: whether to search from the last block backwards, default False
    :param blocks_slice: index or slice limiting which blocks of a page are searched
    :param verbose: whether to print which blocks were removed, default False

    :return: statistics about the removed blocks: a dict mapping the 0-based
        page index to the list of blocks removed from that page
    """
    check = _make_check(search_text, search_type, fullmatch)
    stat: dict[int, list[Block]] = {}
    # The context manager closes the document on every exit path, including
    # exceptions raised while searching, redacting or saving.
    with fitz.open(inpath) as pdf:
        for i, page in enumerate(pdf):
            # https://pymupdf.readthedocs.io/en/latest/page.html#Page.get_text
            # https://pymupdf.readthedocs.io/en/latest/textpage.html#textpage
            blocks = page.get_text_blocks()  # same as page.get_text("blocks")
            if not blocks:
                continue
            if isinstance(blocks_slice, int):
                try:
                    blocks = [blocks[blocks_slice]]
                except IndexError:
                    # The page has no block at that index.
                    continue
            elif blocks_slice is not None:
                blocks = blocks[blocks_slice]
            if reverse:
                blocks = reversed(blocks)
            removed = 0
            for x0, y0, x1, y1, content, block_no, block_type in blocks:
                # NOTE(review): TEXT_OUTPUT_TEXT serves as the marker for text
                # blocks here; presumably it equals the block type reported
                # for text - confirm against the PyMuPDF documentation.
                if block_type != fitz.TEXT_OUTPUT_TEXT:
                    continue
                if not check(content):
                    continue
                _remove_rect(page, x0, y0, x1, y1)
                block = Block(x0, y0, x1, y1, content, block_no, block_type)
                if verbose:
                    print("[\x1b[38;5;2m\x1b[48;5;1m\x1b[1mDELETED\x1b[0m]", page, "=>", block)
                stat.setdefault(i, []).append(block)
                removed += 1
                if count > 0 and removed == count:
                    break
        # Only write when something was actually removed.
        if stat:
            if outpath:
                pdf.save(outpath)
            else:
                pdf.saveIncr()
    return stat
def remove_last_text_block(
    search_text: Union[str, Pattern, Callable[[str], bool]],
    inpath: Union[bytes, str, PathLike],
    outpath: Union[bytes, str, PathLike] = "",
    search_type: Union[int, str, EnumSearchType] = EnumSearchType.plain,
    fullmatch: bool = False,
    verbose: bool = False,
) -> dict[int, list[Block]]:
    """Remove the last block of every page if it matches the search condition.

    Delegates to `remove_text_blocks`, restricted to block index -1.

    :param search_text: text or pattern to search for (`search_type` is ignored
        when this is a compiled regular expression or a function)
    :param inpath: path of the input document
    :param outpath: path of the output document; if it is an empty string
        (default), the input document itself is overwritten
    :param search_type: kind of search, one of:
        plain: plain text (default)
        fnmatch: Unix shell wildcard pattern
        regexp: regular expression
    :param fullmatch: whether the whole block has to match, default False
    :param verbose: whether to print which blocks were removed

    :return: statistics about the removed blocks
    """
    # Index -1 selects only the last entry of each page's block list.
    return remove_text_blocks(
        search_text=search_text,
        inpath=inpath,
        outpath=outpath,
        search_type=search_type,
        fullmatch=fullmatch,
        verbose=verbose,
        blocks_slice=-1,
    )
def main(argv=None):
    """Command-line entry point: remove a matching last text block from every page.

    Two invocation forms are supported (the help texts are in Chinese):

        prog -t TEXT [-s {plain,fnmatch,regexp}] [-f] [-v] [-o OUTDIR] PATH [PATH ...]
        prog short [-v] [-o OUTDIR] PATH [PATH ...]

    `short` is a shortcut that searches with the built-in wildcard pattern
    "社区*尊重版权". Every PATH may be a PDF file or a directory; directories
    are searched recursively for `*.pdf` files.

    :param argv: argument list without the program name; defaults to `sys.argv[1:]`
    """
    import sys
    from argparse import ArgumentParser, RawTextHelpFormatter
    from glob import escape, glob
    from os import makedirs
    from os.path import abspath, basename, dirname, isabs, isdir, isfile, join, relpath
    from shutil import get_terminal_size

    argv = sys.argv[1:] if argv is None else list(argv)

    short_description = "图灵社区/异步社区 PDF 页面最底部文本水印去除工具"

    # The `short` shortcut is deliberately NOT registered as a sub-parser of
    # this parser: next to the `nargs="+"` positional, argparse would take the
    # last of several paths as the sub-command name and reject it.
    parser = ArgumentParser(
        description="PDF 页面最底部文本水印去除工具",
        epilog="快捷工具:\n  short  " + short_description,
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument("list", nargs="+", help="文件路径列表,可以是文件或文件夹")
    parser.add_argument("-o", "--outdir", default="", help="输出的目录路径,如果不填,则全部覆盖原始文件")
    parser.add_argument("-t", "--search_text", required=True, help="搜索的文本或模式")
    parser.add_argument(
        "-s", "--search_type", choices=("plain", "fnmatch", "regexp"), default="plain",
        help="搜索类型,可取值:\nplain: 纯文本(默认)\nfnmatch: Unix Shell 通配符模式\nregexp: 正则表达式",
    )
    parser.add_argument("-f", "--fullmatch", action="store_true", help="是否整块匹配")
    parser.add_argument("-v", "--verbose", action="store_true", help="是否打印文本,说明删除了哪些页面里的文本块")

    short_parser = ArgumentParser(
        prog=parser.prog + " short",
        description=short_description,
        formatter_class=RawTextHelpFormatter,
    )
    short_parser.add_argument("list", nargs="+", help="文件路径列表,可以是文件或文件夹")
    short_parser.add_argument("-o", "--outdir", default="", help="输出的目录路径,如果不填,则全部覆盖原始文件")
    short_parser.add_argument("-v", "--verbose", action="store_true", help="是否打印文本,说明删除了哪些页面里的文本块")

    if not argv:
        # Without any argument just show the help.
        parser.print_help()
        return
    if argv[0] == "short":
        args = short_parser.parse_args(argv[1:])
        search_text = "社区*尊重版权"
        search_type = "fnmatch"
        fullmatch = False
    else:
        args = parser.parse_args(argv)
        search_text = args.search_text
        search_type = args.search_type
        fullmatch = args.fullmatch
    pathlist = args.list
    outdir = args.outdir
    verbose = args.verbose

    def process(pdfpath, relname=None):
        # `relname` is the location of the result below `outdir`. For plain
        # file arguments it is the base name (absolute paths) or the relative
        # path as given.
        if outdir:
            if relname is None:
                relname = basename(pdfpath) if isabs(pdfpath) else pdfpath
            outpath = join(outdir, relname)
            makedirs(dirname(outpath), exist_ok=True)
        else:
            outpath = ""
        # shutil's variant falls back to a default width when stdout is not a
        # terminal (e.g. piped), where os.get_terminal_size() raises OSError.
        print("-" * get_terminal_size().columns)
        print("处理文件:", pdfpath)
        stat = remove_last_text_block(
            search_text,
            pdfpath,
            outpath,
            search_type=search_type,
            fullmatch=fullmatch,
            verbose=verbose,
        )
        count = sum(map(len, stat.values()))
        print("删除文本块数:", count)
        if count:
            print("输出文件:", outpath or pdfpath)
        else:
            print("文件不做改动")

    if outdir:
        makedirs(outdir, exist_ok=True)
        outdir = abspath(outdir)
    for p in pathlist:
        if isdir(p):
            # Walk the directory without changing the working directory, so
            # that later relative paths in `pathlist` still resolve correctly.
            # The directory layout is mirrored below `outdir`.
            for pdfpath in glob(join(escape(p), "**", "*.pdf"), recursive=True):
                process(pdfpath, relpath(pdfpath, p))
        elif isfile(p):
            process(p)
        else:
            print("!!!跳过:", p)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment