ChenyangGao · August 27, 2022 14:16
diff --git a/remove_bottom_text_of_pdf.py b/remove_bottom_text_of_pdf.py
 #!/usr/bin/env python3
 # coding: utf-8

 # Reference:
 # https://github.com/pymupdf/PyMuPDF
 # https://pymupdf.readthedocs.io/en/latest/

 __author__ = "ChenyangGao <https://chenyanggao.github.io/>"
 __version__ = (0, 0, 2)
 __all__ = ["remove_text_blocks", "remove_last_text_block"]

 try:
    import fitz # type: ignore
 except ImportError:
    raise SystemExit("😄 请先安装 pymupdf，请在命令行执行\n    pip install pymupdf")

 from enum import IntEnum
 from fnmatch import translate
 from os import PathLike
 from re import Pattern, compile as re_compile
 from typing import Any, Callable, NamedTuple, Union


 class EnumSearchType(IntEnum):
    """Ways in which a textual search term can be interpreted."""
    plain = 1    # literal text
    fnmatch = 2  # Unix shell wildcard pattern
    regexp = 3   # regular expression


 class Block(NamedTuple):
    """One entry of the block list returned by ``page.get_text_blocks()``.

    The field order matches the 7-tuple that ``remove_text_blocks`` unpacks
    from every entry of that list.
    """
    # x0, y0, x1, y1 are the values handed to ``fitz.Rect`` when the block is
    # redacted — presumably the corners of the block's bounding box (confirm
    # against the PyMuPDF ``Page.get_text`` documentation).
    x0: float
    y0: float
    x1: float
    y1: float
    # Value passed to the match predicate (the text, for text blocks).
    content: Any
    # Carried through unchanged for reporting; not interpreted by this module.
    block_no: int
    # Compared with ``fitz.TEXT_OUTPUT_TEXT`` to keep only text blocks.
    block_type: int


 def ensure_enum(val, cls, /):
    """Coerce `val` into a member of the enum class `cls`.

    :param val: a member of `cls`, a member name, or a member value
    :param cls: the enum class to convert to

    :return: the matching member of `cls`
    :raises ValueError: if `val` is neither a member name nor a member value
    """
    if not isinstance(val, cls):
        try:
            # Lookup by member name first ...
            val = cls[val]
        except (KeyError, TypeError):
            # ... then fall back to lookup by member value.
            val = cls(val)
    return val


 def fnmatch_2_regexp(pat: str) -> str:
    """Turn a Unix shell wildcard pattern into a bare regular expression.

    :param pat: shell-style wildcard pattern
    :return: `translate(pat)` without its first 4 and last 3 characters
    """
    # NOTE(review): assumes `translate` wraps its result in a 4-character
    # prefix and a 3-character suffix (such as "(?s:" ... ")\Z") — confirm
    # when moving to another Python version.
    prefix_len, suffix_len = 4, 3
    wrapped = translate(pat)
    return wrapped[prefix_len:-suffix_len]


 def _make_check(
    search_text: Union[str, Pattern, Callable],
    search_type: Union[int, str, EnumSearchType] = EnumSearchType.plain,
    fullmatch: bool = False,
 ) -> Callable[[str], bool]:
    """Build the predicate that decides whether a piece of text matches.

    :param search_text: a callable (returned unchanged), a `str` interpreted
        according to `search_type`, or any other object, which is used as if
        it were a compiled regular expression
    :param search_type: member, name or value of `EnumSearchType`; only
        consulted when `search_text` is a `str`
    :param fullmatch: True to require the whole text to match, False
        (default) to accept a match anywhere inside the text

    :return: function that takes the text and tells whether it matches
    """
    if callable(search_text):
        return search_text

    if isinstance(search_text, str):
        mode: EnumSearchType = ensure_enum(search_type, EnumSearchType)
        if mode is EnumSearchType.plain:
            def matches_plain(text):
                return search_text == text if fullmatch else search_text in text
            return matches_plain
        if mode is EnumSearchType.fnmatch:
            pattern = re_compile(fnmatch_2_regexp(search_text))
        else:
            pattern = re_compile(search_text)
    else:
        # Neither callable nor str: treated as an already compiled pattern.
        pattern = search_text

    def matches_pattern(text):
        probe = pattern.fullmatch if fullmatch else pattern.search
        return probe(text) is not None
    return matches_pattern


 def _remove_rect(page, x0, y0, x1, y1):
    """Redact the rectangle ``(x0, y0, x1, y1)`` on `page`.

    A redaction annotation covering the rectangle is added and the page's
    redactions are applied right away, with the image handling set to
    ``fitz.PDF_REDACT_IMAGE_NONE``.

    :param page: PyMuPDF page to modify
    :param x0: first x coordinate passed to ``fitz.Rect``
    :param y0: first y coordinate passed to ``fitz.Rect``
    :param x1: second x coordinate passed to ``fitz.Rect``
    :param y1: second y coordinate passed to ``fitz.Rect``
    """
    # Reference: https://pymupdf.readthedocs.io/en/latest/rect.html
    # The annotation object returned by add_redact_annot is not needed.
    page.add_redact_annot(fitz.Rect(x0, y0, x1, y1))
    page.apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE)


 def remove_text_blocks(
    search_text: Union[str, Pattern, Callable[[str], bool]],
    inpath: Union[bytes, str, PathLike],
    outpath: Union[bytes, str, PathLike] = "",
    search_type: Union[int, str, EnumSearchType] = EnumSearchType.plain,
    count: int = 0,
    fullmatch: bool = False,
    reverse: bool = False,
    blocks_slice: Union[int, slice, None] = None,
    verbose: bool = False,
 ) -> dict[int, list[Block]]:
    """Delete the text blocks of a PDF document that satisfy the search condition.

    :param search_text: text or pattern to look for; when it is a compiled
        regular expression or a function, `search_type` is ignored
    :param inpath: path of the input document
    :param outpath: path of the output document; with an empty string
        (default) the changes are saved incrementally into the input document
    :param search_type: how a `str` `search_text` is interpreted:
        plain: literal text (default)
        fnmatch: Unix shell wildcard pattern
        regexp: regular expression
    :param count: maximum number of blocks deleted per page; a value less
        than or equal to 0 (default) means no limit
    :param fullmatch: whether the whole block text has to match, default False
    :param reverse: whether the blocks are examined from last to first,
        default False; has no effect when `blocks_slice` is an int
    :param blocks_slice: index or slice restricting which blocks of each page
        are examined; None (default) examines all of them
    :param verbose: whether to print a line for every deleted block,
        default False

    :return: mapping from 0-based page index to the blocks deleted on that
        page; pages without deletions have no entry. Nothing is saved when
        the mapping is empty.
    """
    check = _make_check(search_text, search_type, fullmatch)
    stat: dict[int, list[Block]] = {}  # deleted blocks per page index
    # Opening the document as a context manager guarantees that it is closed
    # again, also when extraction, redaction or saving raises.
    with fitz.open(inpath) as pdf:
        for i, page in enumerate(pdf):
            # https://pymupdf.readthedocs.io/en/latest/page.html#Page.get_text
            # Reference: https://pymupdf.readthedocs.io/en/latest/textpage.html#textpage
            blocks = page.get_text_blocks()  # same as page.get_text("blocks")
            if not blocks:
                continue
            if isinstance(blocks_slice, int):
                try:
                    blocks = [blocks[blocks_slice]]
                except IndexError:
                    # The page has too few blocks for this index.
                    continue
            else:
                if blocks_slice is not None:
                    blocks = blocks[blocks_slice]
                if reverse:
                    blocks = reversed(blocks)
            removed: list[Block] = []
            for x0, y0, x1, y1, content, block_no, block_type in blocks:
                # NOTE(review): TEXT_OUTPUT_TEXT is used as the marker of a
                # text block — confirm it equals the block type PyMuPDF
                # reports for text.
                if block_type != fitz.TEXT_OUTPUT_TEXT:
                    continue
                if not check(content):
                    continue
                _remove_rect(page, x0, y0, x1, y1)
                block = Block(x0, y0, x1, y1, content, block_no, block_type)
                if verbose:
                    print("[\x1b[38;5;2m\x1b[48;5;1m\x1b[1mDELETED\x1b[0m]", page, "=>", block)
                removed.append(block)
                if count > 0 and len(removed) == count:
                    break
            if removed:
                stat[i] = removed
        if stat:
            if outpath:
                pdf.save(outpath)
            else:
                pdf.saveIncr()
    return stat


 def remove_last_text_block(
    search_text: Union[str, Pattern, Callable[[str], bool]],
    inpath: Union[bytes, str, PathLike],
    outpath: Union[bytes, str, PathLike] = "",
    search_type: Union[int, str, EnumSearchType] = EnumSearchType.plain,
    fullmatch: bool = False,
    verbose: bool = False,
 ) -> dict[int, list[Block]]:
    """Delete the final block of every page if it satisfies the search condition.

    Delegates to `remove_text_blocks` with `blocks_slice=-1`, so only the last
    entry of each page's block list is examined.

    :param search_text: text or pattern to look for; when it is a compiled
        regular expression or a function, `search_type` is ignored
    :param inpath: path of the input document
    :param outpath: path of the output document; with an empty string
        (default) the output overwrites the input document
    :param search_type: how a `str` `search_text` is interpreted:
        plain: literal text (default)
        fnmatch: Unix shell wildcard pattern
        regexp: regular expression
    :param fullmatch: whether the whole block text has to match, default False
    :param verbose: whether to print a line for every deleted block

    :return: statistics about the deleted text blocks
    """
    last_block_only = -1
    return remove_text_blocks(
        search_text=search_text,
        inpath=inpath,
        outpath=outpath,
        search_type=search_type,
        fullmatch=fullmatch,
        verbose=verbose,
        blocks_slice=last_block_only,
    )


 if __name__ == "__main__":
    # Command line entry point: remove the matching last block of every page
    # from each given PDF file, or from every PDF found below a given folder.
    from argparse import ArgumentParser, RawTextHelpFormatter
    from glob import glob
    from os import chdir, getcwd, makedirs
    from os.path import abspath, basename, dirname, isabs, isdir, isfile, join
    # shutil's variant falls back to a default size instead of raising when
    # stdout is not attached to a terminal (e.g. output piped to a file).
    from shutil import get_terminal_size
    from sys import argv

    short_help = "图灵社区/异步社区 PDF 页面最底部文本水印去除工具"

    # The "short" shortcut is only advertised in the epilog. Registering it
    # as a sub-command on this parser would make argparse hand the last path
    # to the sub-command slot whenever more than one path is given.
    parser = ArgumentParser(
        description="PDF 页面最底部文本水印去除工具", 
        formatter_class=RawTextHelpFormatter, 
        epilog="快捷工具:\n    short    " + short_help, 
    )
    parser.add_argument("list", nargs="+", help="文件路径列表，可以是文件或文件夹")
    parser.add_argument("-o", "--outdir", default="", help="输出的目录路径，如果不填，则全部覆盖原始文件")
    parser.add_argument("-t", "--search_text", required=True, help="搜索的文本或模式")
    parser.add_argument("-s", "--search_type", choices=("plain", "fnmatch", "regexp"), default="plain", help="""\
 搜索类型，可取值：
    plain: 纯文本（默认）
    fnmatch: Unix Shell 通配符模式
    regexp: 正则表达式""")
    parser.add_argument("-f", "--fullmatch", action="store_true", help="是否整块匹配")
    parser.add_argument("-v", "--verbose", action="store_true", help="是否打印文本，说明删除了哪些页面里的文本块")

    # Separate parser for the shortcut form: `short [-o OUTDIR] [-v] PATH ...`
    parser2 = ArgumentParser(
        description=short_help, 
        formatter_class=RawTextHelpFormatter, 
    )
    subparsers = parser2.add_subparsers(help=short_help)
    parser_short = subparsers.add_parser("short", help=short_help)
    parser_short.add_argument("list", nargs="+", help="文件路径列表，可以是文件或文件夹")
    parser_short.add_argument("-o", "--outdir", default="", help="输出的目录路径，如果不填，则全部覆盖原始文件")
    parser_short.add_argument("-v", "--verbose", action="store_true", help="是否打印文本，说明删除了哪些页面里的文本块")

    if len(argv) < 2:
        # Prints the help text and exits.
        parser.parse_args(["-h"])
    elif argv[1] == "short":
        args = parser2.parse_args()
        # Preset search settings of the shortcut.
        args.search_text = "社区*尊重版权"
        args.search_type = "fnmatch"
        args.fullmatch = False
    else:
        args = parser.parse_args()

    # nargs="+" guarantees at least one path, so no emptiness check is needed.
    pathlist = args.list
    outdir = args.outdir
    search_text = args.search_text
    search_type = args.search_type
    fullmatch = args.fullmatch
    verbose = args.verbose

    def process(pdfpath):
        """Process one PDF file and print a short report."""
        if outdir:
            if isabs(pdfpath):
                outpath = join(outdir, basename(pdfpath))
            else:
                # Keep the relative folder structure below the output folder.
                outpath = join(outdir, pdfpath)
                if dirname(pdfpath):
                    makedirs(dirname(outpath), exist_ok=True)
        else:
            outpath = ""
        print("-" * get_terminal_size().columns)
        print("处理文件：", pdfpath)
        stat = remove_last_text_block(
            search_text, 
            pdfpath, 
            outpath, 
            search_type=search_type, 
            fullmatch=fullmatch, 
            verbose=verbose, 
        )
        count = sum(map(len, stat.values()))
        print("删除文本块数：", count)
        if count:
            print("输出文件：", outpath or pdfpath)
        else:
            print("文件不做改动")

    if outdir:
        makedirs(outdir, exist_ok=True)
        # Made absolute because the working directory changes below.
        outdir = abspath(outdir)
    start_dir = getcwd()
    for p in pathlist:
        if isdir(p):
            chdir(p)
            try:
                for pdfpath in glob("**/*.pdf", recursive=True):
                    process(pdfpath)
            finally:
                # Return to the starting folder so that the remaining
                # relative paths of the list still resolve correctly.
                chdir(start_dir)
        elif isfile(p):
            process(p)
        else:
            print("！！！跳过：", p)
	#!/usr/bin/env python3
	# coding: utf-8

	# Reference:
	# https://github.com/pymupdf/PyMuPDF
	# https://pymupdf.readthedocs.io/en/latest/

	__author__ = "ChenyangGao <https://chenyanggao.github.io/>"
	__version__ = (0, 0, 2)
	__all__ = ["remove_text_blocks", "remove_last_text_block"]

	try:
	import fitz # type: ignore
	except ImportError:
	raise SystemExit("😄 请先安装 pymupdf，请在命令行执行\n pip install pymupdf")

	from enum import IntEnum
	from fnmatch import translate
	from os import PathLike
	from re import Pattern, compile as re_compile
	from typing import Any, Callable, NamedTuple, Union


	class EnumSearchType(IntEnum):
	    """Ways in which a textual search term can be interpreted."""
	    plain = 1    # literal text
	    fnmatch = 2  # Unix shell wildcard pattern
	    regexp = 3   # regular expression


	class Block(NamedTuple):
	    """One entry of the block list returned by ``page.get_text_blocks()``.

	    The field order matches the 7-tuple that ``remove_text_blocks`` unpacks
	    from every entry of that list.
	    """
	    # x0, y0, x1, y1 are the values handed to ``fitz.Rect`` when the block
	    # is redacted — presumably the corners of the block's bounding box
	    # (confirm against the PyMuPDF ``Page.get_text`` documentation).
	    x0: float
	    y0: float
	    x1: float
	    y1: float
	    # Value passed to the match predicate (the text, for text blocks).
	    content: Any
	    # Carried through unchanged for reporting; not interpreted here.
	    block_no: int
	    # Compared with ``fitz.TEXT_OUTPUT_TEXT`` to keep only text blocks.
	    block_type: int


	def ensure_enum(val, cls, /):
	    """Coerce `val` into a member of the enum class `cls`.

	    :param val: a member of `cls`, a member name, or a member value
	    :param cls: the enum class to convert to

	    :return: the matching member of `cls`
	    :raises ValueError: if `val` is neither a member name nor a member value
	    """
	    if not isinstance(val, cls):
	        try:
	            # Lookup by member name first ...
	            val = cls[val]
	        except (KeyError, TypeError):
	            # ... then fall back to lookup by member value.
	            val = cls(val)
	    return val


	def fnmatch_2_regexp(pat: str) -> str:
	    """Turn a Unix shell wildcard pattern into a bare regular expression.

	    :param pat: shell-style wildcard pattern
	    :return: `translate(pat)` without its first 4 and last 3 characters
	    """
	    # NOTE(review): assumes `translate` wraps its result in a 4-character
	    # prefix and a 3-character suffix (such as "(?s:" ... ")\Z") — confirm
	    # when moving to another Python version.
	    prefix_len, suffix_len = 4, 3
	    wrapped = translate(pat)
	    return wrapped[prefix_len:-suffix_len]


	def _make_check(
	    search_text: Union[str, Pattern, Callable],
	    search_type: Union[int, str, EnumSearchType] = EnumSearchType.plain,
	    fullmatch: bool = False,
	) -> Callable[[str], bool]:
	    """Build the predicate that decides whether a piece of text matches.

	    :param search_text: a callable (returned unchanged), a `str` interpreted
	        according to `search_type`, or any other object, which is used as if
	        it were a compiled regular expression
	    :param search_type: member, name or value of `EnumSearchType`; only
	        consulted when `search_text` is a `str`
	    :param fullmatch: True to require the whole text to match, False
	        (default) to accept a match anywhere inside the text

	    :return: function that takes the text and tells whether it matches
	    """
	    if callable(search_text):
	        return search_text

	    if isinstance(search_text, str):
	        mode: EnumSearchType = ensure_enum(search_type, EnumSearchType)
	        if mode is EnumSearchType.plain:
	            def matches_plain(text):
	                return search_text == text if fullmatch else search_text in text
	            return matches_plain
	        if mode is EnumSearchType.fnmatch:
	            pattern = re_compile(fnmatch_2_regexp(search_text))
	        else:
	            pattern = re_compile(search_text)
	    else:
	        # Neither callable nor str: treated as an already compiled pattern.
	        pattern = search_text

	    def matches_pattern(text):
	        probe = pattern.fullmatch if fullmatch else pattern.search
	        return probe(text) is not None
	    return matches_pattern


	def _remove_rect(page, x0, y0, x1, y1):
	    """Redact the rectangle ``(x0, y0, x1, y1)`` on `page`.

	    A redaction annotation covering the rectangle is added and the page's
	    redactions are applied right away, with the image handling set to
	    ``fitz.PDF_REDACT_IMAGE_NONE``.

	    :param page: PyMuPDF page to modify
	    :param x0: first x coordinate passed to ``fitz.Rect``
	    :param y0: first y coordinate passed to ``fitz.Rect``
	    :param x1: second x coordinate passed to ``fitz.Rect``
	    :param y1: second y coordinate passed to ``fitz.Rect``
	    """
	    # Reference: https://pymupdf.readthedocs.io/en/latest/rect.html
	    # The annotation object returned by add_redact_annot is not needed.
	    page.add_redact_annot(fitz.Rect(x0, y0, x1, y1))
	    page.apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE)


	def remove_text_blocks(
	    search_text: Union[str, Pattern, Callable[[str], bool]],
	    inpath: Union[bytes, str, PathLike],
	    outpath: Union[bytes, str, PathLike] = "",
	    search_type: Union[int, str, EnumSearchType] = EnumSearchType.plain,
	    count: int = 0,
	    fullmatch: bool = False,
	    reverse: bool = False,
	    blocks_slice: Union[int, slice, None] = None,
	    verbose: bool = False,
	) -> dict[int, list[Block]]:
	    """Delete the text blocks of a PDF document that satisfy the search condition.

	    :param search_text: text or pattern to look for; when it is a compiled
	        regular expression or a function, `search_type` is ignored
	    :param inpath: path of the input document
	    :param outpath: path of the output document; with an empty string
	        (default) the changes are saved incrementally into the input document
	    :param search_type: how a `str` `search_text` is interpreted:
	        plain: literal text (default)
	        fnmatch: Unix shell wildcard pattern
	        regexp: regular expression
	    :param count: maximum number of blocks deleted per page; a value less
	        than or equal to 0 (default) means no limit
	    :param fullmatch: whether the whole block text has to match, default False
	    :param reverse: whether the blocks are examined from last to first,
	        default False; has no effect when `blocks_slice` is an int
	    :param blocks_slice: index or slice restricting which blocks of each page
	        are examined; None (default) examines all of them
	    :param verbose: whether to print a line for every deleted block,
	        default False

	    :return: mapping from 0-based page index to the blocks deleted on that
	        page; pages without deletions have no entry. Nothing is saved when
	        the mapping is empty.
	    """
	    check = _make_check(search_text, search_type, fullmatch)
	    stat: dict[int, list[Block]] = {}  # deleted blocks per page index
	    # Opening the document as a context manager guarantees that it is closed
	    # again, also when extraction, redaction or saving raises.
	    with fitz.open(inpath) as pdf:
	        for i, page in enumerate(pdf):
	            # https://pymupdf.readthedocs.io/en/latest/page.html#Page.get_text
	            # Reference: https://pymupdf.readthedocs.io/en/latest/textpage.html#textpage
	            blocks = page.get_text_blocks()  # same as page.get_text("blocks")
	            if not blocks:
	                continue
	            if isinstance(blocks_slice, int):
	                try:
	                    blocks = [blocks[blocks_slice]]
	                except IndexError:
	                    # The page has too few blocks for this index.
	                    continue
	            else:
	                if blocks_slice is not None:
	                    blocks = blocks[blocks_slice]
	                if reverse:
	                    blocks = reversed(blocks)
	            removed: list[Block] = []
	            for x0, y0, x1, y1, content, block_no, block_type in blocks:
	                # NOTE(review): TEXT_OUTPUT_TEXT is used as the marker of a
	                # text block — confirm it equals the block type PyMuPDF
	                # reports for text.
	                if block_type != fitz.TEXT_OUTPUT_TEXT:
	                    continue
	                if not check(content):
	                    continue
	                _remove_rect(page, x0, y0, x1, y1)
	                block = Block(x0, y0, x1, y1, content, block_no, block_type)
	                if verbose:
	                    print("[\x1b[38;5;2m\x1b[48;5;1m\x1b[1mDELETED\x1b[0m]", page, "=>", block)
	                removed.append(block)
	                if count > 0 and len(removed) == count:
	                    break
	            if removed:
	                stat[i] = removed
	        if stat:
	            if outpath:
	                pdf.save(outpath)
	            else:
	                pdf.saveIncr()
	    return stat


	def remove_last_text_block(
	    search_text: Union[str, Pattern, Callable[[str], bool]],
	    inpath: Union[bytes, str, PathLike],
	    outpath: Union[bytes, str, PathLike] = "",
	    search_type: Union[int, str, EnumSearchType] = EnumSearchType.plain,
	    fullmatch: bool = False,
	    verbose: bool = False,
	) -> dict[int, list[Block]]:
	    """Delete the final block of every page if it satisfies the search condition.

	    Delegates to `remove_text_blocks` with `blocks_slice=-1`, so only the last
	    entry of each page's block list is examined.

	    :param search_text: text or pattern to look for; when it is a compiled
	        regular expression or a function, `search_type` is ignored
	    :param inpath: path of the input document
	    :param outpath: path of the output document; with an empty string
	        (default) the output overwrites the input document
	    :param search_type: how a `str` `search_text` is interpreted:
	        plain: literal text (default)
	        fnmatch: Unix shell wildcard pattern
	        regexp: regular expression
	    :param fullmatch: whether the whole block text has to match, default False
	    :param verbose: whether to print a line for every deleted block

	    :return: statistics about the deleted text blocks
	    """
	    last_block_only = -1
	    return remove_text_blocks(
	        search_text=search_text,
	        inpath=inpath,
	        outpath=outpath,
	        search_type=search_type,
	        fullmatch=fullmatch,
	        verbose=verbose,
	        blocks_slice=last_block_only,
	    )


	if __name__ == "__main__":
	    # Command line entry point: remove the matching last block of every page
	    # from each given PDF file, or from every PDF found below a given folder.
	    from argparse import ArgumentParser, RawTextHelpFormatter
	    from glob import glob
	    from os import chdir, getcwd, makedirs
	    from os.path import abspath, basename, dirname, isabs, isdir, isfile, join
	    # shutil's variant falls back to a default size instead of raising when
	    # stdout is not attached to a terminal (e.g. output piped to a file).
	    from shutil import get_terminal_size
	    from sys import argv

	    short_help = "图灵社区/异步社区 PDF 页面最底部文本水印去除工具"

	    # The "short" shortcut is only advertised in the epilog. Registering it
	    # as a sub-command on this parser would make argparse hand the last path
	    # to the sub-command slot whenever more than one path is given.
	    parser = ArgumentParser(
	        description="PDF 页面最底部文本水印去除工具",
	        formatter_class=RawTextHelpFormatter,
	        epilog="快捷工具:\n    short    " + short_help,
	    )
	    parser.add_argument("list", nargs="+", help="文件路径列表，可以是文件或文件夹")
	    parser.add_argument("-o", "--outdir", default="", help="输出的目录路径，如果不填，则全部覆盖原始文件")
	    parser.add_argument("-t", "--search_text", required=True, help="搜索的文本或模式")
	    parser.add_argument("-s", "--search_type", choices=("plain", "fnmatch", "regexp"), default="plain", help="""\
	搜索类型，可取值：
	plain: 纯文本（默认）
	fnmatch: Unix Shell 通配符模式
	regexp: 正则表达式""")
	    parser.add_argument("-f", "--fullmatch", action="store_true", help="是否整块匹配")
	    parser.add_argument("-v", "--verbose", action="store_true", help="是否打印文本，说明删除了哪些页面里的文本块")

	    # Separate parser for the shortcut form: `short [-o OUTDIR] [-v] PATH ...`
	    parser2 = ArgumentParser(
	        description=short_help,
	        formatter_class=RawTextHelpFormatter,
	    )
	    subparsers = parser2.add_subparsers(help=short_help)
	    parser_short = subparsers.add_parser("short", help=short_help)
	    parser_short.add_argument("list", nargs="+", help="文件路径列表，可以是文件或文件夹")
	    parser_short.add_argument("-o", "--outdir", default="", help="输出的目录路径，如果不填，则全部覆盖原始文件")
	    parser_short.add_argument("-v", "--verbose", action="store_true", help="是否打印文本，说明删除了哪些页面里的文本块")

	    if len(argv) < 2:
	        # Prints the help text and exits.
	        parser.parse_args(["-h"])
	    elif argv[1] == "short":
	        args = parser2.parse_args()
	        # Preset search settings of the shortcut; the wildcard between the
	        # two words had been lost in this copy of the script.
	        args.search_text = "社区*尊重版权"
	        args.search_type = "fnmatch"
	        args.fullmatch = False
	    else:
	        args = parser.parse_args()

	    # nargs="+" guarantees at least one path, so no emptiness check is needed.
	    pathlist = args.list
	    outdir = args.outdir
	    search_text = args.search_text
	    search_type = args.search_type
	    fullmatch = args.fullmatch
	    verbose = args.verbose

	    def process(pdfpath):
	        """Process one PDF file and print a short report."""
	        if outdir:
	            if isabs(pdfpath):
	                outpath = join(outdir, basename(pdfpath))
	            else:
	                # Keep the relative folder structure below the output folder.
	                outpath = join(outdir, pdfpath)
	                if dirname(pdfpath):
	                    makedirs(dirname(outpath), exist_ok=True)
	        else:
	            outpath = ""
	        print("-" * get_terminal_size().columns)
	        print("处理文件：", pdfpath)
	        stat = remove_last_text_block(
	            search_text,
	            pdfpath,
	            outpath,
	            search_type=search_type,
	            fullmatch=fullmatch,
	            verbose=verbose,
	        )
	        count = sum(map(len, stat.values()))
	        print("删除文本块数：", count)
	        if count:
	            print("输出文件：", outpath or pdfpath)
	        else:
	            print("文件不做改动")

	    if outdir:
	        makedirs(outdir, exist_ok=True)
	        # Made absolute because the working directory changes below.
	        outdir = abspath(outdir)
	    start_dir = getcwd()
	    for p in pathlist:
	        if isdir(p):
	            chdir(p)
	            try:
	                # Recursive search for PDF files below the folder.
	                for pdfpath in glob("**/*.pdf", recursive=True):
	                    process(pdfpath)
	            finally:
	                # Return to the starting folder so that the remaining
	                # relative paths of the list still resolve correctly.
	                chdir(start_dir)
	        elif isfile(p):
	            process(p)
	        else:
	            print("！！！跳过：", p)