AMasetti · June 11, 2024 18:38
diff --git a/pdf_utils.py b/pdf_utils.py
 import fitz  # PyMuPDF
 import re

 def remove_text_containing_string(pdf_path, output_path, target_string):
    """Redact every text span that contains ``target_string`` and save a copy.

    Each page's text is read as a dict of blocks, lines and spans. Any span
    whose text contains ``target_string`` has its whole bounding box redacted,
    so the entire span is removed, not only the matching substring. Matching
    is done per span: a target that is split across several spans is not
    found, and an empty ``target_string`` matches every span.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path the modified PDF is written to.
        target_string: Substring to look for in each span's text.
    """
    # The context manager closes the document even if a page fails to process.
    with fitz.open(pdf_path) as document:
        for page_num in range(document.page_count):
            page = document.load_page(page_num)
            blocks = page.get_text("dict")["blocks"]
            has_redactions = False

            for block in blocks:
                if block["type"] != 0:  # Only type 0 blocks carry text
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        if target_string in span["text"]:
                            print(f"Removing text containing '{target_string}' from page {page_num + 1}")
                            # Mark the span's bounding box for redaction
                            page.add_redact_annot(fitz.Rect(span["bbox"]))
                            has_redactions = True

            # Apply all marks for this page in one pass instead of rewriting
            # the page content once per matching span.
            if has_redactions:
                page.apply_redactions()

        # Save the modified PDF to a file
        document.save(output_path)

 # Example run: redact the sample phrase from the input PDF into a new file.
 pdf_path, output_path = "A21-Labs.pdf", "A21-Labs_inter.pdf"
 target_string = """Text to redact"""
 remove_text_containing_string(
     pdf_path=pdf_path,
     output_path=output_path,
     target_string=target_string,
 )
	import fitz # PyMuPDF
	import re

	def remove_text_containing_string(pdf_path, output_path, target_string):
	    """Redact every text span that contains ``target_string`` and save a copy.

	    Each page's text is read as a dict of blocks, lines and spans. Any span
	    whose text contains ``target_string`` has its whole bounding box redacted,
	    so the entire span is removed, not only the matching substring. Matching
	    is done per span: a target that is split across several spans is not
	    found, and an empty ``target_string`` matches every span.

	    Args:
	        pdf_path: Path of the PDF to read.
	        output_path: Path the modified PDF is written to.
	        target_string: Substring to look for in each span's text.
	    """
	    # The context manager closes the document even if a page fails to process.
	    with fitz.open(pdf_path) as document:
	        for page_num in range(document.page_count):
	            page = document.load_page(page_num)
	            blocks = page.get_text("dict")["blocks"]
	            has_redactions = False

	            for block in blocks:
	                if block["type"] != 0:  # Only type 0 blocks carry text
	                    continue
	                for line in block["lines"]:
	                    for span in line["spans"]:
	                        if target_string in span["text"]:
	                            print(f"Removing text containing '{target_string}' from page {page_num + 1}")
	                            # Mark the span's bounding box for redaction
	                            page.add_redact_annot(fitz.Rect(span["bbox"]))
	                            has_redactions = True

	            # Apply all marks for this page in one pass instead of rewriting
	            # the page content once per matching span.
	            if has_redactions:
	                page.apply_redactions()

	        # Save the modified PDF to a file
	        document.save(output_path)

	# Example run: redact the sample phrase from the input PDF into a new file.
	pdf_path, output_path = "A21-Labs.pdf", "A21-Labs_inter.pdf"
	target_string = """Text to redact"""
	remove_text_containing_string(
	    pdf_path=pdf_path,
	    output_path=output_path,
	    target_string=target_string,
	)