Skip to content

Instantly share code, notes, and snippets.

@AMasetti
Last active June 11, 2024 18:38
Show Gist options
Save AMasetti/906b3560d7bc0cba7e20e5fb9cd3e89c to your computer and use it in GitHub Desktop.
PDF Utils
import fitz # PyMuPDF
import re
def remove_text_containing_string(pdf_path, output_path, target_string):
    """Redact every text span containing ``target_string`` and save a copy.

    Opens the PDF at ``pdf_path``, walks the text blocks of each page, adds a
    redaction annotation over the bounding box of every span whose text
    contains ``target_string``, applies the redactions page by page, and
    writes the result to ``output_path``.

    Matching is a case-sensitive substring test against each span's text, so
    a target that is split across several spans is not detected.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path the modified PDF is written to.
        target_string: Non-empty substring to search for in each span.

    Raises:
        ValueError: If ``target_string`` is empty. An empty string is a
            substring of every span, so it would redact all text.
    """
    if not target_string:
        raise ValueError("target_string must be a non-empty string")

    document = fitz.open(pdf_path)
    try:
        for page_num in range(document.page_count):
            page = document.load_page(page_num)
            for block in page.get_text("dict")["blocks"]:
                if block["type"] != 0:  # only type 0 blocks are text blocks
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        if target_string in span["text"]:
                            print(f"Removing text containing '{target_string}' from page {page_num + 1}")
                            # Mark the span's bounding box for redaction.
                            page.add_redact_annot(fitz.Rect(span["bbox"]))
            # Redaction annotations only take effect once applied on the page.
            page.apply_redactions()
        # Save the modified PDF to a file.
        document.save(output_path)
    finally:
        # Release the file handle even if extraction, redaction or save fails.
        document.close()
# Example invocation settings: input PDF, output PDF, and the text whose
# containing spans are redacted.
pdf_path = "A21-Labs.pdf"
output_path = "A21-Labs_inter.pdf"
target_string = """Text to redact"""

# Run only when executed as a script, so that importing this module does not
# read or write any PDF as a side effect.
if __name__ == "__main__":
    remove_text_containing_string(pdf_path, output_path, target_string)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment