Last active
June 11, 2024 18:38
-
-
Save AMasetti/906b3560d7bc0cba7e20e5fb9cd3e89c to your computer and use it in GitHub Desktop.
PDF Utils
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import fitz # PyMuPDF | |
import re | |
def remove_text_containing_string(pdf_path, output_path, target_string):
    """Redact every text span containing ``target_string`` and save the result.

    Each page of the PDF at ``pdf_path`` is scanned span by span. Any span
    whose text contains ``target_string`` gets a redaction annotation over
    its bounding box; the redactions are then applied to the page and the
    modified document is written to ``output_path``.

    Matching is done per span, so a target that is split across several
    spans (for example by a font or style change) is not detected.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path the redacted PDF is written to.
        target_string: Non-empty substring to look for in each span's text.

    Raises:
        ValueError: If ``target_string`` is empty. An empty string is a
            substring of every span and would redact all text in the file.
    """
    if not target_string:
        raise ValueError("target_string must be a non-empty string")

    # The context manager closes the document even when scanning, redacting
    # or saving raises, so the file handle is never leaked.
    with fitz.open(pdf_path) as document:
        for page_num, page in enumerate(document, start=1):
            marked = False
            for block in page.get_text("dict")["blocks"]:
                if block["type"] != 0:  # Type 0 indicates a text block
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        if target_string not in span["text"]:
                            continue
                        print(f"Removing text containing '{target_string}' from page {page_num}")
                        # Mark the span's bounding box for redaction.
                        page.add_redact_annot(fitz.Rect(span["bbox"]))
                        marked = True
            # Apply once per page, after all matches on it have been marked,
            # and only when this function actually added a redaction.
            if marked:
                page.apply_redactions()

        # Save the modified PDF to a file
        document.save(output_path)
# Example run: redact a sample phrase from a lab report and write the result
# to a separate file, leaving the input PDF untouched.
pdf_path, output_path = "A21-Labs.pdf", "A21-Labs_inter.pdf"
target_string = "Text to redact"

remove_text_containing_string(
    pdf_path=pdf_path,
    output_path=output_path,
    target_string=target_string,
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment