Created
December 28, 2024 20:40
-
-
Save Jeel-Shah/e384369ffe6dae85678d74948b9a9c83 to your computer and use it in GitHub Desktop.
How to remove a "text-based" watermark from a PDF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## This code was generated by ChatGPT-4o after several iterations.
from PyPDF2 import PdfReader, PdfWriter | |
from PyPDF2.generic import StreamObject, ArrayObject, NameObject | |
def _strip_from_stream(stream, needle):
    """Return a new stream holding *stream*'s decoded data with *needle* removed."""
    cleaned = StreamObject()
    # Work on the raw bytes: content streams may contain bytes that are not
    # valid UTF-8 (binary string operands, inline images), and a lossy
    # decode/encode round trip would silently drop them.
    # NOTE(review): `_data` is a private PyPDF2 attribute, kept from the
    # original implementation -- confirm against the installed version.
    cleaned._data = stream.get_data().replace(needle, b"")
    return cleaned


def _strip_from_page(page, needle):
    """Rewrite the page's /Contents entry with *needle* removed from each stream."""
    if "/Contents" not in page:
        return

    contents = page["/Contents"]
    if isinstance(contents, ArrayObject):
        # Multiple content streams: build all replacements first so that
        # /Contents is only reassigned once every stream was processed.
        replacement = ArrayObject(
            [_strip_from_stream(item.get_object(), needle) for item in contents]
        )
    else:
        # Single content stream.
        replacement = _strip_from_stream(contents.get_object(), needle)
    page[NameObject("/Contents")] = replacement


def remove_watermark(input_pdf, output_pdf, watermark_text=r"your_watermark"):
    """Copy a PDF while deleting a literal text fragment from its page content.

    Every page of ``input_pdf`` is read, each occurrence of ``watermark_text``
    (encoded as UTF-8) is removed from the decoded bytes of the page's content
    stream(s), and the resulting pages are written to ``output_pdf``. Only
    text that appears literally in the decoded stream data is matched.

    Args:
        input_pdf: Source PDF, passed straight to ``PdfReader``.
        output_pdf: Path of the file to create; opened in ``"wb"`` mode.
        watermark_text: The exact string to remove from the content streams.

    Progress and per-page errors are reported with ``print``. A page whose
    content cannot be rewritten is kept unmodified instead of being dropped,
    so the output keeps the same pages as the input.
    """
    reader = PdfReader(input_pdf)
    writer = PdfWriter()
    total_pages = len(reader.pages)
    needle = watermark_text.encode("utf-8")

    for page_num, page in enumerate(reader.pages, start=1):
        print(f"Processing page {page_num}/{total_pages}")
        try:
            _strip_from_page(page, needle)
        except Exception as e:
            # /Contents is reassigned only after all of the page's streams
            # were rewritten, so on failure the page is still the original.
            print(f"Error on page {page_num}: {e}")

        try:
            # Add the (possibly modified) page to the writer.
            writer.add_page(page)
        except Exception as e:
            # Stay best-effort, as before: report and move on.
            print(f"Error on page {page_num}: {e}")

    # Save the modified PDF
    with open(output_pdf, "wb") as f:
        writer.write(f)
    print(f"Watermark '{watermark_text}' removed and saved as {output_pdf}")
# Replace these paths with your actual file paths
input_pdf_path = "/path/to/pdf.pdf"
output_pdf_path = "/path/to/new.pdf"

# Only run the conversion when executed as a script, so that importing this
# module (e.g. to reuse remove_watermark) does not touch any files.
if __name__ == "__main__":
    remove_watermark(input_pdf_path, output_pdf_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment