Skip to content

Instantly share code, notes, and snippets.

@Jeel-Shah
Created December 28, 2024 20:40
Show Gist options
  • Save Jeel-Shah/e384369ffe6dae85678d74948b9a9c83 to your computer and use it in GitHub Desktop.
Save Jeel-Shah/e384369ffe6dae85678d74948b9a9c83 to your computer and use it in GitHub Desktop.
How to remove a "text-based" watermark from a PDF
## This code was generated by ChatGPT-4o after several iterations.
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.generic import StreamObject, ArrayObject, NameObject
def remove_watermark(input_pdf, output_pdf, watermark_text=r"your_watermark"):
    """Copy a PDF while deleting a text watermark from every page content stream.

    Each page's ``/Contents`` entry (a single stream or an array of streams)
    is replaced by new stream objects whose data has every occurrence of
    *watermark_text* removed. The result is written to *output_pdf*.

    Args:
        input_pdf: Path (or file object) accepted by ``PdfReader``.
        output_pdf: Path of the PDF file to create or overwrite.
        watermark_text: Text to delete, exactly as it appears inside the
            decoded content stream.

    Note:
        Processing is best-effort: if a page raises, the error is printed and
        that page is not added to the output document.
    """
    reader = PdfReader(input_pdf)
    writer = PdfWriter()

    # Match on raw bytes. Content streams are binary data, and a round trip
    # through str with errors="ignore" would silently drop every byte that is
    # not valid UTF-8 (e.g. high-byte strings or inline image data).
    needle = watermark_text.encode("utf-8")
    total_pages = len(reader.pages)

    for page_num, page in enumerate(reader.pages, start=1):
        try:
            print(f"Processing page {page_num}/{total_pages}")

            # Pages without a content stream are copied through untouched.
            if "/Contents" in page:
                contents = page["/Contents"]
                if isinstance(contents, ArrayObject):
                    # Multiple content streams: clean each one separately.
                    cleaned = ArrayObject(
                        [
                            _strip_watermark(item.get_object(), needle)
                            for item in contents
                        ]
                    )
                else:
                    # Single content stream.
                    cleaned = _strip_watermark(contents.get_object(), needle)
                page[NameObject("/Contents")] = cleaned

            # Add the (possibly modified) page to the writer.
            writer.add_page(page)
        except Exception as e:
            print(f"Error on page {page_num}: {e}")
            continue

    # Save the modified PDF.
    with open(output_pdf, "wb") as f:
        writer.write(f)
    print(f"Watermark '{watermark_text}' removed and saved as {output_pdf}")


def _strip_watermark(stream, needle):
    """Return a new StreamObject with *stream*'s decoded data minus *needle* (bytes)."""
    cleaned = StreamObject()
    cleaned._data = stream.get_data().replace(needle, b"")
    return cleaned
# Replace these paths with your actual file paths.
input_pdf_path = "/path/to/pdf.pdf"
output_pdf_path = "/path/to/new.pdf"

# Run the conversion only when this file is executed as a script, so that
# importing the module to reuse remove_watermark() does not try to process
# the placeholder paths above.
if __name__ == "__main__":
    remove_watermark(input_pdf_path, output_pdf_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment