Created
March 9, 2023 00:05
-
-
Save stefanschmidt/1df53a95c05bbb936c75bd2ba48f8899 to your computer and use it in GitHub Desktop.
Remove all metadata, plus any links and text elements matching a given pattern, from a PDF document
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pikepdf | |
import re | |
def remove_metadata(pdf):
    '''Remove the XMP metadata stream and the document info dictionary.

    pdf: the opened document; it is modified in place.

    Returns the same pdf object so calls can be chained.
    '''
    # /Metadata is an optional entry of the document catalog, so only delete
    # it when it is actually there instead of failing on PDFs without XMP.
    if '/Metadata' in pdf.Root:
        del pdf.Root['/Metadata']
        print('XMP metadata removed')
    else:
        print('no XMP metadata found')
    del pdf.docinfo
    print('document info metadata removed')
    return pdf
def remove_link(pdf, url):
    '''Remove every URI link annotation whose target matches a pattern.

    pdf: the opened document; its pages' /Annots arrays are edited in place.
    url: regular expression searched (re.search) in each link's /URI value.

    Pages without annotations, and annotations that carry no /A action or
    no /URI entry, are left untouched.

    Returns the same pdf object so calls can be chained.
    '''
    pattern = re.compile(url)
    removed = 0
    for page in pdf.pages:
        annots = page.get('/Annots')
        if annots is None:
            continue
        # Walk the array back to front: deleting an entry only shifts the
        # entries after it, which have already been visited, so no
        # annotation is skipped and no index goes stale.
        for index in range(len(annots) - 1, -1, -1):
            action = annots[index].get('/A')
            if action is None:
                continue
            if action.get('/S') == '/URI':
                target = action.get('/URI')
                if target is not None and pattern.search(str(target)):
                    del annots[index]
                    removed += 1
    print(str(removed) + ' links removed')
    return pdf
def remove_text(pdf, txt):
    '''Remove every text-showing instruction whose string matches a pattern.

    pdf: the opened document; matching pages get a new content stream.
    txt: regular expression searched (re.search) in the first operand of
         each content stream instruction, when that operand is a string.

    Returns the same pdf object so calls can be chained.
    '''
    pattern = re.compile(txt)
    show_on_next_line = pikepdf.Operator("'")
    move_to_next_line = pikepdf.Operator('T*')
    removed = 0
    for page in pdf.pages:
        kept = []
        dropped = 0
        for operands, operator in pikepdf.parse_content_stream(page):
            is_match = (
                operands
                and isinstance(operands[0], pikepdf.String)
                and pattern.search(str(operands[0]))
            )
            if not is_match:
                kept.append((operands, operator))
                continue
            # Drop the whole instruction. Clearing only the operands would
            # leave an operator without its required argument behind, which
            # is a malformed content stream.
            dropped += 1
            if operator == show_on_next_line:
                # The ' operator also advances to the next line; keep that
                # part so the text that follows does not change position.
                kept.append(([], move_to_next_line))
        if dropped:
            # Only rewrite pages that actually changed.
            new_content_stream = pikepdf.unparse_content_stream(kept)
            page.Contents = pdf.make_stream(new_content_stream)
            removed += dropped
    print(str(removed) + ' text elements removed')
    return pdf
# the actual link which is clickable but invisible
# (dots are escaped so they match a literal '.' and not any character)
url = r'^http://www\.foobar\.com/$'
# the text element which is overlaid on the link
txt = r'^www\.foobar\.com$'
input_pdf = 'input.pdf'
output_pdf = 'output.pdf'

# Only run the clean-up when executed as a script, so the helper functions
# can be imported without touching any file.
if __name__ == '__main__':
    # The context manager closes the input file even if a step fails.
    with pikepdf.Pdf.open(input_pdf) as pdf:
        pdf = remove_metadata(pdf)
        pdf = remove_link(pdf, url)
        pdf = remove_text(pdf, txt)
        print('saving to ' + output_pdf)
        pdf.save(output_pdf)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment