Skip to content

Instantly share code, notes, and snippets.

@stefanschmidt
Created March 18, 2023 02:56
Show Gist options
  • Save stefanschmidt/d8ce63673f0e7c039dfb2b5352285d04 to your computer and use it in GitHub Desktop.
Save stefanschmidt/d8ce63673f0e7c039dfb2b5352285d04 to your computer and use it in GitHub Desktop.
Remove link borders and watermark from arXiv papers
import pikepdf
import re
def remove_link_borders(pdf):
    """Set the border width of every annotation on every page to zero.

    Each entry of a page's ``/Annots`` array is visited. Its border-style
    dictionary ``/BS`` gets ``/W`` (width) set to 0; if the annotation has
    no ``/BS`` entry one is created, instead of raising ``KeyError``. An
    existing legacy ``/Border`` array is zeroed as well so that both ways
    of describing a border agree.

    Args:
        pdf: An opened document exposing ``pages``; each page supports
            ``in`` and item access by PDF name (e.g. ``'/Annots'``).

    Returns:
        The same ``pdf`` object, modified in place.
    """
    removed = 0
    for page in pdf.pages:
        if '/Annots' not in page:
            continue
        for annot in page['/Annots']:
            if '/BS' in annot:
                annot['/BS']['/W'] = 0
            else:
                # No border-style dictionary yet: add one with zero width.
                annot['/BS'] = {'/W': 0}
            if '/Border' in annot:
                # Legacy form: [h-radius, v-radius, width].
                annot['/Border'] = [0, 0, 0]
            removed += 1
    print(str(removed) + ' link borders removed')
    return pdf
def remove_watermark(pdf):
    """Remove the arXiv watermark text from the first page.

    The watermark is a text string such as
    ``arXiv:2108.07732v1 [cs.PL] 16 Aug 2021``; see
    https://info.arxiv.org/help/arxiv_identifier_for_services.html

    The first page's content stream is parsed, every instruction whose
    first operand is a string matching the watermark pattern is dropped,
    and the remaining instructions are written back as the page contents.
    The page is left untouched when nothing matches, and a document with
    no pages is returned unchanged.

    Args:
        pdf: An opened ``pikepdf.Pdf``.

    Returns:
        The same ``pdf`` object, modified in place.
    """
    watermark = r'^arXiv:.*\d{4}$'
    if len(pdf.pages) == 0:
        return pdf
    page = pdf.pages[0]

    # Build a filtered list rather than deleting from the list being
    # iterated; deleting in place skips the instruction after each removal.
    kept = []
    removed = 0
    for instruction in pikepdf.parse_content_stream(page):
        operands, _operator = instruction
        if (
            operands
            and isinstance(operands[0], pikepdf.String)
            and re.search(watermark, str(operands[0]))
        ):
            removed += 1
            print('watermark removed')
            continue
        kept.append(instruction)

    if removed:
        new_content_stream = pikepdf.unparse_content_stream(kept)
        page.Contents = pdf.make_stream(new_content_stream)
    return pdf
# Script driver: read input.pdf, strip annotation borders and the arXiv
# watermark, then write the result to output.pdf.
input_pdf = 'input.pdf'
output_pdf = 'output.pdf'
# The context manager closes the underlying file even if a step fails.
with pikepdf.Pdf.open(input_pdf) as pdf:
    pdf = remove_link_borders(pdf)
    pdf = remove_watermark(pdf)
    print('saving to ' + output_pdf)
    pdf.save(output_pdf)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment