Created
March 18, 2023 02:56
-
-
Save stefanschmidt/d8ce63673f0e7c039dfb2b5352285d04 to your computer and use it in GitHub Desktop.
Remove link borders and watermark from arXiv papers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pikepdf | |
import re | |
def remove_link_borders(pdf):
    '''Set the border width of every annotation on every page to zero.

    pdf: an opened pikepdf.Pdf (anything exposing ``pages`` whose items
    support ``'/Annots' in page`` and ``page['/Annots']`` also works).

    Returns the same pdf object, modified in place. Prints the number of
    annotations that were processed.
    '''
    removed = 0
    for page in pdf.pages:
        if '/Annots' not in page:
            continue
        for annot in page['/Annots']:
            if '/BS' in annot:
                annot['/BS']['/W'] = 0
            else:
                # Annotations that only carry a legacy /Border array (or no
                # border entry at all) have no /BS dictionary; create one
                # instead of failing with a KeyError. pikepdf converts the
                # plain dict into a PDF dictionary on assignment.
                annot['/BS'] = {'/W': 0}
            removed += 1
    print(str(removed) + ' link borders removed')
    return pdf
def remove_watermark(pdf):
    '''Remove the arXiv watermark from the first page.

    The watermark is a text string such as
    ``arXiv:2108.07732v1 [cs.PL] 16 Aug 2021``; see
    https://info.arxiv.org/help/arxiv_identifier_for_services.html

    Every content-stream command on the first page whose first operand is
    a PDF string matching that pattern is dropped. 'watermark removed' is
    printed once per dropped command.

    pdf: an opened pikepdf.Pdf.

    Returns the same pdf object, modified in place. A PDF without pages,
    or whose first page contains no matching string, is returned unchanged.
    '''
    watermark = re.compile(r'^arXiv:.*\d{4}$')

    # Nothing to do for a document without pages (pdf.pages[0] would raise).
    if len(pdf.pages) == 0:
        return pdf
    page = pdf.pages[0]

    # Build a new command list rather than deleting from a list while
    # iterating over it, so that consecutive matching commands are all
    # examined and none is skipped.
    kept = []
    removed = 0
    for operands, operator in pikepdf.parse_content_stream(page):
        if (
            operands
            and isinstance(operands[0], pikepdf.String)
            and watermark.search(str(operands[0]))
        ):
            removed += 1
            print('watermark removed')
            continue
        kept.append((operands, operator))

    # Only rewrite the page contents when something was actually removed.
    if removed:
        new_content_stream = pikepdf.unparse_content_stream(kept)
        page.Contents = pdf.make_stream(new_content_stream)
    return pdf
# Paths of the PDF to clean up and of the cleaned copy to write.
input_pdf = 'input.pdf'
output_pdf = 'output.pdf'

# Open the input as a context manager so the underlying file is closed
# once the cleaned copy has been written, even if a step raises.
with pikepdf.Pdf.open(input_pdf) as pdf:
    pdf = remove_link_borders(pdf)
    pdf = remove_watermark(pdf)
    print('saving to ' + output_pdf)
    pdf.save(output_pdf)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment