Skip to content

Instantly share code, notes, and snippets.

@trilobiet
Last active September 23, 2024 13:39
Show Gist options
  • Save trilobiet/2527a613351b8b37c3aaa88242bd1854 to your computer and use it in GitHub Desktop.
Save trilobiet/2527a613351b8b37c3aaa88242bd1854 to your computer and use it in GitHub Desktop.
Load web pages from a list of URLs and save them as a single PDF file
'''
Read a list of urls, load the pages and save as a single pdf.
File urls.txt must be available alongside this script.
After running, resulting pdf is available as 'generated.pdf'
'''
from weasyprint import HTML
# cli: pip3 install weasyprint
from PyPDF2 import PdfMerger
# cli: pip3 install PyPDF2
from io import BytesIO
def run():
    """Merge every page listed in urls.txt into a single PDF.

    Reads one URL per line from 'urls.txt' (must sit alongside this
    script), renders each page to PDF, and writes the concatenated
    result to 'generated.pdf' in the working directory.
    """
    merger = PdfMerger(strict=False)
    print("running...")
    # Context manager guarantees the input file is closed even if a
    # render/append raises partway through the list.
    with open("urls.txt") as urls:
        for url in urls.read().splitlines():
            merger.append(pdfify(url))
            print(f'processed {url}')
    # The original left this handle open; without an explicit close the
    # merged PDF may not be fully flushed to disk when the script exits.
    with open("generated.pdf", "wb") as output:
        merger.write(output)
    merger.close()  # release the per-source streams held by the merger
    print("ready!")
def pdfify(url):
    """Render the web page at *url* to PDF and return it as a BytesIO.

    The stream is positioned at offset 0, ready for reading. The
    original wrote into an empty BytesIO and returned it with the
    cursor at EOF, so a consumer that reads without seeking first
    would get zero bytes; constructing BytesIO from the rendered
    bytes starts the stream at the beginning.
    """
    pdf_bytes = HTML(url).write_pdf()  # weasyprint returns raw PDF bytes
    return BytesIO(pdf_bytes)
# Entry point: run only when executed as a script, not when imported.
if __name__ == "__main__":
    run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment