Last active
September 23, 2024 13:39
-
-
Save trilobiet/2527a613351b8b37c3aaa88242bd1854 to your computer and use it in GitHub Desktop.
Load web pages from a list of urls and save as a single pdf file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Read a list of URLs, load each page and save them all as a single PDF.

The file 'urls.txt' (one URL per line) must be present in the directory
the script is run from. After running, the merged result is written to
'generated.pdf' in that same directory.
"""
from weasyprint import HTML | |
# cli: pip3 install weasyprint | |
from PyPDF2 import PdfMerger | |
# cli: pip3 install PyPDF2 | |
from io import BytesIO | |
def run():
    """Render every URL listed in ``urls.txt`` and merge them into ``generated.pdf``.

    Each non-blank line of ``urls.txt`` (opened relative to the current
    working directory) is passed to ``pdfify``; the resulting PDFs are
    appended to a single merger in file order and written to
    ``generated.pdf``. Progress is reported on stdout.

    Raises:
        OSError: if ``urls.txt`` cannot be opened or ``generated.pdf``
            cannot be written.
        Any exception raised by ``pdfify`` or the merger propagates
        unchanged; files and the merger are still closed in that case.
    """
    merger = PdfMerger(strict=False)
    print("running...")
    try:
        with open("urls.txt") as urls:
            for line in urls:
                url = line.strip()
                if not url:
                    # A blank line (e.g. a trailing newline at the end of
                    # the list) would otherwise be rendered as an empty URL.
                    continue
                merger.append(pdfify(url))
                print(f'processed {url}')
        # The context manager guarantees the output is flushed and closed,
        # which the original never did explicitly.
        with open("generated.pdf", "wb") as output:
            merger.write(output)
    finally:
        # Release the in-memory page streams held by the merger.
        merger.close()
    print("ready!")
def pdfify(url):
    """Render the document at *url* to PDF and return it as an in-memory stream.

    Args:
        url: Value handed to weasyprint's ``HTML`` as its first positional
            argument (the caller passes one line from ``urls.txt``).

    Returns:
        io.BytesIO: A binary file object holding the rendered PDF bytes,
        positioned at the start of the data.
    """
    pdfout = HTML(url).write_pdf()  # bytes
    # Building the buffer from the bytes leaves the read position at 0;
    # writing into an empty buffer would leave it at end-of-stream, so a
    # consumer that reads from the current position would see no data.
    return BytesIO(pdfout)
# Only start the conversion when executed as a script, so importing this
# module (e.g. to reuse pdfify) has no side effects.
if __name__ == "__main__":
    run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment