-
-
Save apetenchea/4df556a49f9a2543be877c31355b4164 to your computer and use it in GitHub Desktop.
# This script gathers all the pages of a manual and merges them into a PDF.
# You'll need to play a bit with inspect-element in order to figure out the correct URL format,
# but it should be easy to adapt it to any manual.
# This script is specifically for https://www.manua.ls/audi/q3-2018/manual.
# Their URL format is https://www.manua.ls/viewer/{manual-id}/{page-number}/bg{page-number-hex}.png
# Example: https://www.manua.ls/viewer/668006/100/bg64.png
# Enjoy!
import requests | |
from tqdm import tqdm | |
from PIL import Image | |
from io import BytesIO | |
from reportlab.pdfgen import canvas | |
from reportlab.lib.pagesizes import letter | |
from reportlab.lib.utils import ImageReader | |
def download_image(url, timeout=30):
    """Download an image and return it as a PIL ``Image``.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The opened ``PIL.Image.Image``, or ``None`` when the request fails,
        the server answers with a status other than 200, or the payload
        cannot be opened as an image.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # A single unreachable page should not abort the whole download run.
        print(f"Failed to download {url}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to download {url}")
        return None

    try:
        return Image.open(BytesIO(response.content))
    except OSError as exc:
        # PIL raises an OSError subclass when the bytes are not a valid image
        # (for example an HTML error page served with status 200).
        print(f"Failed to download {url}: {exc}")
        return None
def save_images_as_pdf(images, pdf_filename):
    """Write each image onto its own letter-sized page of a new PDF.

    Every image is scaled, keeping its aspect ratio, to the largest size that
    fits on the page and is anchored to the page's top-left corner.

    Args:
        images: Iterable of PIL images, one per output page.
        pdf_filename: Path of the PDF file to create.
    """
    page_width, page_height = letter
    pdf = canvas.Canvas(pdf_filename, pagesize=letter)

    for picture in images:
        px_width, px_height = picture.size
        ratio = px_width / px_height

        # Fill the page width by default; if that would make the image taller
        # than the page, fill the page height instead.
        if page_width / ratio > page_height:
            draw_width, draw_height = page_height * ratio, page_height
        else:
            draw_width, draw_height = page_width, page_width / ratio

        # Hand the picture to reportlab as an in-memory PNG stream.
        png_stream = BytesIO()
        picture.save(png_stream, format='PNG')
        png_stream.seek(0)

        # PDF coordinates start at the bottom-left, so the y offset below
        # places the image's top edge on the top edge of the page.
        pdf.drawImage(
            ImageReader(png_stream),
            0,
            page_height - draw_height,
            width=draw_width,
            height=draw_height,
        )
        pdf.showPage()

    pdf.save()
def main():
    """Download pages 1-230 of manual 668006 and merge them into output.pdf."""
    base_url = "https://www.manua.ls/viewer/668006/"
    page_numbers = range(1, 231)  # Adjust the range as needed

    images = []
    for i in tqdm(page_numbers):
        # The file name repeats the page number in hexadecimal,
        # e.g. page 100 -> bg64.png.
        url = f"{base_url}{i}/bg{hex(i)[2:]}.png"
        image = download_image(url)
        if image:
            images.append(image)

    # Nothing to merge: report it and stop before touching the output file.
    if not images:
        print("No images downloaded")
        return

    save_images_as_pdf(images, "output.pdf")
    print("PDF created successfully")


if __name__ == "__main__":
    main()
# Use this script for webp manuals
# Example: https://www.manua.ls/growatt/min-3000-11400tl-xh-us/manual?p=1
# pip install selenium webdriver-manager pillow tqdm
# By default Firefox is used, but it's easy to adapt to Chrome; see below
""" | |
from selenium.webdriver.chrome.options import Options | |
from selenium.webdriver.chrome.service import Service | |
from webdriver_manager.chrome import ChromeDriverManager | |
options = Options() | |
options.headless = True | |
options.add_argument("--window-size=1200,1600") | |
driver = webdriver.Chrome(options=options) | |
""" | |
from selenium import webdriver | |
from selenium.webdriver.firefox.options import Options | |
from selenium.webdriver.firefox.service import Service | |
from webdriver_manager.firefox import GeckoDriverManager | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
from PIL import Image | |
from tqdm import tqdm | |
import io | |
import time | |
def get_screenshot(driver, url, consent):
    """Load *url* and return a screenshot of the manual viewer as a PIL image.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the manual page to capture.
        consent: When true, wait up to 3 seconds for the button labelled
            "Consent" and click it. Otherwise pause for one second to give
            the page's JavaScript time to load its elements.

    Returns:
        A ``PIL.Image.Image`` decoded from the PNG screenshot of the element
        whose id is ``viewer``.
    """
    # Local import keeps this function self-contained; selenium is already a
    # dependency of this script.
    from selenium.common.exceptions import WebDriverException

    driver.get(url)

    # Wait for consent and give time for JS to load elements
    if consent:
        try:
            consent_button = WebDriverWait(driver, 3).until(
                EC.element_to_be_clickable((By.XPATH, '//button[@aria-label="Consent"]'))
            )
            consent_button.click()
        except WebDriverException:
            # Best effort: the banner may never show up (timeout) or may not
            # be clickable. The page can still be captured, so carry on, but
            # do not swallow KeyboardInterrupt or unrelated errors.
            pass
    else:
        time.sleep(1)

    viewer_div = driver.find_element(By.ID, "viewer")

    # Save screenshot of just one element
    png = viewer_div.screenshot_as_png

    # Optional, intermediary save step
    # viewer_div.screenshot(f"{url[-1]}.png")

    return Image.open(io.BytesIO(png))
def main():
    """Screenshot every manual page in headless Firefox and save output.pdf.

    Pages 1-81 of the manual are opened one by one, the viewer element of
    each is captured, and the captures are written as a multi-page PDF.
    """
    options = Options()
    # The ``options.headless = True`` setter was deprecated and later removed
    # in Selenium 4 (4.13+), where assigning it has no effect. Passing the
    # command-line flag works regardless of the Selenium version.
    options.add_argument("-headless")
    # Scale factor between CSS pixels and device pixels; presumably raised to
    # get sharper screenshots.
    options.set_preference("layout.css.devPixelsPerPx", "1.5")
    driver = webdriver.Firefox(options=options)

    base_url = "https://www.manua.ls"
    images = []

    try:
        for i in tqdm(range(1, 82)):  # number of pages 81
            url = f"{base_url}/growatt/min-3000-11400tl-xh-us/manual?p={i}"  # manual name may differ
            # The consent button is only handled on the first page.
            img = get_screenshot(driver, url, consent=(i == 1))
            if img:
                images.append(img)
    finally:
        # Always shut the browser down, even if a page fails to load. This is
        # the only quit() needed; a second call after this block is redundant.
        driver.quit()

    if images:
        images[0].save("output.pdf", save_all=True, append_images=images[1:])
        print("PDF created successfully")
    else:
        print("No screenshots taken")


if __name__ == "__main__":
    main()
it seems they may have gotten more clever recently. the images download fine, but any text elements on the page are now part of separate div classes and so are not downloaded.
an example with lots of text elements: https://www.manua.ls/growatt/min-3000-11400tl-xh-us/manual?p=44
Another trick is to use https://www.manualslib.com/ instead which allows PDF downloading ;)
They don't have this 81-page manual.
Hmm, oh yeah I didn't check for your specific case. I just solved mine 30min ago and wanted to share the tip but unfortunate for you :/
it seems they may have gotten more clever recently. the images download fine, but any text elements on the page are now part of separate div classes and so are not downloaded.
an example with lots of text elements: https://www.manua.ls/growatt/min-3000-11400tl-xh-us/manual?p=44
Apparently they can do that for webp manuals. The good news is that the page number in the URL simply increases like a counter: https://www.manua.ls/growatt/min-3000-11400tl-xh-us/manual?p=50, then ?p=51, and so on.
The quick and dirty option is to use a webdriver (e.g. Selenium) and take screenshots of the "viewer" element. That would probably take some time, so you might want to grab a coffee or let it run overnight while your computer does the work. I estimate about 30 min for this 80-page manual.
I'll look into that, thanks.
I played a bit with https://gist.github.com/apetenchea/4df556a49f9a2543be877c31355b4164#file-webp-manuals-py
It should do the job, if you have the patience to wait for it.
Thanks, I'll mess with that.
This is great. I made two changes:
viewer_div = driver.find_element(By.CLASS_NAME, "viewer-page")
Doing it this way removes the viewer UI (the arrows still show; I know Selenium can hide elements, but this is good enough for what I need),
and
options.add_argument("--headless")
The other headless method works for Chrome but not Firefox.
Works beautifully. Thank you.