Skip to content

Instantly share code, notes, and snippets.

@Dragost
Last active August 17, 2024 13:14
Show Gist options
  • Save Dragost/4b58cc53de8ebca75a8fbef87c43c0ba to your computer and use it in GitHub Desktop.
Save Dragost/4b58cc53de8ebca75a8fbef87c43c0ba to your computer and use it in GitHub Desktop.
PDF Manual Downloader Script <manualpdf.es>
import os
import re
import shutil
import jinja2
import requests
import progressbar
from functools import partial
from pypdf import PdfMerger
from multiprocessing import Pool
from InquirerPy import prompt
from pyhtml2pdf import converter
# Name of the working folder (resolved against the current directory by the
# script) where the per-page HTML and PDF files are written.
TEMP_FOLDER = 'temp'
# Jinja2 template for one printable page. It is rendered with `file_id`,
# `page` and `content`; the stylesheets are loaded from www.manualpdf.es.
PRINT_TEMPLATE = """<html><head><link rel="stylesheet" href="https://www.manualpdf.es/css/base.css"><link rel="stylesheet" href="https://www.manualpdf.es/viewer/{{file_id}}/{{page}}/page.css"></head><body><a name="{{page}}"></a><div class="viewer-page"><div style="transform:scale(1.95309)" class="page-{{page}} pf w0 h0">{{content}}</div></div></body></html>"""
def create_folder_if_not_exists(folder: str) -> None:
    """Create ``folder`` (including missing parents) unless it already exists.

    Args:
        folder (str): Path of the directory to create.

    Raises:
        FileExistsError: If ``folder`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of
    # `if not os.path.exists(...): os.makedirs(...)`, where another process
    # could create the directory between the two calls.
    os.makedirs(folder, exist_ok=True)
def sanitize(input_string: str) -> str:
    """
    Turn a string into a token containing only word characters.

    Every '/' becomes '_' first; afterwards every character that is
    neither a word character nor an underscore is dropped.

    Args:
        input_string (str): The string to be sanitized.

    Returns:
        str: The sanitized string.
    """
    # Splitting on '/' and re-joining with '_' swaps the separators.
    underscored = '_'.join(input_string.split('/'))
    # Strip whatever is left that is not alphanumeric or an underscore.
    return re.sub(r'[^\w_]', '', underscored)
def get_manual_url() -> str:
    """Ask the user for the manual url and return it without fragment or query.

    Returns:
        str: The entered url, cut at the first '#' and then at the first '?'.
    """
    answers = prompt([{
        'type': 'input',
        'name': 'url',
        'message': 'Enter Manual PDF url:',
    }])
    entered = answers.get('url')
    # Drop the "#fragment" part first, then the "?query" part.
    without_fragment = entered.partition('#')[0]
    return without_fragment.partition('?')[0]
def get_data(url: str) -> dict:
    """Download the manual page at ``url`` and extract its metadata.

    Args:
        url (str): Address of the manual's landing page.

    Returns:
        dict: ``file_id`` (str, taken from the viewer path in the html),
        ``title`` (str, text of the ``<title>`` tag before the final
        parenthesised group) and ``total_pages`` (int, the number found
        inside that parenthesised group).

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
        ValueError: If the html does not contain the expected viewer path
            or ``<title>`` layout.
    """
    # A timeout keeps the script from hanging forever on a stalled
    # connection; raise_for_status stops us from parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    html = response.text

    id_match = re.search(r'viewer/([\d/]+)/1/bg1', html)
    if id_match is None:
        raise ValueError(f'viewer file id not found in {url}')

    title_match = re.search(r'<title>(.*)\(.*?(\d+).*?\)</title>', html)
    if title_match is None:
        raise ValueError(f'title / page count not found in {url}')

    return dict(file_id=id_match.group(1),
                title=title_match.group(1).strip(),
                total_pages=int(title_match.group(2)))
def get_html_page(file_id: str, p: int) -> str:
    """Download the html fragment of page ``p`` from manualpdf.es.

    Args:
        file_id (str): Viewer file id of the manual.
        p (int): Page number to download.

    Returns:
        str: The page markup with every ``src="`` attribute prefixed by the
        page's viewer url, so relative resources resolve when the markup is
        rendered from a local file.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
    """
    url_page = f"https://www.manualpdf.es/viewer/{file_id}/{p}/"
    # The url returns a file; download it and read it. The timeout avoids
    # hanging on a stalled connection, and raise_for_status prevents an
    # HTTP error body from being rendered (and later skipped) as a page.
    response = requests.get(f"{url_page}page-{p}.page", timeout=30)
    response.raise_for_status()
    # Replace relative links with absolute links
    return response.text.replace('src="', f'src="{url_page}')
def generate_page(file_id: str, page: int, content: str, path: str,
                  landscape: bool):
    """Render one page to an html file in ``path`` and convert it to PDF.

    Args:
        file_id (str): Viewer file id of the manual.
        page (int): Page number, used in the template and the file name.
        content (str): Page markup inserted into the template.
        path (str): Folder where the html (and resulting pdf) is written.
        landscape (bool): Forwarded to ``generate_pdf``.
    """
    rendered = jinja2.Template(PRINT_TEMPLATE).render(file_id=file_id,
                                                      page=page,
                                                      content=content)
    # Zero-padded page number keeps the files in page order when sorted.
    html_name = f'{sanitize(file_id)}_{page:04}.html'
    html_path = path + '/' + html_name
    with open(html_path, 'w', encoding='utf-8') as html_file:
        html_file.write(rendered)
    generate_pdf(path, html_name, landscape)
def generate_pdf(path: str, file_name: str, landscape: bool = False):
    """Convert the html file ``path/file_name`` into a PDF next to it.

    The PDF keeps the html file's base name with a ``.pdf`` extension.

    Args:
        path (str): Folder containing the html file; the pdf is written here.
        file_name (str): Name of the html file inside ``path``.
        landscape (bool): Value of the ``landscape`` print option.
    """
    from pathlib import Path

    apath = os.path.abspath(path + '/' + file_name)
    # Path.as_uri() builds a well-formed file:// url: exactly three slashes
    # on POSIX and percent-encoding for spaces, '#', '?' and similar
    # characters, none of which the hand-built f'file:///{apath}' handled.
    source_url = Path(apath).as_uri()
    # splitext only strips the final extension, so a name containing other
    # dots is not truncated at the first one.
    out_name = os.path.splitext(file_name)[0] + '.pdf'
    # Print options handed to the converter as-is.
    print_opt = {"pageRanges": "1", "landscape": landscape}
    converter.convert(source_url,
                      path + '/' + out_name,
                      print_options=print_opt)
def join_pdf_pages(path: str, file_id: str, title: str, out_path: str):
    """Merge every ``.pdf`` file found in ``path`` into a single PDF.

    Files are appended in lexicographic order of their names.

    Args:
        path (str): Folder containing the single-page pdf files.
        file_id (str): Viewer file id, sanitized and used as name prefix.
        title (str): Manual title; punctuation is removed and spaces are
            replaced by underscores to build the file name.
        out_path (str): Folder where the merged pdf is written.

    Returns:
        str: Path of the merged pdf file.
    """
    pdfs = sorted(path + '/' + f for f in os.listdir(path)
                  if f.endswith('.pdf'))
    title = re.sub(r'[^\w\s]', '', title).replace(' ', '_')
    out_file_path = out_path + '/' + f'{sanitize(file_id)}_{title}.pdf'

    merger = PdfMerger()
    try:
        for pdf in pdfs:
            merger.append(pdf)
        merger.write(out_file_path)
    finally:
        # Always release the merger's resources, even when appending or
        # writing fails part-way through.
        merger.close()
    return out_file_path
def delete_temp_folder():
    """Ask for confirmation and, if given, remove the temp folder tree."""
    confirmation = prompt([{
        'type': 'confirm',
        'name': 'ok',
        'message': '¿All ok? Delete temp folder?',
        'default': True
    }])
    # Keep the temp files unless the user explicitly agreed.
    if not confirmation.get('ok'):
        return
    shutil.rmtree(TEMP_FOLDER)
def process_page(file_id: str, page: int, wpath: str, landscape: bool):
    """Fetch one page and turn it into a PDF inside ``wpath``.

    Returns:
        int: The page number that was processed.
    """
    generate_page(file_id, page, get_html_page(file_id, page), wpath,
                  landscape)
    return page
if __name__ == '__main__':
    # Interactive entry point: ask for a manual url, download every page as
    # a single-page PDF into the temp folder, merge the pages into one PDF
    # in ./output, open it and optionally delete the temp folder.

    # Only the script entry point needs these, so they are imported here.
    import subprocess
    import sys

    # Create temp folder if not exists
    wpath = os.path.abspath(TEMP_FOLDER)
    create_folder_if_not_exists(wpath)

    # Enter url
    url = get_manual_url()

    # Get data from url
    try:
        pdf_data = get_data(url)
        file_id = pdf_data['file_id']
    except Exception:
        print('Error: pdf data not found')
        # Non-zero status so callers can tell the run failed.
        sys.exit(1)
    total_pages = pdf_data['total_pages']

    # Ask continue downloading file
    print(f'{pdf_data["title"]} with {total_pages} pages')
    continue_question = [{
        'type': 'confirm',
        'name': 'continue',
        'message': 'Continue downloading file?',
        'default': True
    }]
    continue_answer = prompt(continue_question)
    if not continue_answer.get('continue'):
        sys.exit(0)

    # Create file_id folder
    wpath = wpath + f'/{sanitize(file_id)}'
    create_folder_if_not_exists(wpath)

    # Files in temp folder, used to skip already downloaded pages
    # (a set makes the per-page membership test O(1)).
    generated_files = {f for f in os.listdir(wpath) if f.endswith('.pdf')}

    # Ask for landscape
    landscape_question = [{
        'type': 'confirm',
        'name': 'landscape',
        'message': '¿Landscape?',
        'default': False
    }]
    landscape_answer = prompt(landscape_question)
    landscape = landscape_answer.get('landscape')

    # Ask for multiprocessing
    multiprocessing_question = [{
        'type': 'confirm',
        'name': 'multiprocessing',
        'message': '¿Multiprocessing?',
        'default': True
    }]
    multiprocessing_answer = prompt(multiprocessing_question)

    if multiprocessing_answer.get('multiprocessing'):
        # Use multiprocessing to download and process pages in parallel
        pages_to_process = [
            page for page in range(1, total_pages + 1)
            if f'{sanitize(file_id)}_{page:04}.pdf' not in generated_files
        ]
        with progressbar.ProgressBar(max_value=len(pages_to_process)) as bar:
            bar.update(0)
            with Pool() as pool:
                worker = partial(process_page,
                                 file_id,
                                 wpath=wpath,
                                 landscape=landscape)
                for done, _ in enumerate(
                        pool.imap_unordered(worker, pages_to_process), 1):
                    bar.update(done)
    else:
        with progressbar.ProgressBar(max_value=total_pages) as bar:
            bar.update(0)
            for page in range(1, total_pages + 1):
                # Only generate the page if its pdf does not exist yet
                if f'{sanitize(file_id)}_{page:04}.pdf' not in generated_files:
                    generate_page(file_id, page,
                                  get_html_page(file_id, page), wpath,
                                  landscape)
                # Report the number of pages handled so far; the previous
                # `page - 1` left the bar one step behind.
                bar.update(page)

    # Join all pdf pages in a single pdf file
    out_path = os.path.abspath('output')
    create_folder_if_not_exists(out_path)
    out_file = join_pdf_pages(wpath, file_id, pdf_data['title'], out_path)

    # Open pdf file with the platform's default viewer. The path is passed
    # as an argument list (no shell), so spaces or shell metacharacters in
    # the path cannot break or alter the command.
    try:
        if sys.platform == 'darwin':
            subprocess.run(['open', out_file], check=False)
        elif os.name == 'nt':
            os.startfile(out_file)
        else:
            subprocess.run(['xdg-open', out_file], check=False)
    except OSError as exc:
        # Opening the viewer is best-effort; the pdf itself was written.
        print(f'Could not open {out_file}: {exc}')

    # Delete temp folder if ok
    delete_temp_folder()

PDF Manual Downloader Script

This script is designed to download PDF manuals from wdhmedia websites. It can be run on any operating system that supports Python, maybe.

This is not the most correct or fastest way to download PDFs, but it works.

Tested with python 3.10 and Poetry.

Websites

Install

poetry install

Usage

❯ poetry run python main.py
? Enter Manual PDF url: https://www.manualpdf.es/ikea/renodlad/manual
Manual Ikea RENODLAD with 28 pages
? Continue downloading file? Yes
100% (28 of 28) |############################################| Elapsed Time: 0:00:00 Time:  0:00:00
? ¿All ok? Delete temp folder? Yes

Output

The downloaded PDF manuals will be saved in the `output` folder.

License

pdf_manual_downloader is licensed under the GNU General Public License version 3.0.

[tool.poetry]
name = "pdf_manual_downloader"
version = "1.0.0"
description = "Descargador de PDFs de los chicos del maíz"
authors = ["Alberto <[email protected]>"]
license = "GPL"
[tool.poetry.dependencies]
python = "^3.10"
requests = "^2.28.2"
lxml = "^4.9.2"
inquirerpy = "^0.3.4"
jinja2 = "^3.1.2"
progressbar2 = "^4.2.0"
pypdf = "^3.5.0"
bs4 = "^0.0.1"
pyhtml2pdf = "^0.0.7"
@iiAlphaWolf
Copy link

Hello, I tried to follow every hint I could find, but I keep getting errors on line 233. Please can you help me?

As far as I know, I did exactly what @sylven said as well.

############################

ralph@ubuntu:~/Desktop/manuals$ poetry run python main.py ? Enter Manual PDF url: https://www.manua.ls/toyota/paseo-1992/manual User manual Toyota Paseo (1992) with 1061 pages ? Continue downloading file? Yes ? ¿Landscape? Yes ? ¿Multiprocessing? No 0% (1 of 1061) | | Elapsed Time: 0:00:07 ETA: 2:14:57 Traceback (most recent call last): File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/connectionpool.py", line 789, in urlopen response = self._make_request( ^^^^^^^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/connectionpool.py", line 536, in _make_request response = conn.getresponse() ^^^^^^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/connection.py", line 464, in getresponse httplib_response = super().getresponse() ^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/http/client.py", line 1428, in getresponse response.begin() File "/usr/lib/python3.12/http/client.py", line 331, in begin version, status, reason = self._read_status() ^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/http/client.py", line 300, in _read_status raise RemoteDisconnected("Remote end closed connection without" http.client.RemoteDisconnected: Remote end closed connection without response

During handling of the above exception, another exception occurred:

Traceback (most recent call last): File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/requests/adapters.py", line 667, in send resp = conn.urlopen( ^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/connectionpool.py", line 843, in urlopen retries = retries.increment( ^^^^^^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/util/retry.py", line 474, in increment raise reraise(type(error), error, _stacktrace) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/util/util.py", line 38, in reraise raise value.with_traceback(tb) File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/connectionpool.py", line 789, in urlopen response = self._make_request( ^^^^^^^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/connectionpool.py", line 536, in _make_request response = conn.getresponse() ^^^^^^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/urllib3/connection.py", line 464, in getresponse httplib_response = super().getresponse() ^^^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/http/client.py", line 1428, in getresponse response.begin() File "/usr/lib/python3.12/http/client.py", line 331, in begin version, status, reason = self._read_status() ^^^^^^^^^^^^^^^^^^^ File "/usr/lib/python3.12/http/client.py", line 300, in _read_status raise RemoteDisconnected("Remote end closed connection without" urllib3.exceptions.ProtocolError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

During handling of the above exception, another exception occurred:

Traceback (most recent call last): File "/home/ralph/Desktop/manuals/main.py", line 223, in generate_page(file_id, page, get_html_page(file_id, page), ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ralph/Desktop/manuals/main.py", line 70, in get_html_page content = requests.get(f"{url_page}page-{p}.page").text ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/requests/api.py", line 73, in get return request("get", url, params=params, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/requests/api.py", line 59, in request return session.request(method=method, url=url, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/requests/sessions.py", line 589, in request resp = self.send(prep, **send_kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/requests/sessions.py", line 703, in send r = adapter.send(request, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/ralph/.cache/pypoetry/virtualenvs/pdf-manual-downloader-OBDX_1Vg-py3.12/lib/python3.12/site-packages/requests/adapters.py", line 682, in send raise ConnectionError(err, request=request) requests.exceptions.ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

I got it working by changing the command to python3 instead of python:

poetry run python3 main.py

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment