-
-
Save rjw57/b9fbbd173d22aca42a80 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3 | |
# | |
# THIS SCRIPT REQUIRES PYTHON 3 | |
# | |
# Install requirements via: | |
# pip3 install docopt pillow reportlab | |
# | |
# Dedicated to the public domain where possible. | |
# See: https://creativecommons.org/publicdomain/zero/1.0/ | |
"""
Download a pocketmags magazine in PDF format from the HTML5 reader.

Usage:
    pmdown.py (-h | --help)
    pmdown.py [options] <pdf> <url>

Options:
    -h, --help      Print brief usage summary.
    --dpi=DPI       Set image resolution in dots per inch.
                    [default: 150]

    <pdf>           Save output to this file.
    <url>           A URL to one image from the magazine.

Notes:
    PLEASE USE THIS SCRIPT RESPONSIBLY. THE MAGAZINE PUBLISHING INDUSTRY RELIES
    HEAVILY ON INCOME FROM SALES WITH VERY SLIM PROFIT MARGINS.

    URLs for pocketmag images can be found by using the HTML 5 reader and
    right-clicking on a page and selecting "inspect element". Look for URLs of
    the form:

        http://magazines.magazineclonercdn.com/<uuid1>/<uuid2>/high/<num>.jpg

    where <uuid{1,2}> are strings of letters and numbers with dashes separating
    them and <num> is some 4-digit number.
"""
import itertools | |
import re | |
from contextlib import contextmanager | |
from urllib.error import HTTPError | |
from urllib.parse import urlparse, urlunparse | |
from urllib.request import urlopen | |
import docopt | |
from PIL import Image | |
from reportlab.pdfgen import canvas | |
from reportlab.lib.units import inch | |
# The pattern of the URL path for a magazine page image. The named group
# "prefix" captures everything up to and including "/high/", so that other
# page numbers can be appended to it. The dot before "jpg" is escaped so that
# it only matches a literal "." rather than any character.
URL_PATH_PATTERN = re.compile(r'^(?P<prefix>[a-f0-9\-/]*/high/)[0-9]{4}\.jpg')
@contextmanager
def saving(thing):
    """Context manager which ensures save() is called on thing.

    Yields *thing* unchanged as the ``with`` target. ``thing.save()`` is
    called when the block exits, whether it finished normally or raised; an
    exception raised inside the block still propagates after the save.
    """
    try:
        yield thing
    finally:
        # Runs on both the success and the error path.
        thing.save()
def main():
    """Download consecutive page images and write them out as one PDF.

    Command-line arguments are parsed by docopt from the module docstring.
    Starting at page 0, the four-digit page number in the given image URL is
    replaced by 0000, 0001, ... and each image is fetched and placed on its
    own PDF page, sized from the image's pixel dimensions and ``--dpi``.
    Downloading stops at the first page for which the server answers 404.

    Raises:
        ValueError: if ``--dpi`` is not a positive, finite number.
        RuntimeError: if the URL path does not match URL_PATH_PATTERN.
        urllib.error.HTTPError: for any HTTP error other than 404.
    """
    opts = docopt.docopt(__doc__)
    pdf_fn = opts['<pdf>']
    url = urlparse(opts['<url>'])

    dpi = float(opts['--dpi'])
    # Reject 0, negatives, NaN and infinity up front; otherwise the failure
    # (e.g. a division by zero) would only surface after the first download.
    if not 0 < dpi < float('inf'):
        raise ValueError('--dpi must be a positive, finite number')

    m = URL_PATH_PATTERN.match(url.path)
    if not m:
        raise RuntimeError('URL path does not match expected pattern')
    prefix = m.group('prefix')

    c = canvas.Canvas(pdf_fn)
    # saving() calls c.save() even if a download fails part-way through, so
    # the pages fetched so far are still written out.
    with saving(c):
        for page_num in itertools.count(0):
            # Swap only the path component; scheme, host, query etc. are
            # carried over from the URL the user supplied.
            page_path = '{}{:04d}.jpg'.format(prefix, page_num)
            page_url = urlunparse(url._replace(path=page_path))
            print('Downloading page {} from {}...'.format(page_num, page_url))

            try:
                with urlopen(page_url) as f:
                    im = Image.open(f)
                    # Force the pixel data to be read while the response is
                    # still open, so nothing depends on it after the block.
                    im.load()
            except HTTPError as e:
                if e.code == 404:
                    # The first missing page marks the end of the magazine.
                    print('No image found => stopping')
                    break
                raise

            # Physical page size in inches at the requested resolution.
            w, h = tuple(dim / dpi for dim in im.size)
            print('Image is {:.2f}in x {:.2f}in at {} DPI'.format(w, h, dpi))

            c.setPageSize((w*inch, h*inch))
            c.drawInlineImage(im, 0, 0, w*inch, h*inch)
            c.showPage()
# Run the downloader only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
So the default is currently "extralow" and we can change it to "mid", but does anyone know how to get the higher-quality JPG?
I know there is a higher quality available, but I tried "high" and "extrahigh" and each just gives an error page. Does anyone know the right directory name for the high-quality images?
I've tried everything I can think of and I can't get a better quality than "mid." It's a shame, because when you download the allowed 2 pages via Pocketmags, the quality is far superior.
So the default is currently "extralow" and we can change it to "mid", but does anyone know how to get the higher-quality JPG?
I know there is a higher quality available, but I tried "high" and "extrahigh" and each just gives an error page. Does anyone know the right directory name for the high-quality images? I've tried everything I can think of and I can't get a better quality than "mid." It's a shame, because when you download the allowed 2 pages via Pocketmags, the quality is far superior.
Perhaps the 2-page print is the solution🤔. My coding days were when BASIC was a new thing
and have progressed little since then, but isn't it possible to write code that iteratively prints two pages at a time until all are done? Then we could combine those into one PDF pretty easily, I'd have thought.
// Print the magazine two pages at a time through the reader's own print menu.
// Paste into the browser console while the HTML5 reader is open.
//
// Each pair is handled strictly in sequence. A plain `for` loop with
// `setTimeout` clicks the print menu for every pair at once and then fires
// all the delayed callbacks together at about 500 ms.
let numberOfPages = 71;

// Resolve after `ms` milliseconds so the loop can pause between UI steps.
const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

// Click the page thumbnail carrying the given `pagenum` attribute, if present.
const selectPage = (pageNumber) => {
  const matches = document.querySelectorAll('[pagenum="' + pageNumber + '"]');
  if (matches.length > 0) {
    matches[0].click();
  }
};

(async () => {
  for (let index = 0; index < numberOfPages; index += 2) {
    document.getElementById('print_menu').click();
    // Give the print menu time to render before selecting pages.
    await pause(500);

    selectPage(index + 1);
    if (index + 2 <= numberOfPages) {
      selectPage(index + 2);
    }

    document.getElementById('printPages').click();
    // NOTE(review): assumes 500 ms is enough for the reader to finish one
    // print request before the next pair starts; adjust if pairs are skipped.
    await pause(500);
  }
})();
I've modified this script to enable downloading of magazines in "high" quality and have created an option to add a magazine title to the generated PDF's metadata. I've published my new version in a separate GitHub repo as Gists don't seem to support pull requests. You can find it here: https://github.com/RichardJRL/pocketmagstopdf
The original author, rjw57, is welcome to include my changes in his Gist here if he wishes
I've now further modified the script to download the whole magazine at the same quality that the restricted 2-page print option on the website offers.
As before, I've published my modified version on my GitHub page: https://github.com/RichardJRL/pocketmagstopdf
Python neophyte here. I was able to find the various IDs and to get the latest script running, but after finding the last good page of the mag, the script terminates with ERROR - Unable to download magazine: HTTP error code 405. Any guidance would be appreciated.
Sorry, new to Github, too. This is in reference to pocketmagstopdf. If I need to post elsewhere, please let me know.
Never mind. I'll post to the Issues of that repository.
How can I get the uuid1 and uuid2 for the magazine, please?