Skip to content

Instantly share code, notes, and snippets.

@qgeissmann
Created November 23, 2020 13:43
Show Gist options
  • Save qgeissmann/a59104ceba9384712c8ed138a73de8da to your computer and use it in GitHub Desktop.
Save qgeissmann/a59104ceba9384712c8ed138a73de8da to your computer and use it in GitHub Desktop.
Batch-download Guardian quick crosswords for a date range and merge them into a single PDF.
import os
import datetime
import calendar
import requests
import tempfile
import atexit
import shutil
import PyPDF2
# --- Configuration and one-time setup ---------------------------------

# Date range of crosswords to fetch.
START_DATETIME = datetime.date(2020, 1, 1)
END_DATETIME = datetime.date(2020, 6, 1)

# Directory that receives the merged PDF. The assignment previously had no
# right-hand side (a syntax error); it now defaults to the current working
# directory. Edit this value to write somewhere else.
TARGET_DIR = os.curdir

# Full path of the merged output file, named after the requested date range.
target = os.path.join(
    TARGET_DIR,
    "x_words_%s_%s.pdf" % (str(START_DATETIME), str(END_DATETIME)),
)
print(target)

# URL pattern of a single crossword PDF; "%s" receives the date as YYYYMMDD.
template_url = 'https://crosswords-static.guim.co.uk/gdn.quick.%s.pdf'

# Cursor date used by the download loop.
s = START_DATETIME

# Scratch directory for the individual PDFs; removed at interpreter exit.
tmp_dir = tempfile.mkdtemp(prefix='xwords')
atexit.register(shutil.rmtree, tmp_dir)

# Paths of the successfully downloaded PDFs, in download order.
all_paths = []
# Download one PDF per day from START_DATETIME through END_DATETIME, both
# inclusive. The previous loop advanced the cursor *before* using it, so the
# start date itself was never fetched while the end date was.
for day_offset in range((END_DATETIME - START_DATETIME).days + 1):
    s = START_DATETIME + datetime.timedelta(days=day_offset)

    # date.weekday(): Monday == 0 ... Sunday == 6, so this skips Sundays
    # (presumably no quick crossword is published then -- TODO confirm).
    if s.weekday() >= 6:
        continue

    url = template_url % s.strftime("%Y%m%d")
    try:
        # A timeout keeps one stalled connection from hanging the whole batch.
        r = requests.get(url, timeout=30)
    except requests.RequestException as err:
        # Report the failure and carry on with the remaining days.
        print(s, err)
        continue

    if r.status_code != 200:
        # Typically a missing puzzle for that day; show the response and move on.
        print(s, r)
        continue

    # Store the PDF in the scratch directory under its remote file name.
    path = os.path.join(tmp_dir, os.path.basename(url))
    with open(path, 'wb') as f:
        f.write(r.content)
    print(s, 'OK')
    all_paths.append(path)
# Concatenate every downloaded PDF, in download order, into the target file.
if not all_paths:
    # Nothing was fetched; avoid writing a page-less PDF.
    print("No crosswords downloaded; nothing to merge")
else:
    # NOTE(review): newer PyPDF2 releases rename this class to PdfMerger --
    # confirm against the installed version.
    merger = PyPDF2.PdfFileMerger()
    try:
        for pdf in all_paths:
            # Passing the path (rather than an anonymous open() handle) lets
            # the merger own the file so merger.close() can release it.
            merger.append(pdf)
        with open(target, "wb") as fout:
            merger.write(fout)
    finally:
        # Close all input handles even if appending or writing failed.
        merger.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment