Skip to content

Instantly share code, notes, and snippets.

@fracek
Created July 23, 2014 20:36
Show Gist options
  • Save fracek/106c60e99902ae97e247 to your computer and use it in GitHub Desktop.
Save fracek/106c60e99902ae97e247 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
# Fetch Bartezzaghi crosswords
import datetime
import os
import requests
import sys
import xml.etree.ElementTree as ET
# URL template for a puzzle script: {0} is filled with the four-digit year
# and {1} with the date rendered through DATE_FORMAT (see url_for_day).
URL = 'http://www.repubblica.it/static/rubriche/ilcruciverba/{0}/{1}.js'
# Day, month and four-digit year with no separators, e.g. 19072014.
DATE_FORMAT = '%d%m%Y'
# Marks printed after each fetch attempt: success and failure respectively.
OKAY = '✔'
NOPE = '✘'
# Step subtracted from the date on every iteration of n_xwords_starting.
ONE_WEEK = datetime.timedelta(days=7)
# parse_js drops as many leading characters as this prefix is long.
JS_TOKEN = 'var CrosswordPuzzleData = \"'
def str_to_datetime(s):
    """Parse the string *s*, laid out as ``DATE_FORMAT``, into a datetime.

    Raises whatever ``datetime.datetime.strptime`` raises when *s* does not
    match the format.
    """
    parsed = datetime.datetime.strptime(s, DATE_FORMAT)
    return parsed
def datetime_to_str(dt):
    """Render *dt* as text using the module-wide ``DATE_FORMAT`` layout."""
    rendered = dt.strftime(DATE_FORMAT)
    return rendered
def url_for_day(dt):
    """Build the URL of the puzzle script for the date *dt*.

    The ``URL`` template receives the four-digit year as its first field and
    the ``DATE_FORMAT`` rendering of *dt* as its second.
    """
    return URL.format(dt.strftime('%Y'), datetime_to_str(dt))
def n_xwords_starting(n, dt):
    """Yield ``(url, date)`` pairs for *n* puzzles, walking back from *dt*.

    The first pair is for *dt* itself; each following pair is one week
    (``ONE_WEEK``) earlier than the previous one.

    Args:
        n: Number of pairs to produce. Zero or a negative value yields nothing.
        dt: Date of the first (most recent) puzzle.

    Yields:
        Tuples of the puzzle URL and the date it belongs to.
    """
    # range(n), not range(1, n): the latter stops one short and would
    # produce only n - 1 pairs.
    for _ in range(n):
        yield (url_for_day(dt), dt)
        dt -= ONE_WEEK
def parse_js(s):
    """Return the quoted payload carried by the downloaded script text *s*.

    The first ``len(JS_TOKEN)`` characters and the last two characters are
    discarded without checking their content, then every backslash-escaped
    double quote in what remains is replaced by a plain double quote.
    """
    prefix_length = len(JS_TOKEN)
    payload = s[prefix_length:-2]
    unescaped = payload.replace("\\\"", "\"")
    return unescaped
def fetch_js(url):
    """Issue a GET request for *url* and return the raw response body.

    Returns ``None`` when the response status code is anything other than
    200. Exceptions raised by ``requests.get`` are not caught here.
    """
    response = requests.get(url)
    if response.status_code != 200:
        return None
    return response.content
# Date of the newest puzzle the script starts from before walking backwards.
LAST_KNOWN = str_to_datetime('19072014')


def main(argv):
    """Download puzzles and store each one as an XML file.

    Args:
        argv: Command-line vector; ``argv[1]`` must be the output directory.
            The directory is created when it does not exist yet.

    Exits with status 1 after printing a usage line when the argument count
    is wrong. A puzzle whose download fails is reported and skipped.
    """
    if len(argv) != 2:
        print('Usage: {} out_dir'.format(argv[0]))
        sys.exit(1)

    out_dir = argv[1]
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    for url, day in n_xwords_starting(15, LAST_KNOWN):
        # '%d-%m-%Y' replaces the mistyped '%d-%m%-Y', which relied on a
        # non-portable strftime extension and dropped a separator.
        print('Crossword for day {}'.format(day.strftime('%d-%m-%Y')))
        # Written without a newline so the status mark lands on the same
        # line; flushed so it is visible while the request is in flight.
        sys.stdout.write(' Fetching  ')
        sys.stdout.flush()

        js = fetch_js(url)
        if js is None:
            print(NOPE)
            continue
        print(OKAY)

        root = ET.fromstring(parse_js(js))
        # Write into the directory given on the command line rather than a
        # hard-coded 'xwords/' folder that may not exist.
        out = os.path.abspath(
            os.path.join(out_dir, datetime_to_str(day) + '.xml'))
        ET.ElementTree(root).write(out)


if __name__ == "__main__":
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment