Created
April 26, 2016 07:11
-
-
Save msgre/5e8d32e266bf8ff85a1063003d4796d5 to your computer and use it in GitHub Desktop.
Homeworks parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re | |
import sys | |
import requests | |
import simplejson | |
from datetime import date | |
from pyquery import PyQuery as pq | |
# Root of the class website; article paths found on it are appended to this.
SITE_URL = 'http://www.zerotinova4b.estranky.cz'

# Matches a period such as "12. 5. - 18. 5. 2014" inside an article title.
# Only the end of the range carries a year.
TERM_RE = re.compile(
    r'(?P<day_from>\d+)\s*\.\s*'
    r'(?P<month_from>\d+)\s*\.\s*'
    r'-\s*'
    r'(?P<day_to>\d+)\s*\.\s*'
    r'(?P<month_to>\d+)\s*\.\s*'
    r'(?P<year_to>\d+)',
    re.UNICODE,
)

# strftime format used for the dates written to the output.
FMT = '%Y-%m-%d'
def get_last_plan():
    """
    Fetch the listing of articles in the "tydenni-plany" (weekly plans)
    category and pull the link to the detail page out of the first
    listed article.

    Returns: string with the href of the detail page, or None when the
    listing contains no article link (or the first link has no href).

    Raises: requests exceptions on network failure or timeout.
    """
    # Without a timeout requests waits forever on a stalled server.
    r = requests.get('%s/clanky/tydenni-plany/' % SITE_URL, timeout=30)
    d = pq(r.content)
    anchors = d('#articles div.article h2 a')
    if not anchors:
        return None
    # .get() yields None instead of raising KeyError on an anchor without href.
    return anchors[0].get('href')
def parse_term(title):
    """
    Parse the period the homework is assigned for out of a weekly plan
    title, e.g. "12. 5. - 18. 5. 2014".

    The title carries a year only for the end of the range. The start is
    assumed to fall in the same year, unless its month/day lies after the
    end's month/day, in which case the range crosses New Year and the start
    is placed in the previous year.

    Returns: dict with the structure
        'date_from': datetime.date or None
        'date_to': datetime.date or None
    Both values are None when the title is empty/None, contains no
    recognizable range, or the numbers do not form valid calendar dates.
    """
    no_term = {'date_from': None, 'date_to': None}

    # `title or ''` guards against None (e.g. an element without text).
    m = TERM_RE.search(title or '')
    if not m:
        return no_term

    parts = dict((key, int(value)) for key, value in m.groupdict().items())
    year_to = parts['year_to']
    year_from = year_to
    if (parts['month_from'], parts['day_from']) > (parts['month_to'], parts['day_to']):
        # e.g. "28. 12. - 3. 1. 2016": the start belongs to the previous year.
        year_from = year_to - 1

    try:
        date_from = date(year_from, parts['month_from'], parts['day_from'])
        date_to = date(year_to, parts['month_to'], parts['day_to'])
    except ValueError:
        # Impossible date in the title (e.g. "31. 2."); treat as unparseable.
        return no_term

    return {'date_from': date_from, 'date_to': date_to}
def parse_plan(url):
    """
    Parse the list of homework and the covered period out of a weekly
    plan detail page.

    url: path of the detail page; it is appended to SITE_URL.

    Returns: None when no url is given or the page contains no table rows,
    otherwise a dict with the structure
        'homeworks': list of dicts {'type': ..., 'msg': ...} built from the
                     first two <td> cells of every table row that has cells
        'date_from': 'YYYY-MM-DD' string, or None if the title has no term
        'date_to': 'YYYY-MM-DD' string, or None if the title has no term

    Raises: requests exceptions on network failure or timeout.
    """
    if not url:
        # Nothing to fetch; avoids requesting a bogus "<SITE_URL>None" address.
        return None

    # Without a timeout requests waits forever on a stalled server.
    r = requests.get('%s%s' % (SITE_URL, url), timeout=30)
    d = pq(r.content)
    rows = d('div.article table tr')
    if not rows:
        return None

    homeworks = []
    for row in rows:
        cells = row.findall('td')
        if not cells:
            # Rows without <td> (e.g. header rows) carry no homework.
            continue
        # .text is None when a cell starts with a child element; fall back
        # to an empty string instead of crashing. Text nested inside child
        # elements is not collected.
        texts = [(cell.text or '').strip() for cell in cells]
        homeworks.append(dict(zip(['type', 'msg'], texts)))

    title = d('div.article h2 span.span-a-title')
    title = (title[0].text or '') if len(title) else ''
    term = parse_term(title)

    date_from = term['date_from']
    date_to = term['date_to']
    return {
        'homeworks': homeworks,
        # The dates are None when the title holds no parseable term.
        'date_from': date_from.strftime(FMT) if date_from is not None else None,
        'date_to': date_to.strftime(FMT) if date_to is not None else None,
    }
def save_json(filename, data):
    """
    Write the parsed data as JSON into the file of the given name.

    filename: path of the file to create or overwrite
    data: object to serialize with simplejson
    """
    with open(filename, "wt") as output:
        simplejson.dump(data, output)
if __name__ == "__main__":
    # Find the newest weekly plan, parse it and store the result as JSON.
    url = get_last_plan()
    if url is None:
        # No article link on the listing page: there is nothing to parse, so
        # stop with an error message and a non-zero exit code.
        sys.exit('No weekly plan found on %s' % SITE_URL)
    data = parse_plan(url)
    # NOTE: data is None when the detail page has no table; that is written
    # out as JSON null, same as before.
    save_json('ukoly.json', data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment