Last active
December 15, 2015 06:39
-
-
Save infinity0/5217482 to your computer and use it in GitHub Desktop.
Conception connection, inspired by http://www.smbc-comics.com/index.php?db=comics&id=2922#comic
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# | |
# Usage: | |
# | |
# $ ./conception.py 1969-12-25 | |
# searching for most recent event before 1969-04-03 | |
# <li><a href="/wiki/April_1" title="April 1">April 1</a> | |
# – The <a href="/wiki/Hawker_Siddeley_Harrier" title="Hawker Siddeley Harrier">Hawker Siddeley Harrier</a> enters | |
# service with the <a href="/wiki/Royal_Air_Force" title="Royal Air Force">Royal Air Force</a>.</li> | |
# | |
# If python complains about ImportError, install tidy/lxml python libs | |
# | |
# $ aptitude install python-tidylib python-lxml | |
from bisect import bisect | |
from lxml import etree | |
import os.path | |
from functools import partial | |
import sys | |
import tidy | |
import time | |
import urllib2 | |
# TODO: better estimate from your birth weight and location
# Average human gestation (266 days), expressed in seconds, used to
# step back from a birth date to an estimated conception date.
AV_PREG_LENGTH=266*24*3600
# Options for tidy.parseString: emit XHTML with an XML declaration and
# without tidy's generator meta mark, so lxml can parse the result cleanly.
TIDY_OPTS = dict(output_xhtml=1,
                 add_xml_decl=1,
                 tidy_mark=0)
def get_content(url):
    """Fetch `url` and return the page as a parsed lxml HTML tree.

    The raw markup is passed through HTML Tidy first so that lxml
    receives well-formed XHTML.
    """
    # Wikipedia rejects the default urllib2 user agent, so present a
    # browser-like one instead.
    # (see http://stackoverflow.com/questions/120061/fetch-a-wikipedia-article-with-python)
    fetcher = urllib2.build_opener()
    fetcher.addheaders = [('User-agent', 'Mozilla/5.0')]
    raw = fetcher.open(url).read()
    cleaned = str(tidy.parseString(raw, **TIDY_OPTS))
    return etree.HTML(cleaned)
def md_to_key(d):
    """Return a zero-padded, lexicographically sortable "MM.DD" key
    for a time.struct_time `d`."""
    return "{0:02d}.{1:02d}".format(d.tm_mon, d.tm_mday)
def parse_li(parent, b_i, e_i, l):
    """Map an <li> element `l` to a sortable ("MM.DD", element) pair.

    `parent` is the container of the year page's section headings;
    `b_i`/`e_i` are the indices in `parent` that delimit the "Events"
    section. Returns None for any <li> that is outside that section or
    does not start with a parseable "Month_day" date link.
    """
    try:
        # climb to the ancestor of `l` that is a direct child of `parent`
        p = l
        while p.getparent() != parent:
            p = p.getparent()
        assert p.getparent() == parent
        i = parent.index(p)
        # discard list items outside the Events section
        if not (b_i < i < e_i): return None
        # the first link of an event item points at e.g. /wiki/April_1
        n = l.find(".//a[@href]")
        s = os.path.basename(n.get("href"))
        d = time.strptime(s, "%B_%d")
        return (md_to_key(d), l)
    except (AssertionError, AttributeError, ValueError):
        # Narrowed from `except Exception, e` (e was unused): these are
        # exactly the failures that mean "not an event <li>" —
        # AttributeError when the climb walks off the tree root or no
        # <a href> exists, ValueError from parent.index/strptime, and
        # the assertion above. Anything else should propagate.
        return None
def get_events(y):
    """Scrape the Wikipedia page for year `y` and return its "Events"
    entries as a list of ("MM.DD", <li> element) pairs, sorted by key."""
    dom = get_content("http://en.wikipedia.org/wiki/%s" % y)
    # locate the heading that opens the "Events" section, then walk
    # forward to the next <h2>, which closes it
    events_h2 = dom.find(".//span[@id='Events']/..")
    end_h2 = events_h2.getnext()
    while end_h2.tag != 'h2':
        end_h2 = end_h2.getnext()
    container = events_h2.getparent()
    assert container == end_h2.getparent()
    lo = container.index(events_h2)
    hi = container.index(end_h2)
    # keep only the <li> items that parse_li recognizes as dated events
    parsed = (parse_li(container, lo, hi, li) for li in dom.findall(".//li"))
    return sorted(item for item in parsed if item)
if __name__ == '__main__':
    # Step back one average pregnancy from the birth date given on the
    # command line (YYYY-MM-DD) to get the estimated conception date.
    d = time.localtime(time.mktime(time.strptime(sys.argv[1], "%Y-%m-%d")) - AV_PREG_LENGTH)
    print time.strftime("searching for most recent event before %Y-%m-%d", d)
    dd = get_events(d.tm_year)
    # dd is sorted by "MM.DD" key; bisect finds where the conception
    # date would insert. (md_to_key(d), None) relies on Python 2
    # ordering where None sorts below any lxml element, so ties on the
    # key land before the existing entries.
    i = bisect(dd, (md_to_key(d), None))
    if i > 0:
        # most recent event on or before the conception date
        ev = dd[i-1]
    else:
        # need to check previous year
        dd = get_events(d.tm_year - 1)
        ev = dd[-1]
    print etree.tostring(ev[1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment