Created
November 4, 2013 20:02
-
-
Save alexstorer/7308404 to your computer and use it in GitHub Desktop.
Finnish Parliament Parsing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Import statements - what libraries will we use? | |
| from lxml import etree | |
| from lxml import html | |
| import urllib2, signal, time, random | |
| # These are helper functions for handling network errors | |
def handler(signum, frame):
    """SIGALRM callback: abort a hung network request.

    Installed via signal.signal below; signal.alarm(5) in the fetch
    loop delivers SIGALRM if a request hangs, and this raises so the
    retry logic in tryOpenParse can catch it and try again.
    """
    raise Exception('Last Resort!')
# Deliver SIGALRM to handler() so alarm(5) can interrupt a hung fetch.
signal.signal(signal.SIGALRM, handler)

# Shared HTML parser instance for lxml.etree parsing.
parser = etree.HTMLParser()
| # def tryOpenParse(thisurl): | |
| # global parser | |
| # #print "trying to parse " + thisurl | |
| # print "trying to parse..." | |
| # loaded = False | |
| # counter = 0 | |
| # tree = None | |
| # while ((not loaded) and (counter<5)): | |
| # if counter>0: | |
| # print "Failed so far: ", counter | |
| # print thisurl | |
| # if counter>10: | |
| # print thisurl | |
| # return None | |
| # try: | |
| # counter+=1 | |
| # signal.alarm(5) | |
| # u = urllib2.urlopen(thisurl) | |
| # tree = etree.parse(u, parser) | |
| # signal.alarm(0) | |
| # loaded = True | |
| # except Exception,e: | |
| # print e | |
| # signal.alarm(0) | |
| # time.sleep(5*random.random()) | |
| # print "...done!" | |
| # return tree | |
| def tryOpenParse(thisurl): | |
| #print "trying to parse " + thisurl | |
| print "trying to parse..." | |
| loaded = False | |
| counter = 0 | |
| tree = None | |
| while ((not loaded) and (counter<5)): | |
| if counter>0: | |
| print "Failed so far: ", counter | |
| print thisurl | |
| if counter>10: | |
| print thisurl | |
| return None | |
| try: | |
| counter+=1 | |
| signal.alarm(5) | |
| #u = urllib2.urlopen(thisurl) | |
| #tree = html.parse(u) | |
| u = urllib2.urlopen(thisurl) | |
| tree = html.document_fromstring(u.read()) | |
| signal.alarm(0) | |
| loaded = True | |
| except Exception,e: | |
| print e | |
| signal.alarm(0) | |
| time.sleep(5*random.random()) | |
| print "...done!" | |
| return tree | |
######################################
# functions for parsing

# Module-level accumulators filled in by the scraping passes below.
peoplelinks = {}   # person name (utf-8, spaces stripped) -> profile URL
allsessions = []   # one {'session', 'person', 'dates'} dict per table row
allpeople = []     # one dict per person, as returned by getPerson()
| def getNames(url): | |
| global baseurl | |
| global peoplelinks | |
| global allsessions | |
| ftree = tryOpenParse(url) | |
| if len(ftree.xpath('//frame'))>0: | |
| ntree = tryOpenParse(baseurl + ftree.xpath('//frame/@src')[0]) | |
| else: | |
| ntree = ftree | |
| ntitle = ''.join(ntree.xpath('//a[@name="alkuun"]//strong/text()')).strip().encode('utf-8') | |
| people = ntree.xpath('/html/body/table[2]//tr/td[1]//a/text()') | |
| dates = ntree.xpath('/html/body/table[2]//tr/td[2]/font/text()') | |
| links = ntree.xpath('/html/body/table[2]//tr/td[1]//a/@href') | |
| if len(people)==len(dates) and len(dates)==len(links): | |
| for (p,d,l) in zip(people,dates,links): | |
| pnorm = p.strip().encode('utf-8') | |
| pnorm = pnorm.replace(' ','') | |
| allsessions.append({'session':ntitle,'person':pnorm,'dates':d.strip()}) | |
| peoplelinks[pnorm] = baseurl+l | |
| else: | |
| print "Different numbers of table entries in ", ntitle | |
| print "Check url: ", url | |
| nextlink = ntree.xpath('//img[@alt="Seuraava sivu"]/ancestor::a/@href') | |
| if len(nextlink)>0: | |
| getNames(baseurl + nextlink[0]) | |
def getPerson(url):
    """Scrape one member's profile page into a dict of label -> value.

    Follows the frameset to the data frame, then reads every table row
    that has a second column, splitting its text on the first ':' into
    a label and a value (both UTF-8 encoded).  Also stores the page's
    bold text under 'Header'.

    Returns the dict, or None if the page could not be loaded.
    """
    global allpeople
    ftree = tryOpenParse(url)
    if ftree is None:
        # Network failure after all retries; let the caller record None.
        return None
    framesrcs = ftree.xpath('//frame/@src')
    if len(framesrcs) > 3:
        # Profile pages are framesets; the 4th frame holds the data.
        # (Guarding the index: the original unconditionally took [3]
        # and raised IndexError on framesets with fewer frames.)
        ntree = tryOpenParse(baseurl + framesrcs[3])
        if ntree is None:
            return None
    else:
        ntree = ftree
    thisperson = {}
    thisperson['Header'] = ' '.join(ntree.xpath('//b/text()'))
    # Only rows that have a second column carry "label: value" data.
    for r in ntree.xpath('//tr/td[2]/ancestor::tr'):
        rstr = r.text_content()
        # Split on the FIRST ':' only, so values that themselves contain
        # colons (URLs, times) are kept intact; skip rows without one
        # (the original indexed [1] unconditionally and could raise).
        rsplit = rstr.split(':', 1)
        if len(rsplit) == 2:
            thisperson[rsplit[0].strip().encode('utf-8')] = rsplit[1].strip().encode('utf-8')
    return thisperson
| # Get the URL with the data we need | |
| #ind = 'http://www.eduskunta.fi/thwfakta/hetekau/hex/hxent.htm' | |
| ind = 'http://www.eduskunta.fi/thwfakta/tilasto/heteka/hx6700-su.htm' | |
| baseurl = 'http://www.eduskunta.fi/' | |
| tree = tryOpenParse(ind) | |
| # These are all of the link elements with text including Vaalikausi | |
| vlinks = tree.xpath("//a[contains(text(),'Vaalikausi')]/@href") | |
| for v in vlinks: | |
| print '*******************************************' | |
| print 'Trying next...', v | |
| print '*******************************************' | |
| getNames(baseurl+v) | |
| for p in peoplelinks: | |
| print '*******************************************' | |
| print 'Trying next...', p | |
| print '*******************************************' | |
| allpeople.append(getPerson(peoplelinks[p])) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment