@alexstorer · Created November 4, 2013 20:02
Finnish Parliament Parsing
# Import statements - which libraries will we use?
from lxml import etree
from lxml import html
import urllib2, signal, time, random

# Helper for handling network errors: SIGALRM interrupts a urlopen call that
# hangs for too long, so the retry loop below can give up and try again.
def handler(signum, frame):
    raise Exception('Last Resort!')

signal.signal(signal.SIGALRM, handler)

parser = etree.HTMLParser()
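# A minimal sketch of the alarm pattern used in tryOpenParse below (the 5-second
# cap and the example URL here are illustrative, not part of the scraper itself):
#
#     signal.alarm(5)          # ask the OS for SIGALRM in 5 seconds
#     try:
#         u = urllib2.urlopen('http://www.eduskunta.fi/')
#     finally:
#         signal.alarm(0)      # cancel the pending alarm either way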
# Earlier version of tryOpenParse, kept for reference: it parsed the response
# with etree.parse() rather than html.document_fromstring().
# def tryOpenParse(thisurl):
#     global parser
#     #print "trying to parse " + thisurl
#     print "trying to parse..."
#     loaded = False
#     counter = 0
#     tree = None
#     while ((not loaded) and (counter < 5)):
#         if counter > 0:
#             print "Failed so far: ", counter
#             print thisurl
#         try:
#             counter += 1
#             signal.alarm(5)
#             u = urllib2.urlopen(thisurl)
#             tree = etree.parse(u, parser)
#             signal.alarm(0)
#             loaded = True
#         except Exception, e:
#             print e
#             signal.alarm(0)
#             time.sleep(5*random.random())
#     print "...done!"
#     return tree
def tryOpenParse(thisurl):
    """Fetch thisurl and return an lxml HTML tree, retrying on failure.

    Returns None if the page could not be loaded after several attempts.
    """
    #print "trying to parse " + thisurl
    print "trying to parse..."
    loaded = False
    counter = 0
    tree = None
    # Try up to 5 times; each attempt is capped at 5 seconds by SIGALRM.
    while ((not loaded) and (counter < 5)):
        if counter > 0:
            print "Failed so far: ", counter
            print thisurl
        try:
            counter += 1
            signal.alarm(5)
            #u = urllib2.urlopen(thisurl)
            #tree = html.parse(u)
            u = urllib2.urlopen(thisurl)
            tree = html.document_fromstring(u.read())
            signal.alarm(0)
            loaded = True
        except Exception as e:
            print e
            signal.alarm(0)
            # Back off for a random fraction of 5 seconds before retrying.
            time.sleep(5*random.random())
    print "...done!"
    # tree is still None here if every attempt failed.
    return tree
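# Example use of the helper (this is the same index page the script opens
# further down; any other eduskunta.fi page would work the same way):
#
#     page = tryOpenParse('http://www.eduskunta.fi/thwfakta/tilasto/heteka/hx6700-su.htm')
#     if page is not None:
#         print page.xpath('//title/text()')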
######################################
# Functions for parsing

peoplelinks = {}   # member name -> absolute URL of their detail page
allsessions = []   # one dict per (session, member, dates) table row
allpeople = []     # one dict per member, filled in by getPerson()

def getNames(url):
    """Collect member names, dates and detail links from one electoral-term
    listing, following the 'Seuraava sivu' (next page) link recursively."""
    global baseurl
    global peoplelinks
    global allsessions
    ftree = tryOpenParse(url)
    if ftree is None:
        print "Could not load ", url
        return
    # Some listings are wrapped in a frameset; follow the first frame if so.
    if len(ftree.xpath('//frame')) > 0:
        ntree = tryOpenParse(baseurl + ftree.xpath('//frame/@src')[0])
    else:
        ntree = ftree
    ntitle = ''.join(ntree.xpath('//a[@name="alkuun"]//strong/text()')).strip().encode('utf-8')
    people = ntree.xpath('/html/body/table[2]//tr/td[1]//a/text()')
    dates = ntree.xpath('/html/body/table[2]//tr/td[2]/font/text()')
    links = ntree.xpath('/html/body/table[2]//tr/td[1]//a/@href')
    if len(people) == len(dates) and len(dates) == len(links):
        for (p, d, l) in zip(people, dates, links):
            # Drop non-breaking spaces (they come through as u'\xa0') and whitespace.
            pnorm = p.replace(u'\xa0', u'').strip().encode('utf-8')
            allsessions.append({'session': ntitle, 'person': pnorm, 'dates': d.strip()})
            peoplelinks[pnorm] = baseurl + l
    else:
        print "Different numbers of table entries in ", ntitle
        print "Check url: ", url
    # Recurse into the next page of the listing, if there is one.
    nextlink = ntree.xpath('//img[@alt="Seuraava sivu"]/ancestor::a/@href')
    if len(nextlink) > 0:
        getNames(baseurl + nextlink[0])
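# After getNames() has walked one listing, allsessions holds rows shaped like
# the following (the values shown are illustrative, not from a real page):
#
#     {'session': 'Vaalikausi 2003 - 2006',
#      'person':  'Virtanen, Matti',
#      'dates':   '19.03.2003 -'}
#
# and peoplelinks maps each name to the absolute URL of that member's detail page.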
def getPerson(url):
    """Scrape one member's detail page into a dict of 'label: value' fields."""
    ftree = tryOpenParse(url)
    if ftree is None:
        print "Could not load ", url
        return {}
    # The detail pages are framesets; the biographical data sits in the fourth frame.
    if len(ftree.xpath('//frame')) > 0:
        ntree = tryOpenParse(baseurl + ftree.xpath('//frame/@src')[3])
    else:
        ntree = ftree
    thisperson = {}
    thisperson['Header'] = ' '.join(ntree.xpath('//b/text()'))
    # Only rows with a second column carry 'label: value' data worth keeping.
    for r in ntree.xpath('//tr/td[2]/ancestor::tr'):
        rstr = r.text_content()
        # Split on the first colon only, and skip rows that have none.
        rsplit = rstr.split(':', 1)
        if len(rsplit) == 2:
            thisperson[rsplit[0].strip().encode('utf-8')] = rsplit[1].strip().encode('utf-8')
    return thisperson
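# getPerson() returns one dict per member: 'Header' holds the bolded page title,
# and every 'label: value' table row becomes a key/value pair, keyed by the
# page's own (Finnish, utf-8 encoded) field label. Illustrative shape only:
#
#     {'Header': 'Virtanen, Matti', '<Finnish field label>': '<value>', ...}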
# Get the URL with the data we need
#ind = 'http://www.eduskunta.fi/thwfakta/hetekau/hex/hxent.htm'
ind = 'http://www.eduskunta.fi/thwfakta/tilasto/heteka/hx6700-su.htm'
baseurl = 'http://www.eduskunta.fi/'
tree = tryOpenParse(ind)

# These are all of the link elements whose text contains 'Vaalikausi'
# (one link per electoral term).
vlinks = tree.xpath("//a[contains(text(),'Vaalikausi')]/@href")

# First pass: collect every member's name and detail-page link, term by term.
for v in vlinks:
    print '*******************************************'
    print 'Trying next...', v
    print '*******************************************'
    getNames(baseurl + v)

# Second pass: scrape each member's detail page.
for p in peoplelinks:
    print '*******************************************'
    print 'Trying next...', p
    print '*******************************************'
    allpeople.append(getPerson(peoplelinks[p]))
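# Optional sketch for saving what was collected; the filename and the utf-8
# handling of the date field are assumptions, not part of the scrape itself.
import csv
with open('sessions.csv', 'wb') as f:
    w = csv.DictWriter(f, fieldnames=['session', 'person', 'dates'])
    w.writeheader()
    for row in allsessions:
        w.writerow({'session': row['session'],
                    'person': row['person'],
                    'dates': row['dates'].encode('utf-8')})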