Created
November 4, 2013 20:02
-
-
Save alexstorer/7308404 to your computer and use it in GitHub Desktop.
Finnish Parliament Parsing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Import statements - what libraries will we use? | |
| from lxml import etree | |
| from lxml import html | |
| import urllib2, signal, time, random | |
| # These are helper functions for handling network errors | |
def handler(signum, frame):
    """SIGALRM callback: abort a hung network request.

    Installed via signal.signal below; signal.alarm(5) in the fetch
    loop delivers SIGALRM if a request hangs, and this raises so the
    retry logic in tryOpenParse can catch it and try again.
    """
    raise Exception('Last Resort!')
# Deliver SIGALRM to handler() so alarm(5) can interrupt a hung fetch.
signal.signal(signal.SIGALRM, handler)

# Shared HTML parser instance for lxml.etree parsing.
parser = etree.HTMLParser()
| # def tryOpenParse(thisurl): | |
| # global parser | |
| # #print "trying to parse " + thisurl | |
| # print "trying to parse..." | |
| # loaded = False | |
| # counter = 0 | |
| # tree = None | |
| # while ((not loaded) and (counter<5)): | |
| # if counter>0: | |
| # print "Failed so far: ", counter | |
| # print thisurl | |
| # if counter>10: | |
| # print thisurl | |
| # return None | |
| # try: | |
| # counter+=1 | |
| # signal.alarm(5) | |
| # u = urllib2.urlopen(thisurl) | |
| # tree = etree.parse(u, parser) | |
| # signal.alarm(0) | |
| # loaded = True | |
| # except Exception,e: | |
| # print e | |
| # signal.alarm(0) | |
| # time.sleep(5*random.random()) | |
| # print "...done!" | |
| # return tree | |
| def tryOpenParse(thisurl): | |
| #print "trying to parse " + thisurl | |
| print "trying to parse..." | |
| loaded = False | |
| counter = 0 | |
| tree = None | |
| while ((not loaded) and (counter<5)): | |
| if counter>0: | |
| print "Failed so far: ", counter | |
| print thisurl | |
| if counter>10: | |
| print thisurl | |
| return None | |
| try: | |
| counter+=1 | |
| signal.alarm(5) | |
| #u = urllib2.urlopen(thisurl) | |
| #tree = html.parse(u) | |
| u = urllib2.urlopen(thisurl) | |
| tree = html.document_fromstring(u.read()) | |
| signal.alarm(0) | |
| loaded = True | |
| except Exception,e: | |
| print e | |
| signal.alarm(0) | |
| time.sleep(5*random.random()) | |
| print "...done!" | |
| return tree | |
######################################
# functions for parsing

# Module-level accumulators filled in by the scraping passes below.
peoplelinks = {}   # person name (utf-8, spaces stripped) -> profile URL
allsessions = []   # one {'session', 'person', 'dates'} dict per table row
allpeople = []     # one dict per person, as returned by getPerson()
| def getNames(url): | |
| global baseurl | |
| global peoplelinks | |
| global allsessions | |
| ftree = tryOpenParse(url) | |
| if len(ftree.xpath('//frame'))>0: | |
| ntree = tryOpenParse(baseurl + ftree.xpath('//frame/@src')[0]) | |
| else: | |
| ntree = ftree | |
| ntitle = ''.join(ntree.xpath('//a[@name="alkuun"]//strong/text()')).strip().encode('utf-8') | |
| people = ntree.xpath('/html/body/table[2]//tr/td[1]//a/text()') | |
| dates = ntree.xpath('/html/body/table[2]//tr/td[2]/font/text()') | |
| links = ntree.xpath('/html/body/table[2]//tr/td[1]//a/@href') | |
| if len(people)==len(dates) and len(dates)==len(links): | |
| for (p,d,l) in zip(people,dates,links): | |
| pnorm = p.strip().encode('utf-8') | |
| pnorm = pnorm.replace(' ','') | |
| allsessions.append({'session':ntitle,'person':pnorm,'dates':d.strip()}) | |
| peoplelinks[pnorm] = baseurl+l | |
| else: | |
| print "Different numbers of table entries in ", ntitle | |
| print "Check url: ", url | |
| nextlink = ntree.xpath('//img[@alt="Seuraava sivu"]/ancestor::a/@href') | |
| if len(nextlink)>0: | |
| getNames(baseurl + nextlink[0]) | |
def getPerson(url):
    """Scrape one member's profile page into a dict of label -> value.

    Follows the frameset to the data frame, then reads every table row
    that has a second column, splitting its text on the first ':' into
    a label and a value (both UTF-8 encoded).  Also stores the page's
    bold text under 'Header'.

    Returns the dict, or None if the page could not be loaded.
    """
    global allpeople
    ftree = tryOpenParse(url)
    if ftree is None:
        # Network failure after all retries; let the caller record None.
        return None
    framesrcs = ftree.xpath('//frame/@src')
    if len(framesrcs) > 3:
        # Profile pages are framesets; the 4th frame holds the data.
        # (Guarding the index: the original unconditionally took [3]
        # and raised IndexError on framesets with fewer frames.)
        ntree = tryOpenParse(baseurl + framesrcs[3])
        if ntree is None:
            return None
    else:
        ntree = ftree
    thisperson = {}
    thisperson['Header'] = ' '.join(ntree.xpath('//b/text()'))
    # Only rows that have a second column carry "label: value" data.
    for r in ntree.xpath('//tr/td[2]/ancestor::tr'):
        rstr = r.text_content()
        # Split on the FIRST ':' only, so values that themselves contain
        # colons (URLs, times) are kept intact; skip rows without one
        # (the original indexed [1] unconditionally and could raise).
        rsplit = rstr.split(':', 1)
        if len(rsplit) == 2:
            thisperson[rsplit[0].strip().encode('utf-8')] = rsplit[1].strip().encode('utf-8')
    return thisperson
| # Get the URL with the data we need | |
| #ind = 'http://www.eduskunta.fi/thwfakta/hetekau/hex/hxent.htm' | |
| ind = 'http://www.eduskunta.fi/thwfakta/tilasto/heteka/hx6700-su.htm' | |
| baseurl = 'http://www.eduskunta.fi/' | |
| tree = tryOpenParse(ind) | |
| # These are all of the link elements with text including Vaalikausi | |
| vlinks = tree.xpath("//a[contains(text(),'Vaalikausi')]/@href") | |
| for v in vlinks: | |
| print '*******************************************' | |
| print 'Trying next...', v | |
| print '*******************************************' | |
| getNames(baseurl+v) | |
| for p in peoplelinks: | |
| print '*******************************************' | |
| print 'Trying next...', p | |
| print '*******************************************' | |
| allpeople.append(getPerson(peoplelinks[p])) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment