fabienhinault · August 29, 2015 14:00
diff --git a/deputes.py b/deputes.py
 import urllib
 import os
 import lxml.html
 import re


 # Walk the links of the liste_alpha page (legislature=11) and print one
 # "name;date" line per linked page.
 doc = lxml.html.parse("http://www.assemblee-nationale.fr/qui/xml/liste_alpha.asp?legislature=11").getroot()
 # Rewrite relative hrefs as absolute URLs so they can be handed to urlopen as-is.
 doc.make_links_absolute()
 # Day number (optionally written "1er"), one non-blank word, 4-digit year.
 # Only the first such match in a page is used -- presumably the birth date.
 birth = re.compile('[0-9][0-9]?e?r? \\S+ [0-9]{4}')
 # Text of the first <h1> element on the page.
 name = re.compile('<[Hh]1[^>]*?>(.*?)</[Hh]1>')
 for d in doc.xpath('id("deputes")/li/a'):
     page = urllib.urlopen(d.get('href')).read()
     name_match = name.search(page)
     birth_match = birth.search(page)
     # search() returns None when nothing matches; emit "?" for that field
     # instead of aborting the whole run with an AttributeError.
     full_name = name_match.group(1) if name_match else "?"
     birth_date = birth_match.group().replace("1er ", "1 ") if birth_match else "?"
     print("%s;%s" % (full_name, birth_date))
diff --git a/deputes11.py b/deputes11.py
 import urllib
 import os
 import lxml.html
 import re


 # Walk the links of the liste_alpha page (legislature=11) and print one
 # "title;date" line per linked page, with "?" for a field that is not found.
 doc = lxml.html.parse("http://www.assemblee-nationale.fr/qui/xml/liste_alpha.asp?legislature=11").getroot()
 # Rewrite relative hrefs as absolute URLs so they can be handed to urlopen as-is.
 doc.make_links_absolute()
 # Day number (optionally written "1er"), one non-blank word, 4-digit year.
 # Only the first such match in a page is used -- presumably the birth date.
 birth = re.compile('[0-9][0-9]?e?r? \\S+ [0-9]{4}')
 # Text of the page's upper-case <TITLE> element.
 name = re.compile('<TITLE>(.*?)</TITLE>')
 for d in doc.xpath('id("deputes")/li/a'):
     page = urllib.urlopen(d.get('href')).read()
     birth_match = birth.search(page)
     # Explicit None test rather than a bare "except:", which would also
     # swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
     if birth_match is None:
         strBirth = "?"
     else:
         strBirth = birth_match.group().replace("1er ", "1 ")
     name_match = name.search(page)
     # Same fallback for a page without a <TITLE> match.
     title = name_match.group(1) if name_match else "?"
     print("%s;%s" % (title, strBirth))
diff --git a/deputesWikipedia.py b/deputesWikipedia.py
 import urllib
 import os
 import lxml.html
 import re

 # Experimental variant: download a page through urllib, dump it, then list
 # the links found in its top-level table cells.
 #
 # urllib picks up the http_proxy environment variable. The value below looks
 # like a placeholder -- replace it with the real proxy URL and port.
 os.environ['http_proxy'] = "http://<proxy-url>:<port>"
 # Other pages tried during development:
 #doc = lxml.html.parse("http://fr.wikipedia.org/wiki/Liste_des_d%C3%A9put%C3%A9s_de_la_Ire_l%C3%A9gislature_de_la_Cinqui%C3%A8me_R%C3%A9publique").getroot()
 #doc = lxml.html.parse("http://en.wikipedia.org/wiki/Main_Page").getroot()
 #doc = lxml.html.parse("http://www.assemblee-nationale.fr/qui/xml/liste_alpha.asp?legislature=11").getroot()
 url = "http://en.wikipedia.org/wiki/Main_Page"
 page = urllib.urlopen(url).read()
 print(page)
 # Build the tree from the text that was just downloaded; previously "doc"
 # was never assigned and the next line raised NameError. base_url is passed
 # so make_links_absolute() has something to resolve relative hrefs against.
 # NOTE(review): presumably the page is fetched with urllib rather than
 # lxml.html.parse so that the proxy setting above is honoured -- confirm.
 doc = lxml.html.fromstring(page, base_url=url)
 doc.make_links_absolute()
 birth = re.compile('[0-9][0-9]?e?r? \\S+ [0-9]{4}')
 name = re.compile('<[Hh]1[^>]*?>(.*?)</[Hh]1>')
 for d in doc.xpath('/html/body/table/tr/td/a'):
     print(d)
 # Per-link processing, still disabled:
 #    page = urllib.urlopen(d.get('href')).read()
 #    print "%s;%s"%(name.search(page).expand("\\1"),birth.search(page).group().replace("1er ","1 "))
	import urllib
	import os
	import lxml.html
	import re


	# Walk the links of the liste_alpha page (legislature=11) and print one
	# "name;date" line per linked page.
	doc = lxml.html.parse("http://www.assemblee-nationale.fr/qui/xml/liste_alpha.asp?legislature=11").getroot()
	# Rewrite relative hrefs as absolute URLs so they can be handed to urlopen as-is.
	doc.make_links_absolute()
	# Day number (optionally written "1er"), one non-blank word, 4-digit year.
	# Only the first such match in a page is used -- presumably the birth date.
	birth = re.compile('[0-9][0-9]?e?r? \\S+ [0-9]{4}')
	# Text of the first <h1> element. The "*" quantifiers are required: without
	# them the pattern only matches a heading holding at most one character.
	name = re.compile('<[Hh]1[^>]*?>(.*?)</[Hh]1>')
	for d in doc.xpath('id("deputes")/li/a'):
		page = urllib.urlopen(d.get('href')).read()
		name_match = name.search(page)
		birth_match = birth.search(page)
		# search() returns None when nothing matches; emit "?" for that field
		# instead of aborting the whole run with an AttributeError.
		full_name = name_match.group(1) if name_match else "?"
		birth_date = birth_match.group().replace("1er ", "1 ") if birth_match else "?"
		print("%s;%s" % (full_name, birth_date))