Skip to content

Instantly share code, notes, and snippets.

@fabienhinault
Last active August 29, 2015 14:00
Show Gist options
  • Save fabienhinault/4f01b2320a821553add1 to your computer and use it in GitHub Desktop.
Save fabienhinault/4f01b2320a821553add1 to your computer and use it in GitHub Desktop.
import urllib
import os
import lxml.html
import re
# Scrape the alphabetical listing for legislature 11 and print one
# "name;birth date" line for every page linked from the "deputes" list.
doc = lxml.html.parse("http://www.assemblee-nationale.fr/qui/xml/liste_alpha.asp?legislature=11").getroot()
# Resolve the listing's hrefs against the page URL so they can be fetched.
doc.make_links_absolute()
# First date-like run in the page: 1-2 digits (optionally "er"), a word, a
# 4-digit year -- e.g. "1er mars 1950". Presumably the birth date; the pattern
# itself only guarantees the shape.
birth = re.compile(r'[0-9][0-9]?e?r? \S+ [0-9]{4}')
# Text of the first single-line <h1>/<H1> element, used as the name.
name = re.compile('<[Hh]1[^>]*?>(.*?)</[Hh]1>')
for d in doc.xpath('id("deputes")/li/a'):
    response = urllib.urlopen(d.get('href'))
    try:
        page = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()
    name_match = name.search(page)
    birth_match = birth.search(page)
    # search() returns None when a page lacks the expected markup; emit "?"
    # for that field rather than aborting the whole run on one odd page.
    str_name = name_match.group(1) if name_match is not None else "?"
    if birth_match is not None:
        # Normalise the French ordinal "1er" to a plain "1".
        str_birth = birth_match.group().replace("1er ", "1 ")
    else:
        str_birth = "?"
    # Single parenthesised argument: prints the same text under Python 2.
    print("%s;%s" % (str_name, str_birth))
import urllib
import os
import lxml.html
import re
# Variant of the scraper that takes the name from the page <TITLE> and prints
# "?" when no birth date can be found. Output: one "name;birth date" line for
# every page linked from the "deputes" list of the legislature 11 listing.
doc = lxml.html.parse("http://www.assemblee-nationale.fr/qui/xml/liste_alpha.asp?legislature=11").getroot()
# Resolve the listing's hrefs against the page URL so they can be fetched.
doc.make_links_absolute()
# First date-like run in the page: 1-2 digits (optionally "er"), a word, a
# 4-digit year -- e.g. "1er mars 1950". Presumably the birth date; the pattern
# itself only guarantees the shape.
birth = re.compile(r'[0-9][0-9]?e?r? \S+ [0-9]{4}')
# Text between an upper-case <TITLE> tag pair on a single line.
name = re.compile('<TITLE>(.*?)</TITLE>')
for d in doc.xpath('id("deputes")/li/a'):
    response = urllib.urlopen(d.get('href'))
    try:
        page = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()
    birth_match = birth.search(page)
    # An explicit None check replaces the former bare "except:", which also
    # swallowed unrelated errors such as KeyboardInterrupt.
    if birth_match is not None:
        # Normalise the French ordinal "1er" to a plain "1".
        strBirth = birth_match.group().replace("1er ", "1 ")
    else:
        strBirth = "?"
    name_match = name.search(page)
    # Same "?" placeholder when the title is missing, instead of crashing.
    strName = name_match.group(1) if name_match is not None else "?"
    # Single parenthesised argument: prints the same text under Python 2.
    print("%s;%s" % (strName, strBirth))
import urllib
import os
import lxml.html
import re
# Debugging variant: fetch a page with urllib after setting an HTTP proxy,
# dump it, then list the links found in its top-level table cells.
# Placeholder proxy address -- replace with a real host and port before running.
os.environ['http_proxy'] = "http://<proxy-url>:<port>"
# Other pages previously tried as the parse target:
#   http://fr.wikipedia.org/wiki/Liste_des_d%C3%A9put%C3%A9s_de_la_Ire_l%C3%A9gislature_de_la_Cinqui%C3%A8me_R%C3%A9publique
#   http://en.wikipedia.org/wiki/Main_Page
#   http://www.assemblee-nationale.fr/qui/xml/liste_alpha.asp?legislature=11
url = "http://en.wikipedia.org/wiki/Main_Page"
response = urllib.urlopen(url)
try:
    page = response.read()
finally:
    # Release the connection even if the read fails.
    response.close()
print(page)
# `doc` was never bound here (every lxml.html.parse call was commented out),
# so the next statement raised NameError. Parse the page already fetched
# above; base_url lets make_links_absolute() resolve relative hrefs.
# NOTE(review): assumes the intent was to parse the urllib-fetched page -- confirm.
doc = lxml.html.fromstring(page, base_url=url)
doc.make_links_absolute()
# Only used by the disabled per-link scraping at the bottom of the loop.
birth = re.compile(r'[0-9][0-9]?e?r? \S+ [0-9]{4}')
name = re.compile('<[Hh]1[^>]*?>(.*?)</[Hh]1>')
for d in doc.xpath('/html/body/table/tr/td/a'):
    print(d)
    # Per-link scraping, still disabled:
    # str = urllib.urlopen(d.get('href')).read()
    # print "%s;%s"%(name.search(str).expand("\\1"),birth.search(str).group().replace("1er ","1 "))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment