Skip to content

Instantly share code, notes, and snippets.

@alexstorer
Created November 1, 2013 18:54
Show Gist options
  • Save alexstorer/7270126 to your computer and use it in GitHub Desktop.
Save alexstorer/7270126 to your computer and use it in GitHub Desktop.
Scrape Repetti Online Database
from selenium import webdriver
import time
import re
import csv
def isReady(browser):
    """Report whether the page loaded in ``browser`` has finished loading.

    Executes ``return document.readyState`` in the page via
    ``browser.execute_script`` and compares the result with ``"complete"``.

    Args:
        browser: Object exposing ``execute_script(script)``; this script
            passes the Selenium Firefox webdriver it creates.

    Returns:
        True when the script result equals ``"complete"``, otherwise False.
    """
    return browser.execute_script("return document.readyState") == "complete"
# Scrape record pages one by one and write each page's "key: value" pairs as a
# row of locs.csv.
#
# NOTE(review): the '\xc3\xa0' escape in the field list (the UTF-8 bytes of
# "a with grave accent") and the .encode('utf-8') calls below look written for
# Python 2 byte strings. Under Python 3 the encoded keys would be bytes and
# would not match the str field names, so DictWriter.writerow would raise
# ValueError -- confirm the target interpreter before porting.

# Column names for the csv dictionary writer.
f = ['Comune', 'Comunit\xc3\xa0', 'Riferimenti', 'Provincia', 'Compartimento', 'N. scheda', 'UTM (32N)', 'Denominazione', 'Piviere', 'Volume', 'WGS 1984', 'Gauss Boaga', 'Pagina', 'Popolo', 'Toponimo IGM', 'Giurisdizione', 'Quadrante IGM', 'Stato', 'Diocesi', 'ID']

# Compiled once instead of on every page. Both groups are greedy, so a line
# with several colons is split at its LAST colon.
pair_re = re.compile(r'(.*):(.*)')

# Every scraped row is also kept in memory, in page order.
alldicts = []

browser = webdriver.Firefox()
try:
    # The context manager guarantees the CSV is flushed and closed even when a
    # page fails part-way through the run.
    with open('/Users/astorer/Work/dsmail/locs.csv', 'w') as outfile:
        dw = csv.DictWriter(outfile, fieldnames=f)
        dw.writeheader()

        for idx in range(1, 20):  # of 5336
            browser.get('http://193.205.4.99/repetti/tester.php?idx=' + str(idx))
            # Poll until the document reports that it has finished loading.
            while not isReady(browser):
                time.sleep(0.1)

            # Raises IndexError if the page has no <div id="main_sk">.
            pagedata = browser.find_elements_by_xpath("//div[@id='main_sk']")[0].text

            nextrow = dict()
            for key, value in pair_re.findall(pagedata):
                nextrow[key.strip().encode('utf-8')] = value.strip().encode('utf-8')

            alldicts.append(nextrow)
            # DictWriter raises ValueError for a key that is not listed in f.
            dw.writerow(nextrow)
finally:
    # Always shut the browser down so no Firefox process is left behind.
    browser.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment