Skip to content

Instantly share code, notes, and snippets.

@joannecheng
Last active December 11, 2015 22:38
Show Gist options
  • Save joannecheng/4670457 to your computer and use it in GitHub Desktop.
Save joannecheng/4670457 to your computer and use it in GitHub Desktop.
ScraperWiki for Denver Streets and Sidewalks data https://scraperwiki.com/scrapers/denver_streets_and_sidewalks/
# TODO:
#   - Geocode addresses
#   - Parse start/end dates
import scraperwiki
import urllib2
import re
from bs4 import BeautifulSoup
def get_dates(date_string):
    """Extract the start and end dates from a closure's date line.

    Args:
        date_string: Text containing a range of the form
            "Start Date: M/D/Y - End Date: M/D/Y".

    Returns:
        A ``(start_date, end_date)`` tuple holding the two date strings
        exactly as they appear in ``date_string``.

    Raises:
        ValueError: If ``date_string`` does not contain both dates in the
            expected layout.
    """
    # Raw string so the regex escapes (\s, \d) reach the regex engine
    # untouched instead of being interpreted as string escapes.
    re_date = r"Start Date:\s+(\d+\/\d+\/\d+)\s+-\s+End Date:\s+(\d+\/\d+\/\d+)"
    match = re.search(re_date, date_string)
    if match is None:
        # Without this check a malformed line fails with an opaque
        # "'NoneType' object has no attribute 'groups'" error.
        raise ValueError("Unrecognized date range: %r" % (date_string,))
    return match.groups()
# Label lines found in the text extracted from the PDF, mapped to the record
# key under which the line that follows each label is stored.
_FIELD_KEYS = {
    "Location: ": "location",
    "Type: ": "type",
    "Time: ": "time",
    "Purpose: ": "purpose",
    "Contractor: ": "contractor",
}


def _parse_closures(lines):
    """Group label/value line pairs into one dict per closure.

    Each recognised label line ("Location: ", "Type: ", "Date: ", "Time: ",
    "Purpose: ", "Contractor: ") is followed by a line holding its value.
    A "Location: " label starts a new record; every other label adds a
    field to the most recently started record.

    Args:
        lines: Sequence of text lines to scan.

    Returns:
        A list of dicts with the keys 'location', 'type', 'start_date',
        'end_date', 'time', 'purpose' and 'contractor' (only those whose
        labels were present).
    """
    records = []
    remaining = iter(lines)
    for label in remaining:
        if label != "Date: " and label not in _FIELD_KEYS:
            continue  # not a label line; nothing to record

        # The value always sits on the line after its label. This applies
        # to "Contractor: " as well, which previously stored the label
        # text itself instead of the contractor name.
        value = next(remaining, None)
        if value is None:
            break  # label was the very last line, so it has no value

        if label == "Location: ":
            records.append({})
        elif not records:
            continue  # field seen before any location; no record to attach to

        record = records[-1]
        if label == "Date: ":
            record['start_date'], record['end_date'] = get_dates(value)
        else:
            record[_FIELD_KEYS[label]] = value
    return records


url = "https://www.denvergov.org/Portals/707/documents/mydenverdrive/1-22-25-2013.pdf"

# Close the HTTP response explicitly once the PDF bytes have been read.
response = urllib2.urlopen(url)
try:
    pdf_data = response.read()
finally:
    response.close()

xml = scraperwiki.pdftoxml(pdf_data)
parsed = BeautifulSoup(xml).text.split("\n")

# Everything before the first "Location: " label is discarded. index()
# raises ValueError when the label is missing, which makes an unexpected
# document layout fail loudly instead of silently saving nothing.
filtered_list = parsed[parsed.index('Location: '):]
closures = _parse_closures(filtered_list)

# Rows are keyed on 'location', so two closures with the same location text
# share a single row in the datastore.
for closure in closures:
    scraperwiki.sqlite.save(unique_keys=['location'], data=closure)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment