Skip to content

Instantly share code, notes, and snippets.

@dcalacci
Last active May 14, 2020 02:12
Show Gist options
  • Save dcalacci/99c931c198443b08dcc1eab762aedad2 to your computer and use it in GitHub Desktop.
Save dcalacci/99c931c198443b08dcc1eab762aedad2 to your computer and use it in GitHub Desktop.
Daily script to pull Boston incident reports
"""Daily script: download the latest Boston crime-incident-report CSV.

Scrapes the data.boston.gov dataset page for its blue "primary" download
buttons, picks the one whose href ends in ".csv", and saves that file to
crime-incident-reports-daily/YYYY-MM-DD.csv (one file per day).
"""
from bs4 import BeautifulSoup
import time
import os
import requests
import urllib.request  # plain `import urllib` does not expose urllib.request in Python 3

# Fetch the dataset landing page. The urlopen response is file-like, so it can
# be handed to BeautifulSoup directly -- no need for the private `.fp` attribute.
f = urllib.request.urlopen(
    'https://data.boston.gov/dataset/crime-incident-reports-august-2015-to-date-source-new-system'
)
# Explicit parser: without it bs4 warns and may choose different parsers on
# different machines, which can change how the page is parsed.
soup = BeautifulSoup(f, "html.parser")

# Finds all the "primary" buttons on the page -- these are those blue ones --
# that match the right name. If this script breaks, it's probably because this
# name changed.
maybe_buttons = (
    soup.find(title="Crime Incident Reports (August 2015 - To Date) (Source - New System)")
    .findNext("div")
    .select('.btn-primary')
)

# Keep the last button whose href ends in ".csv" (None if no CSV button exists).
csv_url = None
for btn in maybe_buttons:
    is_csv = btn.attrs["href"].split(".")[-1] == "csv"
    if is_csv:
        csv_url = btn.attrs["href"]

outdir = "crime-incident-reports-daily"
filename = time.strftime("%Y-%m-%d.csv")  # e.g. 2020-05-14.csv

if csv_url is not None:
    response = requests.get(csv_url)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    # Binary mode: write the CSV bytes exactly as served.
    with open(os.path.join(outdir, filename), "wb") as f:
        f.write(response.content)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment