Skip to content

Instantly share code, notes, and snippets.

@crimeminister
Created October 24, 2011 06:10
Show Gist options
  • Save crimeminister/1308453 to your computer and use it in GitHub Desktop.
Save crimeminister/1308453 to your computer and use it in GitHub Desktop.
Scrape ward population numbers from City of Toronto web pages.
# wardscrape.py
# encoding=utf-8
"""
Fetch a web page about each ward in the City of Toronto and extract
the population of that ward. Store the data in a tab-separated output
file.
"""
from BeautifulSoup import BeautifulSoup
import urllib2
import sys
# The URLs from which we fetch HTML pages follow this template, but with
# the '{}' replaced by a valid Toronto ward number.
URL_TEMPLATE = 'http://www.toronto.ca/wards2000/ward{}.htm'
# The range of valid Toronto ward numbers.
#
# (NOTE: Ward 1 is skipped because the code structure of its webpage is
# an outlier relative to the others. Please don't use that page as a
# reference.)
#
# WARD_MAX is passed as the stop argument of range() in the fetch loop,
# so it is an exclusive bound: the last ward fetched is WARD_MAX - 1.
WARD_MIN = 2
WARD_MAX = 45
def _fetch_ward_population(wardnum):
    """Return the population figure scraped from one ward's profile page.

    Downloads the page built from URL_TEMPLATE for ``wardnum``, parses it
    and picks the population out of the summary sentence.

    Args:
        wardnum: Ward number substituted into URL_TEMPLATE.

    Returns:
        The population as a string, exactly as it appears on the page.

    Raises:
        urllib2.URLError: If the page cannot be fetched.
        IndexError: If the page has fewer than two <p> elements or the
            sentence has fewer than six words.
        AttributeError: If the second <p> has no <font> child.
    """
    page = urllib2.urlopen(URL_TEMPLATE.format(wardnum))
    try:
        soup = BeautifulSoup(page)
    finally:
        # Always release the HTTP response, even when parsing fails, so
        # the loop does not leak one open connection per ward.
        page.close()

    # The sentence with the ward's total population lives in the second
    # <p> on the page, which has this structure:
    #   <p><font ...>This is our sentence.</font></p>
    # Pull out the element contents using the 'text' attribute.
    sentence = soup.findAll('p')[1].font.text

    # Split the sentence into words; the population is the sixth one.
    words = sentence.split()
    return str(words[5])


# Create a file called "ward_pop.tsv" holding one "Ward N<TAB>population"
# line per ward.
with open('ward_pop.tsv', 'w') as f:
    # Cycle through the Toronto ward numbers, loading and parsing the
    # ward profile webpages one at a time. WARD_MAX is the (exclusive)
    # stop value of the range.
    for wardnum in range(WARD_MIN, WARD_MAX):
        wardstamp = "Ward {}".format(wardnum)
        print("Getting data for {}".format(wardstamp))
        pop = _fetch_ward_population(wardnum)
        # Write to file:
        f.write("{}\t{}\n".format(wardstamp, pop))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment