Created
October 24, 2011 06:10
-
-
Save crimeminister/1308453 to your computer and use it in GitHub Desktop.
Scrape ward population numbers from City of Toronto web pages.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# wardscrape.py
# encoding=utf-8
"""
Fetch a web page about each ward in the City of Toronto and extract
the population of that ward. Store the data in a tab-separated output
file.
"""
import contextlib
import sys
import urllib2

from BeautifulSoup import BeautifulSoup
# The URLs from which we fetch HTML pages look like this template, but
# with the '{}' replaced by a valid Toronto ward number.
URL_TEMPLATE = 'http://www.toronto.ca/wards2000/ward{}.htm'

# Bounds of the Toronto ward numbers to fetch. The scraping loop passes
# these to range(WARD_MIN, WARD_MAX), which excludes its upper bound, so
# the last ward fetched is WARD_MAX - 1.
#
# (NOTE: Ward 1 is skipped because the code structure on Ward 1's
# webpage is an outlier relative to the others. Please don't use that
# page as a reference.)
WARD_MIN = 2
WARD_MAX = 45
def _extract_population(soup):
    """Return the population word from a parsed ward page, or None.

    The population sentence is expected in the second <p> element of
    the page, wrapped in a <font> tag:

        <p><font ...>This is our sentence.</font></p>

    The population is taken to be the sixth whitespace-separated word
    of that sentence. None is returned when the page does not have
    that shape, so the caller can report the ward instead of dying on
    an IndexError or AttributeError.
    """
    paragraphs = soup.findAll('p')
    if len(paragraphs) < 2 or paragraphs[1].font is None:
        return None
    # Pull out the element contents using the 'text' attribute and
    # split the sentence up into a list of words.
    words = paragraphs[1].font.text.split()
    if len(words) < 6:
        return None
    # BeautifulSoup hands back unicode text. Encode it explicitly so a
    # stray non-ASCII character cannot raise UnicodeEncodeError when the
    # value is formatted into a byte string and written out.
    return words[5].encode('utf-8')


# Create a file called "ward_pop.tsv".
with open('ward_pop.tsv', 'w') as f:
    # Cycle through the Toronto ward numbers, loading and parsing the
    # ward profile webpages one at a time.
    # NOTE: range() excludes its upper bound, so the last ward fetched
    # is WARD_MAX - 1.
    for wardnum in range(WARD_MIN, WARD_MAX):
        wardstamp = "Ward {}".format(wardnum)
        print("Getting data for {}".format(wardstamp))
        url = URL_TEMPLATE.format(wardnum)
        # urllib2 responses are not context managers; closing() makes
        # sure the connection is released even if parsing fails.
        with contextlib.closing(urllib2.urlopen(url)) as page:
            soup = BeautifulSoup(page)
        pop = _extract_population(soup)
        if pop is None:
            # One oddly structured page should not discard the data
            # already collected; report it and move on.
            sys.stderr.write(
                "Skipping {}: unexpected page layout at {}\n".format(
                    wardstamp, url))
            continue
        # Write to file: one "Ward N<TAB>population" line per ward.
        f.write("{}\t{}\n".format(wardstamp, pop))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment