crimeminister · October 24, 2011 06:10
diff --git a/wardscrape.py b/wardscrape.py
 # wardscrape.py
 # encoding=utf-8

 """
 Fetch a web page about each ward in the City of Toronto and extract
 the population of that ward. Store the data in a tab-separated output
 file.
 """

 from BeautifulSoup import BeautifulSoup
 import urllib2
 import sys

 # The URLs from which we fetch HTML pages look like this template, but
 # with the '{}' replaced by a valid Toronto ward number.
 URL_TEMPLATE = 'http://www.toronto.ca/wards2000/ward{}.htm'

 # The range of ward numbers to fetch. WARD_MAX is handed to range() as
 # its exclusive upper bound, so the wards actually fetched run from
 # WARD_MIN through WARD_MAX - 1 (that is, 2 through 44).
 # NOTE(review): confirm that 44 is the highest ward number on the site.
 #
 # (NOTE: I skipped 1 in the cycle because the code structure on Ward
 # 1's webpage is an outlier relative to the others. Please don't use
 # that page as a reference).
 WARD_MIN = 2
 WARD_MAX = 45

 # Create (or overwrite) a file called "ward_pop.tsv" and write one
 # "Ward <number><TAB><population>" line into it per ward.

 with open('ward_pop.tsv', 'w') as f:

     # Cycle through the ward numbers, loading and parsing the ward
     # profile webpages one at a time.

     for wardnum in range(WARD_MIN, WARD_MAX):

         wardstamp = "Ward {}".format(wardnum)
         print("Getting data for {}".format(wardstamp))
         url = URL_TEMPLATE.format(wardnum)

         # Parse the page, then close the HTTP response explicitly so
         # the script does not leave one open connection behind per
         # ward. The 'finally' also closes it if parsing fails.
         page = urllib2.urlopen(url)
         try:
             soup = BeautifulSoup(page)
         finally:
             page.close()

         # Extract the sentence with the ward's total population.
         #
         # Our element is the second <p> on the page and has this
         # structure:
         #   <p><font ...>This is our sentence.</font></p>
         # Pull out the element contents using the 'text' attribute.
         # A page that deviates from this layout makes the script stop
         # with an exception rather than write a wrong value.
         paragraphs = soup.findAll('p')
         sentence = paragraphs[1].font.text

         # Split the sentence up into an array of words; the population
         # is the sixth whitespace-separated word.
         words = sentence.split()
         pop = str(words[5])

         # Write to file, reusing the "Ward N" label built above.
         line = "{}\t{}\n".format(wardstamp, pop)
         f.write(line)
	# wardscrape.py
	# encoding=utf-8

	"""
	Fetch a web page about each ward in the City of Toronto and extract
	the population of that ward. Store the data in a tab-separated output
	file.
	"""

	from BeautifulSoup import BeautifulSoup
	import urllib2
	import sys

	# The URLs from which we fetch HTML pages look like this template, but
	# with the '{}' replaced by a valid Toronto ward number.
	URL_TEMPLATE = 'http://www.toronto.ca/wards2000/ward{}.htm'

	# The range of ward numbers to fetch. WARD_MAX is handed to range() as
	# its exclusive upper bound, so the wards actually fetched run from
	# WARD_MIN through WARD_MAX - 1 (that is, 2 through 44).
	# NOTE(review): confirm that 44 is the highest ward number on the site.
	#
	# (NOTE: I skipped 1 in the cycle because the code structure on Ward
	# 1's webpage is an outlier relative to the others. Please don't use
	# that page as a reference).
	WARD_MIN = 2
	WARD_MAX = 45

	# Create (or overwrite) a file called "ward_pop.tsv" and write one
	# "Ward <number><TAB><population>" line into it per ward.

	with open('ward_pop.tsv', 'w') as f:

	    # Cycle through the ward numbers, loading and parsing the ward
	    # profile webpages one at a time.

	    for wardnum in range(WARD_MIN, WARD_MAX):

	        wardstamp = "Ward {}".format(wardnum)
	        print("Getting data for {}".format(wardstamp))
	        url = URL_TEMPLATE.format(wardnum)

	        # Parse the page, then close the HTTP response explicitly so
	        # the script does not leave one open connection behind per
	        # ward. The 'finally' also closes it if parsing fails.
	        page = urllib2.urlopen(url)
	        try:
	            soup = BeautifulSoup(page)
	        finally:
	            page.close()

	        # Extract the sentence with the ward's total population.
	        #
	        # Our element is the second <p> on the page and has this
	        # structure:
	        #   <p><font ...>This is our sentence.</font></p>
	        # Pull out the element contents using the 'text' attribute.
	        # A page that deviates from this layout makes the script stop
	        # with an exception rather than write a wrong value.
	        paragraphs = soup.findAll('p')
	        sentence = paragraphs[1].font.text

	        # Split the sentence up into an array of words; the population
	        # is the sixth whitespace-separated word.
	        words = sentence.split()
	        pop = str(words[5])

	        # Write to file, reusing the "Ward N" label built above.
	        line = "{}\t{}\n".format(wardstamp, pop)
	        f.write(line)