davidwtbuxton · May 9, 2012 13:51
diff --git a/scotch.py b/scotch.py
 import csv
 import requests
 from BeautifulSoup import BeautifulSoup
 from datetime import datetime


 # http://www.reddit.com/r/learnpython/comments/tczgd/help_me_improve_this_code_webscraping_liquor/
 # http://pastebin.com/TinHnCSp
 # http://www.specsonline.com/cgi-bin/snf?body=/cgi-bin/prodlist&index=Liquors%7C255%7CSCOTCH+MALTS


 # Page that main() hands to scrape(): the 'SCOTCH MALTS' entry of the
 # Liquors index on specsonline.com.
 URL = r'http://www.specsonline.com/cgi-bin/snf?body=/cgi-bin/prodlist&index=Liquors%7C255%7CSCOTCH+MALTS'


 def read_csv(filename):
    """Read scotch names and price history from an open CSV file.

    Despite its name, ``filename`` is not a path: it is passed straight to
    ``csv.reader``, so it must be an open file (or any iterable of lines).

    The first row holds the scotch names in every column after the first.
    Each later row holds a time in its first column, followed by prices in
    the same column order as the names.

    Returns a ``(scotches, prices_list)`` tuple.  ``scotches`` is the list of
    names from the first row.  ``prices_list`` is a list of ``(time, prices)``
    tuples, where ``prices`` is a dictionary pairing the names with that
    row's cells (pairing stops at whichever of the two is shorter).  Both
    lists are empty when the file has no rows at all.
    """
    rows = csv.reader(filename)

    # Get the first row which is the name of the scotches. Its first cell
    # belongs to the time column and is dropped.
    try:
        scotches = next(rows)[1:]
    except StopIteration:
        # There is no first row! Return empty lists.
        return [], []

    prices_list = []

    # Read the time and the prices from the other rows.
    for row in rows:
        # csv.reader yields an empty list for a blank line; indexing it
        # would raise IndexError, so such rows are skipped.
        if not row:
            continue
        # First column is the time.
        dt = row[0]
        # Other columns are prices for scotches.
        prices = dict(zip(scotches, row[1:]))
        prices_list.append((dt, prices))

    return scotches, prices_list


 def scrape(url):
    """Scrape the HTML page at ``url`` for scotch prices.

    Every ``<pre>`` element after the first is treated as one product.  Only
    the first line of its text is used; it is expected to look like::

        TULLIBARDINE MALT * AGED OAK   750ML   35.78  199.52 [ 6]

    The trailing four characters (the bracketed number) are cut off, the
    last three remaining columns are read as size, price and a third value
    that is not used, and everything before them is the name.

    Returns a dictionary mapping ``name + '@' + size`` to the price, kept as
    the string found on the page.  ``<pre>`` elements that do not fit this
    layout are skipped.

    Raises whatever ``requests`` raises when the request fails, times out or
    returns an HTTP error status.
    """
    # Requires third-party requests module. Without a timeout a stalled
    # server would block the script indefinitely.
    resp = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing the error page as a
    # listing with no products.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content)

    data = {}

    # Ignore the first <pre> element.
    for pre in soup.findAll('pre')[1:]:
        text = pre.string
        # .string is None when the element is not a single piece of text.
        if text is None:
            continue
        line = text.split('\n')[0] # Ignore lower line
        cols = line[:-4].split() # Chop off final '[12]'
        # A product line has at least the three trailing columns.
        if len(cols) < 3:
            continue
        size, price = cols[-3], cols[-2] # Last of the 3 columns is unused
        name = ' '.join(cols[:-3]) # All except final 3 columns
        label = name + '@' + size # Make key from name and size
        data[label] = price

    return data

        
 # u'TULLIBARDINE MALT * AGED OAK         750ML   35.78  199.52 [ 6]\n    6/CS  [SCOTLAND]'
        

 def main(argv):
    """Update a CSV file with prices for scotch scraped from the Web.

    ``argv`` is a ``sys.argv``-style list; ``argv[1]`` is the path of the CSV
    file.  Rows already in the file are read (a missing file counts as
    empty), new prices are scraped from ``URL``, and the file is rewritten
    with one column per scotch name seen so far: first the old rows, then
    one new row stamped with ``datetime.now()``.  Cells for which a row has
    no price are left empty.
    """
    import errno

    # The CSV file must be the first positional argument.
    csv_name = argv[1]

    old_data = []
    scotches = set()

    # Get existing data from CSV file (if it exists).
    try:
        # Close the read handle before the same path is reopened for writing.
        with open(csv_name, 'rU') as fh:
            old_scotches, old_data = read_csv(fh)
    except IOError as err:
        # Only a missing file means "no history yet". Any other failure must
        # propagate, otherwise the file would be rewritten below without
        # its old rows.
        if err.errno != errno.ENOENT:
            raise
    else:
        scotches.update(old_scotches)

    # Get new prices
    new_prices = scrape(URL)
    # Add any new scotch names to the existing list of names.
    scotches.update(new_prices)
    # Put the scotch names in alphabetical order in a list.
    scotches = sorted(scotches)

    with open(csv_name, 'wb') as fh:
        writer = csv.writer(fh)
        # First row is the names of the scotches.
        writer.writerow([''] + scotches)

        # Write the old price data to the CSV.
        for dt, prices in old_data:
            writer.writerow([dt] + [prices.get(s, '') for s in scotches])

        # Write the new price data to the CSV.
        new_dt = datetime.now()
        writer.writerow([new_dt.isoformat()] + [new_prices.get(s, '') for s in scotches])


 if __name__ == '__main__':
    # Script entry point: sys is needed only to hand the command-line
    # arguments to main().
    import sys
    main(sys.argv)
	import csv
	import requests
	from BeautifulSoup import BeautifulSoup
	from datetime import datetime


	# http://www.reddit.com/r/learnpython/comments/tczgd/help_me_improve_this_code_webscraping_liquor/
	# http://pastebin.com/TinHnCSp
	# http://www.specsonline.com/cgi-bin/snf?body=/cgi-bin/prodlist&index=Liquors%7C255%7CSCOTCH+MALTS


	# Page that main() hands to scrape(): the 'SCOTCH MALTS' entry of the
	# Liquors index on specsonline.com.
	URL = r'http://www.specsonline.com/cgi-bin/snf?body=/cgi-bin/prodlist&index=Liquors%7C255%7CSCOTCH+MALTS'


	def read_csv(filename):
	    """Read scotch names and price history from an open CSV file.

	    Despite its name, ``filename`` is not a path: it is passed straight to
	    ``csv.reader``, so it must be an open file (or any iterable of lines).

	    The first row holds the scotch names in every column after the first.
	    Each later row holds a time in its first column, followed by prices in
	    the same column order as the names.

	    Returns a ``(scotches, prices_list)`` tuple.  ``scotches`` is the list
	    of names from the first row.  ``prices_list`` is a list of
	    ``(time, prices)`` tuples, where ``prices`` is a dictionary pairing the
	    names with that row's cells (pairing stops at whichever of the two is
	    shorter).  Both lists are empty when the file has no rows at all.
	    """
	    rows = csv.reader(filename)

	    # Get the first row which is the name of the scotches. Its first cell
	    # belongs to the time column and is dropped.
	    try:
	        scotches = next(rows)[1:]
	    except StopIteration:
	        # There is no first row! Return empty lists.
	        return [], []

	    prices_list = []

	    # Read the time and the prices from the other rows.
	    for row in rows:
	        # csv.reader yields an empty list for a blank line; indexing it
	        # would raise IndexError, so such rows are skipped.
	        if not row:
	            continue
	        # First column is the time.
	        dt = row[0]
	        # Other columns are prices for scotches.
	        prices = dict(zip(scotches, row[1:]))
	        prices_list.append((dt, prices))

	    return scotches, prices_list


	def scrape(url):
	    """Scrape the HTML page at ``url`` for scotch prices.

	    Every ``<pre>`` element after the first is treated as one product.
	    Only the first line of its text is used; it is expected to look like::

	        TULLIBARDINE MALT * AGED OAK 750ML 35.78 199.52 [ 6]

	    The trailing four characters (the bracketed number) are cut off, the
	    last three remaining columns are read as size, price and a third value
	    that is not used, and everything before them is the name.

	    Returns a dictionary mapping ``name + '@' + size`` to the price, kept
	    as the string found on the page.  ``<pre>`` elements that do not fit
	    this layout are skipped.

	    Raises whatever ``requests`` raises when the request fails, times out
	    or returns an HTTP error status.
	    """
	    # Requires third-party requests module. Without a timeout a stalled
	    # server would block the script indefinitely.
	    resp = requests.get(url, timeout=30)
	    # Fail loudly on an HTTP error instead of parsing the error page as a
	    # listing with no products.
	    resp.raise_for_status()
	    soup = BeautifulSoup(resp.content)

	    data = {}

	    # Ignore the first <pre> element.
	    for pre in soup.findAll('pre')[1:]:
	        text = pre.string
	        # .string is None when the element is not a single piece of text.
	        if text is None:
	            continue
	        line = text.split('\n')[0] # Ignore lower line
	        cols = line[:-4].split() # Chop off final '[12]'
	        # A product line has at least the three trailing columns.
	        if len(cols) < 3:
	            continue
	        size, price = cols[-3], cols[-2] # Last of the 3 columns is unused
	        name = ' '.join(cols[:-3]) # All except final 3 columns
	        label = name + '@' + size # Make key from name and size
	        data[label] = price

	    return data


	# u'TULLIBARDINE MALT * AGED OAK 750ML 35.78 199.52 [ 6]\n 6/CS [SCOTLAND]'


	def main(argv):
	    """Update a CSV file with prices for scotch scraped from the Web.

	    ``argv`` is a ``sys.argv``-style list; ``argv[1]`` is the path of the
	    CSV file.  Rows already in the file are read (a missing file counts as
	    empty), new prices are scraped from ``URL``, and the file is rewritten
	    with one column per scotch name seen so far: first the old rows, then
	    one new row stamped with ``datetime.now()``.  Cells for which a row has
	    no price are left empty.
	    """
	    import errno

	    # The CSV file must be the first positional argument.
	    csv_name = argv[1]

	    old_data = []
	    scotches = set()

	    # Get existing data from CSV file (if it exists).
	    try:
	        # Close the read handle before the same path is reopened for
	        # writing.
	        with open(csv_name, 'rU') as fh:
	            old_scotches, old_data = read_csv(fh)
	    except IOError as err:
	        # Only a missing file means "no history yet". Any other failure
	        # must propagate, otherwise the file would be rewritten below
	        # without its old rows.
	        if err.errno != errno.ENOENT:
	            raise
	    else:
	        scotches.update(old_scotches)

	    # Get new prices
	    new_prices = scrape(URL)
	    # Add any new scotch names to the existing list of names.
	    scotches.update(new_prices)
	    # Put the scotch names in alphabetical order in a list.
	    scotches = sorted(scotches)

	    with open(csv_name, 'wb') as fh:
	        writer = csv.writer(fh)
	        # First row is the names of the scotches.
	        writer.writerow([''] + scotches)

	        # Write the old price data to the CSV.
	        for dt, prices in old_data:
	            writer.writerow([dt] + [prices.get(s, '') for s in scotches])

	        # Write the new price data to the CSV.
	        new_dt = datetime.now()
	        writer.writerow([new_dt.isoformat()] + [new_prices.get(s, '') for s in scotches])


	if __name__ == "__main__":
	    # Script entry point: sys is needed only to hand the command-line
	    # arguments to main(). The body is indented under the guard so it
	    # runs only when the file is executed as a script.
	    import sys

	    main(sys.argv)