Created
May 9, 2012 13:51
-
-
Save davidwtbuxton/2644638 to your computer and use it in GitHub Desktop.
Scrape scotch prices, save as CSV
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import requests | |
from BeautifulSoup import BeautifulSoup | |
from datetime import datetime | |
# http://www.reddit.com/r/learnpython/comments/tczgd/help_me_improve_this_code_webscraping_liquor/ | |
# http://pastebin.com/TinHnCSp | |
# http://www.specsonline.com/cgi-bin/snf?body=/cgi-bin/prodlist&index=Liquors%7C255%7CSCOTCH+MALTS | |
# Product listing page that scrape() reads by default.  The ``index`` query
# value is URL-encoded: %7C is '|' and '+' is a space, i.e.
# "Liquors|255|SCOTCH MALTS".
URL = r'http://www.specsonline.com/cgi-bin/snf?body=/cgi-bin/prodlist&index=Liquors%7C255%7CSCOTCH+MALTS'
def read_csv(filename):
    """Read the scotch names and the price history from an open CSV file.

    Despite its name, ``filename`` must be an open file object (or any other
    iterable of CSV lines), not a path: it is handed straight to
    ``csv.reader``.  The parameter name is kept for backward compatibility.

    The first row is the header; its first cell is ignored and the remaining
    cells are the scotch names.  Every following row holds a timestamp in the
    first column and one price per scotch in the remaining columns.

    Returns a tuple ``(scotches, prices_list)`` where ``scotches`` is the list
    of names from the header and ``prices_list`` is a list of
    ``(time, prices)`` tuples, ``prices`` being a dict mapping scotch name to
    the price string found in that row.  Returns ``([], [])`` when the input
    has no rows at all.
    """
    rows = csv.reader(filename)

    # Get the first row, which holds the names of the scotches.
    try:
        scotches = next(rows)[1:]
    except StopIteration:
        # There is no first row! Return empty lists.
        return [], []

    prices_list = []
    # Read the time and the prices from the other rows.
    for row in rows:
        # csv.reader yields an empty list for a blank line; indexing row[0]
        # on it would raise IndexError, so skip such rows.
        if not row:
            continue
        # First column is the time, the other columns are prices.  zip()
        # stops at the shorter sequence, so a short row simply has no entry
        # for the trailing scotches.
        prices_list.append((row[0], dict(zip(scotches, row[1:]))))

    return scotches, prices_list
def scrape(url):
    """Scrape HTML for scotch prices.

    Downloads ``url`` and looks at every <pre> element except the first.
    Only the first text line of each element is used; it is expected to look
    like::

        TULLIBARDINE MALT * AGED OAK 750ML 35.78 199.52 [ 6]

    i.e. the name, then the size, the price, the case price and a bracketed
    count.  Elements whose first line does not have at least the three
    trailing columns, or that have no single text string, are skipped.

    Returns a dictionary mapping 'NAME@SIZE' to the price (as a string).

    Raises whatever ``requests`` raises for connection problems, timeouts and
    HTTP error statuses, so that an error page is never mistaken for an empty
    price list.
    """
    # Requires third-party requests module.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content)

    data = {}
    # Ignore the first <pre> element.
    for pre in soup.findAll('pre')[1:]:
        text = pre.string
        if text is None:
            # .string is None when the element contains nested tags rather
            # than one plain string; there is nothing to parse then.
            continue

        line = text.split('\n')[0]  # Ignore lower line
        # Drop the trailing bracketed count ('[ 6]', '[12]', ...).  Cutting
        # at the last '[' works whatever the width of the count and whether
        # or not the line has trailing whitespace.
        head, bracket, _ = line.rpartition('[')
        if bracket:
            line = head

        cols = line.split()
        if len(cols) < 3:
            # Not a product line: size, price and case price are missing.
            continue

        # Final 3 columns are size, price and case price; the case price is
        # not recorded.
        size, price = cols[-3:-1]
        name = ' '.join(cols[:-3])  # All except final 3 columns
        data[name + '@' + size] = price  # Key is made from name and size

    return data
# u'TULLIBARDINE MALT * AGED OAK 750ML 35.78 199.52 [ 6]\n 6/CS [SCOTLAND]' | |
def main(argv):
    """Update a CSV file with prices for scotch scraped from the Web.

    ``argv`` is the command-line argument list; ``argv[1]`` must be the path
    of the CSV file.  Rows already in the file (if it exists) are kept, the
    header is rebuilt as the sorted union of the old and the newly scraped
    scotch names, and one new row stamped with the current local time is
    appended.

    Raises SystemExit with a usage message when the CSV path is missing.
    """
    import errno

    # The CSV file must be the first positional argument.
    if len(argv) < 2:
        prog = argv[0] if argv else 'scotch_prices'
        raise SystemExit('usage: %s CSV_FILE' % prog)
    csv_name = argv[1]

    old_data = []
    scotches = set()

    # Get existing data from CSV file (if it exists).
    # NOTE: 'rU' here and 'wb' below are the Python 2 file modes for the csv
    # module; on Python 3 the csv documentation asks for newline='' instead.
    try:
        with open(csv_name, 'rU') as fh:
            old_scotches, old_data = read_csv(fh)
    except IOError as err:
        # Only a missing file means "start from scratch".  Any other read
        # failure must not be ignored, because the file is rewritten below
        # and its history would be lost.
        if err.errno != errno.ENOENT:
            raise
    else:
        scotches.update(old_scotches)

    # Get new prices.  This happens before the file is opened for writing,
    # so a failed download leaves the existing CSV untouched.
    new_prices = scrape(URL)

    # Add any new scotch names to the existing list of names.
    scotches.update(new_prices)

    # Put the scotch names in alphabetical order in a list.
    scotches = sorted(scotches)

    with open(csv_name, 'wb') as fh:
        writer = csv.writer(fh)

        # First row is the names of the scotches.
        writer.writerow([''] + scotches)

        # Write the old price data to the CSV; scotches unknown at the time
        # of an old row get an empty cell.
        for dt, prices in old_data:
            writer.writerow([dt] + [prices.get(s, '') for s in scotches])

        # Write the new price data to the CSV.
        new_dt = datetime.now()
        writer.writerow([new_dt.isoformat()] + [new_prices.get(s, '') for s in scotches])
if __name__ == "__main__":
    # Run as a script: hand the full argument list (program name included)
    # to main(), which expects the CSV path at index 1.
    import sys

    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment