A Python web scraper for wish lists on The Book Depository
# A web scraper for wish lists on The Book Depository.
#
# TBD offers a mail notification service for books in a wish list when a
# price drops. Unfortunately, you are notified only if a price drops by 10%
# or more; also, the message you receive contains neither the new, lower
# price nor the old, higher price. Finally, when I got one of those
# notifications, I visited the book page on TBD that very day, only to
# discover that the price had gone up again. Not very effective as a
# service, you know.
#
# So I decided to write a script myself.
#
# The scraper uses lxml to parse HTML documents; this is the only external
# dependency.
#
# Book data is cached in a JSON file in the user's home directory. The
# format is a dictionary mapping each book's title to a 2-tuple of prices:
# first the current price (i.e. the price of the book as of the latest run
# of the script), then the lowest price to date.
#
# Tested with Python 2.6.5.
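#
# As an illustration (with a made-up title and prices), the cache file
# might contain:
#
#   {"The Example Book": [12.99, 10.49]}
#
# meaning the book currently costs 12.99 and its lowest observed price so
# far has been 10.49. JSON has no tuple type, so each 2-tuple is stored as
# a two-element list and unpacks the same way when loaded back.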
import json
import lxml.html
import os.path
from threading import Thread
from Queue import Queue
from urllib2 import urlopen
# Replace CODE and NAME in the wish list URL with your own values.
URL = 'http://www.bookdepository.co.uk/wishlist/CODE/NAME'
# Pagination links carry an explicit page parameter, so normalize the first
# page's URL to the same form to avoid downloading it more than once.
URL += '/?&page=1#pagination'
CACHE = os.path.join(os.path.expanduser('~'), '.bookdepository.json')
# Sentinel price used for books that are currently out of stock.
OUT_OF_STOCK = 9999
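
# For example, with the (made-up) code 'A1B2C3' and name 'alice', the final
# URL would be:
#
#   http://www.bookdepository.co.uk/wishlist/A1B2C3/alice/?&page=1#pagination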
class Scraper(Thread):
    '''Download and parse a single wish list page in its own thread.'''

    def __init__(self, url):
        Thread.__init__(self)
        self.url = url

    def run(self):
        doc = urlopen(self.url)
        self.root = lxml.html.parse(doc).getroot()
        div = self.root.get_element_by_id('account')
        wishlist = div.xpath('.//ul')[0]
        # harvest books in wish list page
        self.books = {}
        for book in wishlist.iterchildren():
            title = book.xpath('div/h3/a')[0].text
            price = book.xpath('.//span[@class="price"]/strong')
            if price:
                # cut the leading currency symbol
                self.books[title] = float(price[0].text[1:])
            else:
                # book is out of stock
                self.books[title] = OUT_OF_STOCK
    def pages(self):
        '''Get references to wish list pages linked from this page.'''
        pagination = self.root.get_element_by_id('pagination')
        pages = pagination.xpath('.//span[contains(@class, "search pageNumber")]')
        # skip the page we are currently on (marked "active")
        return [page.getchildren()[0].attrib['href']
                for page in pages if 'active' not in page.attrib['class']]
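
# The XPath expressions above assume the wish list markup looked roughly
# like the following. This is a sketch reconstructed from the queries, not
# copied from TBD; class names and nesting may have differed in detail:
#
#   <div id="account">
#     <ul>
#       <li>
#         <div><h3><a href="...">Book Title</a></h3></div>
#         <span class="price"><strong>&pound;9.99</strong></span>
#       </li>
#       ...
#     </ul>
#   </div>
#   <div id="pagination">
#     <span class="search pageNumber active"><a href="...">1</a></span>
#     <span class="search pageNumber"><a href="...">2</a></span>
#   </div>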
def scrape():
    current_books = []
    seen_pages = set()
    seen_pages.add(URL)

    def producer(minions, pages):
        # spawn a Scraper thread for each page URL put on the queue
        while len(current_books) < len(seen_pages):
            page = pages.get(True)
            if page is None:
                # sentinel from the consumer: no new pages were discovered
                if pages.empty():
                    break
                continue
            s = Scraper(page)
            s.start()
            minions.put(s)

    def consumer(minions, pages):
        # collect finished scrapers and feed newly discovered pages back
        while len(current_books) < len(seen_pages):
            m = minions.get(True)
            m.join()
            current_books.append(m.books)
            new_pages = set(m.pages()).difference(seen_pages)
            if not new_pages:
                pages.put(None)
                continue
            for page in new_pages:
                seen_pages.add(page)
                pages.put(page)

    minions = Queue()
    pages = Queue()
    pages.put(URL)
    produce = Thread(target=producer, args=(minions, pages))
    consume = Thread(target=consumer, args=(minions, pages))
    produce.start()
    consume.start()
    produce.join()
    consume.join()
    return current_books
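
# scrape() yields one {title: price} dictionary per wish list page, e.g.
# (with made-up titles) [{'Book A': 12.99}, {'Book B': 9999}]; the code
# below flattens them into a single dictionary and compares it against the
# cached prices.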
current_books = {}
for books in scrape():
    current_books.update(books)

cached_books = {}
if os.path.exists(CACHE):
    with open(CACHE) as f:
        cached_books = json.load(f)

books = {}
books_with_dropped_price = []
for title, current_price in current_books.items():
    if title in cached_books:
        old_price, lowest_price = cached_books[title]
        # compare the out-of-stock sentinel by value, not identity
        if current_price <= lowest_price and current_price != OUT_OF_STOCK:
            lowest_price = current_price
            books_with_dropped_price.append((title, current_price))
        books[title] = (current_price, lowest_price)
    else:
        books[title] = (current_price, current_price)

for title, price in books_with_dropped_price:
    print '%s is at its lowest price of %.2f' % (title, price)

with open(CACHE, 'w') as f:
    json.dump(books, f)
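
# A sample run (with a hypothetical file name and book) might look like:
#
#   $ python bookdepository.py
#   The Example Book is at its lowest price of 10.49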