Last active
December 17, 2015 08:49
-
-
Save TkTech/5582408 to your computer and use it in GitHub Desktop.
crawl lcbo in one titanic go.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
""" | |
Quick-and-dirty script to crawl the LCBO product search and dump each product's store inventory.
""" | |
import sys | |
import urlparse | |
import requests | |
import lxml.html | |
def lcbo_crawl_inventory(sku):
    """
    Fetch the LCBO inventory page for one product and scrape it into a dict.

    :param sku: item number, sent as the ``itemNumber`` query parameter.
    :returns: dict holding the ``title``, ``price``, ``sku``, ``image`` and
              ``url`` of the product plus ``inventory``, a list with one
              dict per table row (``city``, ``address``, ``intersection``,
              ``phone``, ``date``, ``qty``).
    """
    # Product-level values are read from form inputs (and one <img>).
    product_queries = (
        ('title', 'string(//input[@name="itemName"]/@value)'),
        ('price', 'string(//input[@name="price"]/@value)'),
        ('sku', 'string(//input[@name="itemNumber"]/@value)'),
        ('image', 'string(//div[@id="image_holder"]/img/@src)'),
    )
    # Per-row values: each cell is a link whose class names its column.
    row_queries = (
        ('city', 'string(.//a[@class="item-details-col1"])'),
        ('address', 'string(.//a[@class="item-details-col2"])'),
        ('intersection', 'string(.//a[@class="item-details-col3"])'),
        ('phone', 'string(.//a[@class="item-details-col4"])'),
        ('date', 'string(.//a[@class="item-details-col5"])'),
        ('qty', 'string(.//a[@class="item-details-col6"])'),
    )

    endpoint = (
        'http://www.lcbo.com/lcbo-ear/lcbo/product/inventory/'
        'searchResults.do'
    )
    response = requests.get(
        endpoint,
        params={'language': 'EN', 'itemNumber': sku}
    )

    page = lxml.html.fromstring(response.content)
    page.make_links_absolute('http://www.lcbo.com/')

    # Climb from the "col01" anchor to its table and take every row but
    # the first one (the header).
    store_rows = page.xpath('//a[@id="col01"]/../../../tr[position()>1]')

    product = {}
    for key, query in product_queries:
        product[key] = page.xpath(query)
    # The final URL of the request, as reported by requests.
    product['url'] = response.url

    inventory = []
    for row in store_rows:
        store = {}
        for key, query in row_queries:
            store[key] = row.xpath(query)
        inventory.append(store)
    product['inventory'] = inventory

    return product
def lcbo_search(page=1, limit=100000):
    """
    Run an LCBO product search and crawl the inventory of every result.

    This is a generator: the search request is only sent once iteration
    starts, and each product's inventory page is fetched lazily as the
    caller advances.

    :param page: results page to request (sent as ``page``).
    :param limit: results per page (sent as ``resultsPerPage``).
    :returns: generator yielding the dict built by
              ``lcbo_crawl_inventory`` for each result link that carries
              an ``itemNumber`` query parameter.
    """
    r = requests.get(
        'http://www.lcbo.com/lcbo-ear/lcbo/product/searchResults.do',
        params=dict(page=page, resultsPerPage=limit)
    )

    html = lxml.html.fromstring(r.content)
    html.make_links_absolute('http://www.lcbo.com/')

    product_links = html.xpath(
        '//a[@class="item-details-col1"]/@href'
    )

    for link in product_links:
        query = urlparse.parse_qs(urlparse.urlparse(link).query)
        # parse_qs maps every key to a *list* of values, so unwrap the
        # first one to hand the crawler a plain SKU string rather than
        # a one-element list.
        item_numbers = query.get('itemNumber')
        if not item_numbers:
            # A link without an item number cannot be crawled; skip it
            # instead of aborting the whole crawl with a KeyError.
            continue
        yield lcbo_crawl_inventory(item_numbers[0])
def main(argv):
    """
    Pretty-print every product yielded by a search limited to one result
    per page.

    :param argv: command-line arguments; accepted but not used.
    """
    from pprint import pprint

    for record in lcbo_search(limit=1):
        pprint(record)
# Script entry point: pass the command line to main() and exit with
# whatever it returns.
if __name__ == '__main__':
    exit_status = main(sys.argv)
    sys.exit(exit_status)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment