Skip to content

Instantly share code, notes, and snippets.

@TkTech
Last active December 17, 2015 08:49
Show Gist options
  • Save TkTech/5582408 to your computer and use it in GitHub Desktop.
Save TkTech/5582408 to your computer and use it in GitHub Desktop.
crawl lcbo in one titanic go.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Quick-and-dirty script to dump LCBO product and per-store inventory data.
"""
import sys
import urlparse
import requests
import lxml.html
def lcbo_crawl_inventory(sku):
    """Scrape the LCBO inventory search-results page for one item.

    Requests the inventory ``searchResults.do`` page with ``itemNumber``
    set to *sku* and ``language`` set to ``EN``, then pulls values out of
    the returned HTML with XPath.

    Returns a dict with the keys ``title``, ``price``, ``sku``, ``image``,
    ``url`` (the final request URL) and ``inventory``. ``inventory`` is a
    list with one dict per table row, keyed ``city``, ``address``,
    ``intersection``, ``phone``, ``date`` and ``qty``. Every scraped value
    is the result of an XPath ``string(...)`` call, so a missing node
    gives an empty string rather than an error.
    """
    response = requests.get(
        'http://www.lcbo.com/lcbo-ear/lcbo/product/inventory/'
        'searchResults.do',
        params={'language': 'EN', 'itemNumber': sku}
    )
    page = lxml.html.fromstring(response.content)
    page.make_links_absolute('http://www.lcbo.com/')

    # Product-level values, looked up from the document root.
    product_queries = (
        ('title', 'string(//input[@name="itemName"]/@value)'),
        ('price', 'string(//input[@name="price"]/@value)'),
        ('sku', 'string(//input[@name="itemNumber"]/@value)'),
        ('image', 'string(//div[@id="image_holder"]/img/@src)'),
    )
    # Per-store values, looked up relative to each inventory row.
    store_queries = (
        ('city', 'string(.//a[@class="item-details-col1"])'),
        ('address', 'string(.//a[@class="item-details-col2"])'),
        ('intersection', 'string(.//a[@class="item-details-col3"])'),
        ('phone', 'string(.//a[@class="item-details-col4"])'),
        ('date', 'string(.//a[@class="item-details-col5"])'),
        ('qty', 'string(.//a[@class="item-details-col6"])'),
    )

    # The inventory table is located through the anchor with id "col01";
    # the first row is the header, so only rows after it are kept.
    store_rows = page.xpath('//a[@id="col01"]/../../../tr[position()>1]')

    product = {}
    for key, query in product_queries:
        product[key] = page.xpath(query)
    product['url'] = response.url
    product['inventory'] = [
        {key: row.xpath(query) for key, query in store_queries}
        for row in store_rows
    ]
    return product
def lcbo_search(page=1, limit=100000):
    """Yield a scraped product dict for each product on a search page.

    Requests the product ``searchResults.do`` page with ``page`` and
    ``resultsPerPage`` set from *page* and *limit*, collects the ``href``
    of every ``a.item-details-col1`` link, and for each one calls
    ``lcbo_crawl_inventory`` with the ``itemNumber`` found in the link's
    query string.

    This is a generator: each product's inventory page is fetched only
    when the consumer asks for the next item. Links whose query string
    has no ``itemNumber`` are skipped.
    """
    r = requests.get(
        'http://www.lcbo.com/lcbo-ear/lcbo/product/searchResults.do',
        params=dict(page=page, resultsPerPage=limit)
    )
    html = lxml.html.fromstring(r.content)
    html.make_links_absolute('http://www.lcbo.com/')
    product_links = html.xpath(
        '//a[@class="item-details-col1"]/@href'
    )
    for link in product_links:
        query = urlparse.parse_qs(urlparse.urlparse(link).query)
        # parse_qs maps every key to a *list* of values, so pass the
        # first value on as the sku instead of the whole list.
        item_numbers = query.get('itemNumber')
        if not item_numbers:
            # Not a product link we can follow; skipping it keeps one
            # odd link from aborting the whole crawl with a KeyError.
            continue
        yield lcbo_crawl_inventory(item_numbers[0])
def main(argv):
    """Pretty-print every product yielded by ``lcbo_search(limit=1)``.

    *argv* is accepted for the conventional ``main(sys.argv)`` call but
    is not read. Returns ``None``.
    """
    from pprint import pprint as dump

    results = lcbo_search(limit=1)
    for record in results:
        dump(record)
# Script entry point: run the crawl and hand main()'s result to sys.exit.
if __name__ == '__main__':
    exit_status = main(sys.argv)
    sys.exit(exit_status)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment