Skip to content

Instantly share code, notes, and snippets.

@harshvb7
Created October 1, 2017 08:02
Show Gist options
  • Save harshvb7/8eeab43d5196449d0bd6d255d49ce76a to your computer and use it in GitHub Desktop.
Save harshvb7/8eeab43d5196449d0bd6d255d49ce76a to your computer and use it in GitHub Desktop.
import datetime
import re
import urllib
from django.conf import settings
import requests
from lxml import html
from .models import Product
def db_insert(products):
    """Create or update one Product row per scraped product dict.

    A product is matched to an existing row by its ``sku``. When a row
    exists only ``price``, ``old_price``, ``discount`` and ``last_updated``
    are refreshed; otherwise a new row is created from every key of the dict.

    Args:
        products: iterable of dicts with the keys ``name``, ``sku``,
            ``category``, ``sub_category``, ``partner``, ``price``,
            ``old_price``, ``discount``, ``image``, ``link`` and
            ``description``.

    Returns:
        A two-item list ``[created, updated]`` with the number of rows
        created and the number of rows updated.
    """
    created = 0
    updated = 0
    for prod in products:
        # One timestamp for both paths.  ``datetime`` is imported as a module
        # here, so the class has to be reached as ``datetime.datetime``.
        now = datetime.datetime.utcnow()
        try:
            p = Product.objects.get(sku=prod.get('sku'))
        except Product.DoesNotExist:
            # Only a genuinely missing row leads to creation; any other error
            # (bad dict key, database failure) is allowed to surface instead
            # of being mistaken for "not found".
            Product.objects.create(
                name=prod['name'],
                sku=prod['sku'],
                category=prod['category'],
                sub_category=prod['sub_category'],
                partner=prod['partner'],
                price=prod['price'],
                old_price=prod['old_price'],
                discount=prod['discount'],
                image=prod['image'],
                link=prod['link'],
                description=prod['description'],
                last_updated=now,
            )
            created += 1
        else:
            p.price = prod['price']
            p.old_price = prod['old_price']
            p.discount = prod['discount']
            p.last_updated = now
            # Without save() the refreshed values never reach the database.
            p.save()
            updated += 1
    return [created, updated]
class ScrapeAmazon(object):
    """Scrape products from the listing page configured on a scraper record.

    The scraper object passed in is read for ``url``, ``qnty``, ``sub_cat``
    (with ``name`` and ``category``) and ``partner`` (with ``domain``), and
    is written to through ``last_scraped`` and ``save()``.
    """

    def __init__(self, scraper):
        self.scraper = scraper
        self.products = []

    def updated_scraped(self):
        """Stamp the scraper with the current UTC time and save it."""
        # ``datetime`` is the module in this file, so ``datetime.now()`` would
        # raise AttributeError; the class is ``datetime.datetime``.
        self.scraper.last_scraped = datetime.datetime.utcnow()
        self.scraper.save()

    def scrape(self):
        """Fetch the listing page and collect one dict per result item.

        Stops after the item at index ``scraper.qnty - 1`` has been handled.
        Items that carry no ``data-asin`` attribute are skipped.

        Returns:
            ``self.products``, the list of product dicts collected so far.
        """
        # NOTE(review): requests.get is called without a timeout, as before.
        listing = html.fromstring(requests.get(self.scraper.url).content)
        count = self.scraper.qnty - 1
        for num, item in enumerate(listing.cssselect('li.s-result-item')):
            product = self._scrape_item(item)
            if product is not None:
                self.products.append(product)
            if num == count:
                break
        return self.products

    @staticmethod
    def _first_text(tree, selector):
        """Return the stripped text of the first match, or None if no match."""
        matches = tree.cssselect(selector)
        if not matches:
            return None
        return matches[0].text_content().strip()

    @staticmethod
    def _whole_digits(text):
        """Return the digits found before the first '.' in *text*."""
        return re.sub(r"[^0-9]", "", text.split('.')[0])

    def _download_image(self, sku, src):
        """Save the image at *src* under MEDIA_ROOT/products; best effort.

        Returns the ``products/<sku>.<ext>`` path stored on the product, or
        None when there is no source URL or the download fails.
        """
        if not src:
            return None
        import os
        try:
            from urllib.request import urlretrieve  # Python 3
        except ImportError:
            from urllib import urlretrieve  # Python 2
        image_name = sku + '.' + src.split('.')[-1]
        # MEDIA_URL is the public URL prefix; the file itself has to be
        # written below MEDIA_ROOT, the filesystem directory.
        target = os.path.join(settings.MEDIA_ROOT, 'products', image_name)
        try:
            urlretrieve(src, target)
        except (IOError, OSError, ValueError):
            return None
        return 'products/' + image_name

    def _scrape_item(self, item):
        """Build the product dict for one listing item, or None to skip it."""
        sku = item.attrib.get('data-asin')
        if not sku:
            return None
        link = 'dp/' + sku + '/'

        titles = item.cssselect('h2.s-access-title')
        if titles:
            # Text is kept as unicode; encoding to bytes first would corrupt
            # the digit extraction below on Python 3.
            name = titles[0].text_content()
        else:
            name = self.scraper.sub_cat.name

        product_url = self.scraper.partner.domain + link
        print(product_url)
        page = html.fromstring(requests.get(product_url).content)

        # Sale price wins; otherwise fall back to the regular price.
        raw_price = (self._first_text(page, 'span#priceblock_saleprice')
                     or self._first_text(page, 'span#priceblock_ourprice'))
        # Always bound, so a page without a price can neither raise nor reuse
        # the previous item's value.
        price = 0
        if raw_price:
            # For a range such as "10 - 20" only the lower bound is kept.
            price = self._whole_digits(raw_price.split('-')[0]) or 0

        # Struck-through price and savings are read from the product page of
        # this item, not from the shared listing page.
        old_price = 0
        discount = 0
        old_text = self._first_text(page, 'td.a-text-strike')
        if old_text is not None:
            old_price = self._whole_digits(old_text) or 0
            savings = self._first_text(page, 'tr#regularprice_savings')
            if savings is not None:
                # The percentage is taken from the last parenthesised part.
                discount = re.sub(r"[^0-9]", "", savings.split('(')[-1]) or 0

        # A missing description or image no longer aborts the whole scrape.
        description = self._first_text(page, 'div#feature-bullets') or ''
        images = page.cssselect('li.itemNo0 img')
        image = None
        if images:
            image = self._download_image(sku, images[0].attrib.get('src'))

        return {
            'name': name,
            'sku': sku,
            'category': self.scraper.sub_cat.category,
            'sub_category': self.scraper.sub_cat,
            'partner': self.scraper.partner,
            'price': price,
            'old_price': old_price,
            'discount': discount,
            'image': image,
            'link': link,
            'description': description,
        }
def scrape_amazon(scraper):
    """Scrape Amazon for *scraper*, store the products, mark it as scraped.

    The products returned by ``ScrapeAmazon.scrape`` are handed to
    ``db_insert``; afterwards ``updated_scraped`` is called on the same
    ``ScrapeAmazon`` instance.

    Returns:
        Whatever ``db_insert`` returned for the scraped products.
    """
    amazon = ScrapeAmazon(scraper)
    counts = db_insert(amazon.scrape())
    amazon.updated_scraped()
    return counts
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment