harshvb7 · October 1, 2017 08:02
diff --git a/cool.py b/cool.py
 import datetime
 import re
 import urllib

 from django.conf import settings

 import requests
 from lxml import html

 from .models import Product


 def db_insert(products):
    """Create or refresh a ``Product`` row for every scraped product dict.

    Each item of *products* is a dict carrying the keys ``name``, ``sku``,
    ``category``, ``sub_category``, ``partner``, ``price``, ``old_price``,
    ``discount``, ``image``, ``link`` and ``description``.  When a row with
    the same ``sku`` already exists only its price, old price, discount and
    ``last_updated`` are refreshed; otherwise a new row is created from all
    keys.

    Returns a two-item list ``[created, updated]``.
    """
    created = 0
    updated = 0

    for prod in products:
        # One timestamp per product so both branches stamp rows the same way.
        now = datetime.datetime.utcnow()

        try:
            p = Product.objects.get(sku=prod.get('sku'))
        except Product.DoesNotExist:
            # Only a genuinely missing row triggers an insert; any other
            # failure (duplicate SKUs, database errors) propagates instead
            # of being mistaken for "not found".
            Product.objects.create(
                name=prod['name'],
                sku=prod['sku'],
                category=prod['category'],
                sub_category=prod['sub_category'],
                partner=prod['partner'],
                price=prod['price'],
                old_price=prod['old_price'],
                discount=prod['discount'],
                image=prod['image'],
                link=prod['link'],
                description=prod['description'],
                last_updated=now,
            )
            created += 1
        else:
            p.price = prod['price']
            p.old_price = prod['old_price']
            p.discount = prod['discount']
            p.last_updated = now
            # Without save() the refreshed values never reach the database.
            p.save()
            updated += 1

    return [created, updated]


 class ScrapeAmazon:
    """Collect product data from an Amazon listing page.

    ``scraper`` is the configuration object this class reads ``url``,
    ``qnty``, ``partner.domain`` and ``sub_cat`` from, and whose
    ``last_scraped`` field it updates.  Scraped products accumulate in
    ``self.products`` as plain dicts.
    """

    def __init__(self, scraper):
        self.scraper = scraper
        self.products = []

    def updated_scraped(self):
        """Stamp the scraper with the current time and save it."""
        # The file imports the ``datetime`` module, so the class has to be
        # reached as ``datetime.datetime``.
        self.scraper.last_scraped = datetime.datetime.now()
        self.scraper.save()

    @staticmethod
    def _first_text(tree, selector):
        """Return the stripped text of the first match, or None if none."""
        matches = tree.cssselect(selector)
        if not matches:
            return None
        return matches[0].text_content().strip()

    @staticmethod
    def _digits(text):
        """Return only the digits of *text*, or 0 when it contains none."""
        return re.sub("[^0-9]", "", text) or 0

    def _download_image(self, sku, page):
        """Fetch the first gallery image; return its stored path or None."""
        images = page.cssselect('li.itemNo0 img')
        if not images:
            return None
        src = images[0].attrib.get('src')
        if not src:
            return None

        image_name = sku + '.' + src.split('.')[-1]
        # NOTE(review): MEDIA_URL is normally a URL prefix rather than a
        # filesystem path; confirm whether MEDIA_ROOT was intended here.
        local_url = settings.MEDIA_URL + 'products/' + image_name
        try:
            urllib.urlretrieve(src, local_url)
        except Exception:
            # Best effort: a failed download leaves the product imageless.
            return None
        return 'products/' + image_name

    def _scrape_item(self, item):
        """Build the product dict for one listing entry.

        Returns None when the entry has no ``data-asin`` attribute or when
        its product page shows neither a sale price nor a regular price.
        """
        sku = item.attrib.get('data-asin')
        if not sku:
            return None
        link = 'dp/' + sku + '/'

        titles = item.cssselect('h2.s-access-title')
        if titles:
            name = titles[0].text_content().encode('utf8')
        else:
            # No title in the listing: fall back to the sub-category name.
            name = self.scraper.sub_cat.name

        link1 = self.scraper.partner.domain + link
        # Progress trace; parenthesised so it parses on Python 2 and 3.
        print(link1)

        product_page = requests.get(link1)
        page = html.fromstring(product_page.content)

        # Prefer the sale price, fall back to the regular price.
        price_text = self._first_text(page, 'span#priceblock_saleprice')
        if price_text is None:
            price_text = self._first_text(page, 'span#priceblock_ourprice')
        if price_text is None:
            return None
        # Keep the lower bound of a "low - high" range and drop the decimals.
        price = self._digits(price_text.split('-')[0].split('.')[0])

        # Read from this product's own page, not the shared listing page,
        # so each product gets its own old price and discount.
        old_text = self._first_text(page, 'td.a-text-strike')
        if old_text is None:
            old_price = 0
        else:
            old_price = self._digits(old_text.split('.')[0])

        savings = self._first_text(page, 'tr#regularprice_savings')
        if savings is None:
            discount = 0
        else:
            # Only the part after the last "(" is kept.
            discount = self._digits(savings.split('(')[-1])

        # A page without feature bullets should not abort the whole run.
        description = self._first_text(page, 'div#feature-bullets') or ''

        return {
            'name': name,
            'sku': sku,
            'category': self.scraper.sub_cat.category,
            'sub_category': self.scraper.sub_cat,
            'partner': self.scraper.partner,
            'price': price,
            'old_price': old_price,
            'discount': discount,
            'image': self._download_image(sku, page),
            'link': link,
            'description': description,
        }

    def scrape(self):
        """Scrape the listing at ``scraper.url`` and return ``self.products``.

        Entries are visited in page order and the loop stops after the
        entry at index ``scraper.qnty - 1``; entries that yield no product
        still count towards that limit.
        """
        page = requests.get(self.scraper.url)
        tree = html.fromstring(page.content)

        count = self.scraper.qnty - 1

        for num, item in enumerate(tree.cssselect('li.s-result-item')):
            product = self._scrape_item(item)
            if product is not None:
                self.products.append(product)

            if num == count:
                break

        return self.products


 def scrape_amazon(scraper):
    """Scrape with *scraper*, store the products and mark the scraper as run.

    Returns whatever ``db_insert`` returns for the scraped products.
    """
    amazon = ScrapeAmazon(scraper)
    counts = db_insert(amazon.scrape())
    # Stamp the scraper only after the products were handed to the database.
    amazon.updated_scraped()
    return counts
	import datetime
	import re
	import urllib

	from django.conf import settings

	import requests
	from lxml import html

	from .models import Product


	def db_insert(products):
	    """Create or refresh a ``Product`` row for every scraped product dict.

	    Each item of *products* is a dict carrying the keys ``name``, ``sku``,
	    ``category``, ``sub_category``, ``partner``, ``price``, ``old_price``,
	    ``discount``, ``image``, ``link`` and ``description``.  When a row with
	    the same ``sku`` already exists only its price, old price, discount and
	    ``last_updated`` are refreshed; otherwise a new row is created from all
	    keys.

	    Returns a two-item list ``[created, updated]``.
	    """
	    created = 0
	    updated = 0

	    for prod in products:
	        # One timestamp per product so both branches stamp rows the same way.
	        now = datetime.datetime.utcnow()

	        try:
	            p = Product.objects.get(sku=prod.get('sku'))
	        except Product.DoesNotExist:
	            # Only a genuinely missing row triggers an insert; any other
	            # failure (duplicate SKUs, database errors) propagates instead
	            # of being mistaken for "not found".
	            Product.objects.create(
	                name=prod['name'],
	                sku=prod['sku'],
	                category=prod['category'],
	                sub_category=prod['sub_category'],
	                partner=prod['partner'],
	                price=prod['price'],
	                old_price=prod['old_price'],
	                discount=prod['discount'],
	                image=prod['image'],
	                link=prod['link'],
	                description=prod['description'],
	                last_updated=now,
	            )
	            created += 1
	        else:
	            p.price = prod['price']
	            p.old_price = prod['old_price']
	            p.discount = prod['discount']
	            p.last_updated = now
	            # Without save() the refreshed values never reach the database.
	            p.save()
	            updated += 1

	    return [created, updated]


	class ScrapeAmazon:
	    """Collect product data from an Amazon listing page.

	    ``scraper`` is the configuration object this class reads ``url``,
	    ``qnty``, ``partner.domain`` and ``sub_cat`` from, and whose
	    ``last_scraped`` field it updates.  Scraped products accumulate in
	    ``self.products`` as plain dicts.
	    """

	    def __init__(self, scraper):
	        self.scraper = scraper
	        self.products = []

	    def updated_scraped(self):
	        """Stamp the scraper with the current time and save it."""
	        # The file imports the ``datetime`` module, so the class has to be
	        # reached as ``datetime.datetime``.
	        self.scraper.last_scraped = datetime.datetime.now()
	        self.scraper.save()

	    @staticmethod
	    def _first_text(tree, selector):
	        """Return the stripped text of the first match, or None if none."""
	        matches = tree.cssselect(selector)
	        if not matches:
	            return None
	        return matches[0].text_content().strip()

	    @staticmethod
	    def _digits(text):
	        """Return only the digits of *text*, or 0 when it contains none."""
	        return re.sub("[^0-9]", "", text) or 0

	    def _download_image(self, sku, page):
	        """Fetch the first gallery image; return its stored path or None."""
	        images = page.cssselect('li.itemNo0 img')
	        if not images:
	            return None
	        src = images[0].attrib.get('src')
	        if not src:
	            return None

	        image_name = sku + '.' + src.split('.')[-1]
	        # NOTE(review): MEDIA_URL is normally a URL prefix rather than a
	        # filesystem path; confirm whether MEDIA_ROOT was intended here.
	        local_url = settings.MEDIA_URL + 'products/' + image_name
	        try:
	            urllib.urlretrieve(src, local_url)
	        except Exception:
	            # Best effort: a failed download leaves the product imageless.
	            return None
	        return 'products/' + image_name

	    def _scrape_item(self, item):
	        """Build the product dict for one listing entry.

	        Returns None when the entry has no ``data-asin`` attribute or when
	        its product page shows neither a sale price nor a regular price.
	        """
	        sku = item.attrib.get('data-asin')
	        if not sku:
	            return None
	        link = 'dp/' + sku + '/'

	        titles = item.cssselect('h2.s-access-title')
	        if titles:
	            name = titles[0].text_content().encode('utf8')
	        else:
	            # No title in the listing: fall back to the sub-category name.
	            name = self.scraper.sub_cat.name

	        link1 = self.scraper.partner.domain + link
	        # Progress trace; parenthesised so it parses on Python 2 and 3.
	        print(link1)

	        product_page = requests.get(link1)
	        page = html.fromstring(product_page.content)

	        # Prefer the sale price, fall back to the regular price.
	        price_text = self._first_text(page, 'span#priceblock_saleprice')
	        if price_text is None:
	            price_text = self._first_text(page, 'span#priceblock_ourprice')
	        if price_text is None:
	            return None
	        # Keep the lower bound of a "low - high" range and drop the decimals.
	        price = self._digits(price_text.split('-')[0].split('.')[0])

	        # Read from this product's own page, not the shared listing page,
	        # so each product gets its own old price and discount.
	        old_text = self._first_text(page, 'td.a-text-strike')
	        if old_text is None:
	            old_price = 0
	        else:
	            old_price = self._digits(old_text.split('.')[0])

	        savings = self._first_text(page, 'tr#regularprice_savings')
	        if savings is None:
	            discount = 0
	        else:
	            # Only the part after the last "(" is kept.
	            discount = self._digits(savings.split('(')[-1])

	        # A page without feature bullets should not abort the whole run.
	        description = self._first_text(page, 'div#feature-bullets') or ''

	        return {
	            'name': name,
	            'sku': sku,
	            'category': self.scraper.sub_cat.category,
	            'sub_category': self.scraper.sub_cat,
	            'partner': self.scraper.partner,
	            'price': price,
	            'old_price': old_price,
	            'discount': discount,
	            'image': self._download_image(sku, page),
	            'link': link,
	            'description': description,
	        }

	    def scrape(self):
	        """Scrape the listing at ``scraper.url`` and return ``self.products``.

	        Entries are visited in page order and the loop stops after the
	        entry at index ``scraper.qnty - 1``; entries that yield no product
	        still count towards that limit.
	        """
	        page = requests.get(self.scraper.url)
	        tree = html.fromstring(page.content)

	        count = self.scraper.qnty - 1

	        for num, item in enumerate(tree.cssselect('li.s-result-item')):
	            product = self._scrape_item(item)
	            if product is not None:
	                self.products.append(product)

	            if num == count:
	                break

	        return self.products


	def scrape_amazon(scraper):
	    """Scrape with *scraper*, store the products and mark the scraper as run.

	    Returns whatever ``db_insert`` returns for the scraped products.
	    """
	    amazon = ScrapeAmazon(scraper)
	    counts = db_insert(amazon.scrape())
	    # Stamp the scraper only after the products were handed to the database.
	    amazon.updated_scraped()
	    return counts
No results found