@sourcepirate
Created July 2, 2017 08:13
Scraping all Amazon laptop data.
"""scrape amazon laptop and desktop prices"""
import os
import queue
import asyncio
import warnings
import peewee as db
from multiprocessing.managers import SyncManager
from multiprocessing import Process, RLock
from data import data
from data import requests
from json import dumps
from csv import DictWriter
from multiprocessing.managers import MakeProxyType
# warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
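# SyncManager ships proxies for list and dict but not for set, so we build
# one with the (undocumented) MakeProxyType helper, exposing the set methods
# we need across process boundaries.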
BaseSetProxy = MakeProxyType('BaseSetProxy', (
    '__and__', '__contains__', '__iand__', '__ior__', '__isub__', '__ixor__',
    '__len__', '__or__', '__rand__', '__ror__', '__rsub__', '__rxor__',
    '__sub__', '__xor__', 'add', 'clear', 'copy', 'difference',
    'difference_update', 'discard', 'intersection', 'intersection_update',
    'isdisjoint', 'issubset', 'issuperset', 'pop', 'remove',
    'symmetric_difference', 'symmetric_difference_update', 'union', 'update',
))
class SetProxy(BaseSetProxy):
    # In-place operator hooks need to return `self`, so specify these manually.
    def __iand__(self, value):
        self._callmethod('__iand__', (value,))
        return self

    def __ior__(self, value):
        self._callmethod('__ior__', (value,))
        return self

    def __isub__(self, value):
        self._callmethod('__isub__', (value,))
        return self

    def __ixor__(self, value):
        self._callmethod('__ixor__', (value,))
        return self
IP = 'localhost'
PORT = 9080
AUTH_KEY = b"abcdefgs"
database = db.SqliteDatabase(os.path.join(os.path.dirname(__file__), "data.db"))
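# One row per scraped product page, stored in a local SQLite database.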
class Computers(db.Model):
    title = db.CharField(max_length=255)
    specification = db.TextField(null=True)
    price = db.CharField(max_length=255)

    class Meta:
        database = database

database.create_tables([Computers])
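# Shared crawl state: a queue of product URLs to visit, a set of URLs
# already processed, and a lock guarding updates to that set.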
link_queue = queue.Queue()
done_queue = set()
lock = RLock()
class JobQueueManager(SyncManager):
    pass
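# Registering these callables exposes the queue and the set to remote worker
# processes; the custom SetProxy relays set method calls back to the manager.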
JobQueueManager.register('link_queue', lambda : link_queue)
JobQueueManager.register('done_queue', lambda : done_queue, SetProxy)
class AmazonSpecifications(data.Item):
    name = data.TextField(selector=".label")
    value = data.TextField(selector=".value")
class AmazonItem(data.Item):
    title = data.TextField(selector="#productTitle")
    price = data.TextField(selector="#priceblock_ourprice")
    specification = data.RelationalField(AmazonSpecifications, selector=".pdTab tr")
    products = data.AttributeValueField(selector=".a-spacing-top-small .a-link-normal",
                                        attr="href", repeated=True)

    @property
    def json(self):
        dct_json = {getattr(i, 'name'): getattr(i, 'value') for i in self.specification}
        return dict(title=self.title, price=self.price, specification=dct_json)

    class Meta:
        base_url = "http://www.amazon.in"
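# Fetch one product page, persist it if it looks like a laptop listing,
# and return any related-product links to crawl next.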
async def grab_details(url):
    values = await AmazonItem.one(path=url)
    print("Grabbing details for url {}".format(url))
    json_val = values.json
    if not values.title:
        return []
    title = values.title
    bits = title.split("/")
    sublinks = values.products
    print(sublinks)
    if "laptop" not in title.lower() or len(bits) < 3:
        return sublinks or []
    try:
        Computers.create(title=values.title,
                         price=json_val["price"],
                         specification=dumps(json_val))
    except db.PeeweeException:
        print("Exception occurred while saving {}".format(title))
    return sublinks or []
def target_worker():
    """Worker loop: pull a URL from the shared queue, scrape it, and
    enqueue any follow-up links that have not been visited yet."""
    loop = asyncio.get_event_loop()
    client = JobQueueManager(address=(IP, PORT), authkey=AUTH_KEY)
    client.connect()
    link_q = client.link_queue()
    done_q = client.done_queue()
    while True:
        initial_link = link_q.get()
        print("Fetching link {}".format(initial_link))
        if initial_link in done_q:
            print("Data already present")
            continue
        follow_links = loop.run_until_complete(grab_details(initial_link))
        with lock:
            done_q.add(initial_link)
        if not follow_links:
            print("No links to follow for {}".format(initial_link))
            continue
        for link in follow_links:
            if "product-reviews" not in link and "#" not in link:
                link_q.put(link)
def get_process(n):
    """Start `n` worker processes and return their handles."""
    print("Starting {} processes".format(n))
    pids = []
    for _ in range(n):
        p = Process(target=target_worker)
        p.start()
        pids.append(p)
    return pids
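# Entry point: start the manager server, seed the queue with a few product
# pages, and spin up worker processes to crawl outward from there.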
if __name__ == "__main__":
    server = JobQueueManager(address=(IP, PORT), authkey=AUTH_KEY)
    server.start()
    server.link_queue().put("http://www.amazon.in/Lenovo-Ideapad-IP310-lenovo-Ideapad-IP310-7thGen-Corei5-8GB-1TB-2GB-Graphics-DOS-Laptop-Silver-80TV00Y1IH/dp/B06X3T14WN/ref=pd_sbs_147_5?_encoding=UTF8&psc=1&refRID=ZS5V38QTZSVTWKPQCJK9")
    server.link_queue().put("http://www.amazon.in/Dell-Inspiron-5567-15-6-inch-i5-7200U/dp/B072BN6LN9/ref=pd_sbs_147_1?_encoding=UTF8&psc=1&refRID=G990MB5WBACASF7RZWN9")
    server.link_queue().put("http://www.amazon.in/HP-15-AY503TX-15-6-inch-i5-6200U-Graphics/dp/B01LXJN33Y")
    pids = get_process(10)
    for pid in pids:
        pid.join()