Created July 2, 2017 08:13
Scraping all Amazon laptop data.
"""scrape amazon laptop and desktop prices""" | |
import os | |
import queue | |
import asyncio | |
import warnings | |
import peewee as db | |
from multiprocessing.managers import SyncManager | |
from multiprocessing import Process, RLock | |
from data import data | |
from data import requests | |
from json import dumps | |
from csv import DictWriter | |
from multiprocessing.managers import MakeProxyType | |
# warnings.filterwarnings("ignore", category=UserWarning, module='bs4') | |
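
# SyncManager has no built-in proxy for `set`, so one is built with the
# (undocumented) MakeProxyType helper, exposing the set methods by name.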
BaseSetProxy = MakeProxyType('BaseSetProxy', (
    '__and__', '__contains__', '__iand__', '__ior__', '__isub__', '__ixor__',
    '__len__', '__or__', '__rand__', '__ror__', '__rsub__', '__rxor__',
    '__sub__', '__xor__', 'add', 'clear', 'copy', 'difference',
    'difference_update', 'discard', 'intersection', 'intersection_update',
    'isdisjoint', 'issubset', 'issuperset', 'pop', 'remove',
    'symmetric_difference', 'symmetric_difference_update', 'union', 'update',
))


class SetProxy(BaseSetProxy):
    # In-place hooks need to return `self`, so these are specified manually.
    def __iand__(self, value):
        self._callmethod('__iand__', (value,))
        return self

    def __ior__(self, value):
        self._callmethod('__ior__', (value,))
        return self

    def __isub__(self, value):
        self._callmethod('__isub__', (value,))
        return self

    def __ixor__(self, value):
        self._callmethod('__ixor__', (value,))
        return self
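
# Address and auth key shared by the manager server and by the worker
# processes that connect to it as clients.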
IP = 'localhost'
PORT = 9080
AUTH_KEY = b"abcdefgs"
database = db.SqliteDatabase(os.path.join(os.path.dirname(__file__), "data.db"))


class Computers(db.Model):
    title = db.CharField(max_length=255)
    specification = db.TextField(null=True)
    price = db.CharField(max_length=255)

    class Meta:
        database = database


database.create_tables([Computers], safe=True)  # safe=True: no error if the table already exists

link_queue = queue.Queue()
done_queue = set()
lock = RLock()
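
# The manager serves the link queue and the set of already-scraped links to
# every worker process over (IP, PORT).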
class JobQueueManager(SyncManager):
    pass


JobQueueManager.register('link_queue', lambda: link_queue)
JobQueueManager.register('done_queue', lambda: done_queue, SetProxy)
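

# Declarative page models for the `data` scraping library: each field pairs a
# CSS selector with the value extracted from the fetched page.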
class AmazonSpecifications(data.Item):
    name = data.TextField(selector=".label")
    value = data.TextField(selector=".value")


class AmazonItem(data.Item):
    title = data.TextField(selector="#productTitle")
    price = data.TextField(selector="#priceblock_ourprice")
    specification = data.RelationalField(AmazonSpecifications, selector=".pdTab tr")
    products = data.AttributeValueField(selector=".a-spacing-top-small .a-link-normal",
                                        attr="href", repeated=True)

    @property
    def json(self):
        dct_json = {spec.name: spec.value for spec in self.specification}
        return dict(title=self.title, price=self.price, specification=dct_json)

    class Meta:
        base_url = "http://www.amazon.in"


async def grab_details(url):
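    """Fetch one product page, persist it if it is a laptop, and return links to crawl next."""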
    print("Grabbing details for url {}".format(url))
    values = await AmazonItem.one(path=url)
    if not values.title:
        return []
    title = values.title
    sublinks = values.products
    print(sublinks)
    bits = title.split("/")
    if "laptop" not in title.lower() or len(bits) < 3:
        return sublinks or []
    json_val = values.json
    try:
        Computers.create(title=title,
                         price=json_val["price"],
                         specification=dumps(json_val))
    except db.PeeweeException:
        print("Exception occurred on saving {}".format(title))
    return sublinks or []


def target_worker():
    """Worker loop: pull a link from the shared queue, scrape it, and enqueue follow links."""
    loop = asyncio.get_event_loop()
    # Connect to the manager once per worker instead of once per link.
    client = JobQueueManager(address=(IP, PORT), authkey=AUTH_KEY)
    client.connect()
    link_q = client.link_queue()
    done_q = client.done_queue()
    while True:
        initial_link = link_q.get()
        print("Fetching link {}".format(initial_link))
        if initial_link in done_q:
            print("Data already present")
            continue
        follow_links = loop.run_until_complete(grab_details(initial_link))
        with lock:
            done_q.add(initial_link)
        if not follow_links:
            print("No links to follow for {}".format(initial_link))
            continue
        for link in follow_links:
            # Skip review pages and fragment-only anchors.
            if "product-reviews" not in link and "#" not in link:
                link_q.put(link)


def get_process(n):
    """Spawn n worker processes and return them."""
    print("Starting {} processes".format(n))
    workers = []
    for _ in range(n):
        p = Process(target=target_worker)
        p.start()
        workers.append(p)
    return workers
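

# Guard the entry point so that start methods which re-import this module
# (e.g. spawn on Windows) do not start a second manager server.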
if __name__ == "__main__":
    server = JobQueueManager(address=(IP, PORT), authkey=AUTH_KEY)
    server.start()
    server.link_queue().put("http://www.amazon.in/Lenovo-Ideapad-IP310-lenovo-Ideapad-IP310-7thGen-Corei5-8GB-1TB-2GB-Graphics-DOS-Laptop-Silver-80TV00Y1IH/dp/B06X3T14WN/ref=pd_sbs_147_5?_encoding=UTF8&psc=1&refRID=ZS5V38QTZSVTWKPQCJK9")
    server.link_queue().put("http://www.amazon.in/Dell-Inspiron-5567-15-6-inch-i5-7200U/dp/B072BN6LN9/ref=pd_sbs_147_1?_encoding=UTF8&psc=1&refRID=G990MB5WBACASF7RZWN9")
    server.link_queue().put("http://www.amazon.in/HP-15-AY503TX-15-6-inch-i5-6200U-Graphics/dp/B01LXJN33Y")
    workers = get_process(10)
    for worker in workers:
        worker.join()