Last active
August 17, 2020 10:42
-
-
Save ishritam/a19910a32a1c0b6ecc4eac20072a2d3c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy | |
import re | |
import json | |
from pprint import pprint | |
from string import ascii_lowercase | |
import redis | |
class MgLVSpider(scrapy.Spider):
    """Enumerate 1mg drug-SKU listing pages (one alphabetical prefix per
    letter a-z) and push each SKU's full product URL onto a Redis list
    named ``'urls'`` for later consumption.
    """

    # Name used to invoke the spider (``scrapy crawl mg_lv``).
    name = 'mg_lv'

    # Connection to a local Redis server where scraped URLs are stored.
    # NOTE(review): ``clint`` looks like a typo for ``client``; the original
    # name is kept so external code referring to it keeps working, with a
    # correctly-spelled alias added.
    clint = redis.Redis(host="127.0.0.1", port=6379, charset="utf-8",
                        decode_responses=True)
    client = clint  # preferred, correctly-spelled alias

    # Base URL prepended to each SKU slug to form the full product URL.
    allowed_domain = 'https://www.1mg.com'

    def start_requests(self):
        """Yield one request per (prefix letter, page number) pair.

        Requests are yielded lazily instead of first materializing the
        full list of ~7000 URLs in memory.
        """
        # Number of result pages per prefix letter. These were taken
        # manually from the site; they could also be extracted via XPath.
        pages = [726, 207, 664, 369, 299, 259, 230, 102, 145, 43, 122, 320,
                 438, 305, 330, 429, 31, 396, 384, 421, 71, 191, 53, 49, 16,
                 197]
        for prefix, page_count in zip(ascii_lowercase, pages):
            for page in range(1, page_count + 1):
                url = (f'https://www.1mg.com/pharmacy_api_gateway/v4/'
                       f'drug_skus/by_prefix?prefix_term={prefix}'
                       f'&page={page}&per_page=30')
                yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Push the full product URL of every SKU on this page to Redis.

        The API returns JSON with ``data.skus``, a list of SKU objects each
        carrying a ``slug`` path component.
        """
        result = json.loads(response.body)
        # Iterate the SKU objects directly instead of indexing by position.
        for sku in result['data']['skus']:
            self.clint.lpush('urls', self.allowed_domain + sku['slug'])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment