Skip to content

Instantly share code, notes, and snippets.

@ishritam
Last active August 17, 2020 10:42
Show Gist options
  • Save ishritam/a19910a32a1c0b6ecc4eac20072a2d3c to your computer and use it in GitHub Desktop.
Save ishritam/a19910a32a1c0b6ecc4eac20072a2d3c to your computer and use it in GitHub Desktop.
import scrapy
import re
import json
from pprint import pprint
from string import ascii_lowercase
import redis
class MgLVSpider(scrapy.Spider):
    """Crawl 1mg.com drug listings by alphabetical prefix and queue product URLs in Redis.

    For every prefix letter a-z the spider pages through 1mg's JSON search API
    and LPUSHes each SKU's full product URL onto the Redis list ``urls``.
    """

    # Spider name, used as `scrapy crawl mg_lv`.
    name = 'mg_lv'

    # Redis client shared by all instances; decode_responses=True makes it
    # return str instead of bytes.
    # NOTE(review): this connects at class-definition (import) time — a module
    # side effect. Consider moving it into __init__ or from_crawler.
    clint = redis.Redis(host="127.0.0.1", port=6379, charset="utf-8",
                        decode_responses=True)

    # Base URL prepended to each SKU slug. Despite the name, this is NOT
    # Scrapy's `allowed_domains` filter — it is only a string prefix.
    allowed_domain = 'https://www.1mg.com'

    def start_requests(self):
        """Yield one API request per (prefix letter, page number) pair."""
        # Result-page counts for each letter a-z. Taken manually from the
        # site; they could also be extracted with XPath.
        pages = [726, 207, 664, 369, 299, 259, 230, 102, 145, 43, 122, 320,
                 438, 305, 330, 429, 31, 396, 384, 421, 71, 191, 53, 49, 16, 197]
        for letter, page_count in zip(ascii_lowercase, pages):
            for page in range(1, page_count + 1):
                # Yield directly instead of materializing a ~6000-element
                # start_urls list first; request order is unchanged.
                url = f'https://www.1mg.com/pharmacy_api_gateway/v4/drug_skus/by_prefix?prefix_term={letter}&page={page}&per_page=30'
                yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Push the full product URL of every SKU in the API response onto Redis list 'urls'."""
        result = json.loads(response.body)
        # Iterate the SKU dicts directly rather than indexing via range(len(...)).
        for sku in result['data']['skus']:
            self.clint.lpush('urls', self.allowed_domain + sku['slug'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment