@rafikahmed · Created December 2, 2018 17:08
import scrapy
from scrapy_splash import SplashRequest


class MySpider(scrapy.Spider):
    name = 'google'

    # Lua script for Splash: route every request through the Crawlera proxy,
    # open a Crawlera session, disable Crawlera's own cookie handling, and
    # send a fixed cookie/referer pair so Google sees a consistent session.
    # "key" is a placeholder for the Crawlera API key.
    script = '''
    function main(splash)
        local host = "proxy.crawlera.com"
        local port = 8010
        local user = "key"  -- placeholder: your Crawlera API key
        local password = ""
        local session_header = "X-Crawlera-Session"
        local session_id = "create"
        splash:set_custom_headers({
            ["cookie"] = "CGIC=IlV0ZXh0L2h0bWwsYXBwbGljYXRpb24veGh0bWwreG1sLGFwcGxpY2F0aW9uL3htbDtxPTAuOSxpbWFnZS93ZWJwLGltYWdlL2FwbmcsKi8qO3E9MC44; SID=rwbrhPKXYexaoYx0YyXbupoGuREl9jMZcEFWLPWl397xMVmXPHNx5HFmSRIkRoZCrl7x9Q.; HSID=A06BH88RNcCUqOiMu; SSID=A2_xl7JZKHdN6Dh6S; APISID=Thfk1fkQ9j_QnxIF/AE3QvS_wthipPksbm; SAPISID=fsRwg4m6Hdjxf42f/AAi2iqQzl6RP0rkWo; CONSENT=YES+FR.ar+20150628-20-0; NID=148=oavRSmnuopClGYJH3CZF_4_tJWDXHvJyiv_jd1q1oZz1crxSbwqzphcHqtv35RnLZ7kbpECLov1O8SAYlp5j2pDomP_mA1XBK1mSPIg3DbNEeeUarE5M3d_M-hAm-AcMALrXBszpx2AVaqz2901H3BcubXOpkqlCeHiET3FLGszCeWLbvx-PnM98DHCpqSPDbfE-8h56hGeS5t4L4lhVvIW3IjXRsAKgye0OHeCSOlpgCqEdcDYlw6nF42QlhAEDvU1CWUG91XNe; DV=E6KlS8jXJiZXcBeTcarDqyHOksZLdhb9f7ObE3W79wEAACBIUPz5XM4iwgAAALCmhOrd16P7MQAAAMKB-zMlQ7QOEwAAAA; 1P_JAR=2018-11-30-12; SIDCC=ABtHo-GWvJDuPBN3b0j5YJJZ-m8KVC7Y7IjYtvi3OrWs3zyjYNSw0-g7uBLebLy-7a9o_tvZsA",
            ["referer"] = "https://www.google.com/"
        })
        splash:on_request(function (request)
            request:set_header('X-Crawlera-Cookies', 'disable')
            request:set_header(session_header, session_id)
            request:set_proxy{host, port, username=user, password=password}
        end)
        splash:go(splash.args.url)
        return splash:html()
    end
    '''
    def start_requests(self):
        # The gist defines `script` but never sends it to Splash; presumably
        # it was meant to run via the 'execute' endpoint, so it is passed as
        # lua_source here (and in the follow-up requests below).
        yield SplashRequest(
            url='https://www.google.com/search?q=CBP1825&tbm=shop',
            endpoint='execute',
            args={'lua_source': self.script, 'wait': 5, 'timeout': 3600},
        )
    def parse(self, response):
        # Each result card on the shopping results page links to a product page.
        all_products = response.xpath("//div[@class='eIuuYe']")
        for product in all_products:
            relative_path = product.xpath(".//a/@href").extract_first()
            # response.urljoin resolves relative hrefs safely (no doubled slashes).
            absolute_url = response.urljoin(relative_path)
            yield SplashRequest(
                url=absolute_url,
                endpoint='execute',
                callback=self.view_all,
                args={'lua_source': self.script, 'wait': 3, 'timeout': 3600},
                dont_filter=True,
            )
    def view_all(self, response):
        # Follow the "view all offers" link on the product detail page.
        view_all_href = response.xpath("//a[@class='pag-detail-link']/@href").extract_first()
        view_all_url = response.urljoin(view_all_href)
        yield SplashRequest(
            url=view_all_url,
            endpoint='execute',
            callback=self.get_all_prices,
            args={'lua_source': self.script, 'wait': 3, 'timeout': 3600},
            dont_filter=True,
        )
    def get_all_prices(self, response):
        # Each offer row carries a seller name and its total price.
        for row in response.xpath("//tr[@class='os-row']"):
            yield {
                'seller_name': row.xpath(".//span[@class='os-seller-name-primary']/a/text()").extract_first(),
                'total_price': row.xpath(".//td[@class='os-total-col']/text()").extract_first(),
            }
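
For this spider to run, scrapy-splash also has to be wired into the project settings. Below is a minimal sketch following the scrapy-splash README; the SPLASH_URL value is an assumption (a local Splash container), so point it at wherever your Splash instance actually runs.

# settings.py -- minimal scrapy-splash wiring, per the scrapy-splash README.
SPLASH_URL = 'http://localhost:8050'  # assumption: local Splash container

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

One more note: the spider passes timeout=3600, but Splash caps the allowed render timeout (90 seconds by default), so the Splash container would need to be started with --max-timeout 3600 for that argument to be accepted.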