@rafikahmed · Created December 2, 2018 17:08
import scrapy
from scrapy_splash import SplashRequest


class MySpider(scrapy.Spider):
    name = 'google'

    # Lua script for Splash: route every request through the Crawlera proxy,
    # open a Crawlera session, disable Crawlera's own cookie handling, and
    # send a fixed cookie/referer pair so Google sees a consistent session.
    # "key" is a placeholder for the Crawlera API key.
    script = '''
    function main(splash)
        local host = "proxy.crawlera.com"
        local port = 8010
        local user = "key"  -- placeholder: your Crawlera API key
        local password = ""
        local session_header = "X-Crawlera-Session"
        local session_id = "create"
        splash:set_custom_headers({
            ["cookie"] = "CGIC=IlV0ZXh0L2h0bWwsYXBwbGljYXRpb24veGh0bWwreG1sLGFwcGxpY2F0aW9uL3htbDtxPTAuOSxpbWFnZS93ZWJwLGltYWdlL2FwbmcsKi8qO3E9MC44; SID=rwbrhPKXYexaoYx0YyXbupoGuREl9jMZcEFWLPWl397xMVmXPHNx5HFmSRIkRoZCrl7x9Q.; HSID=A06BH88RNcCUqOiMu; SSID=A2_xl7JZKHdN6Dh6S; APISID=Thfk1fkQ9j_QnxIF/AE3QvS_wthipPksbm; SAPISID=fsRwg4m6Hdjxf42f/AAi2iqQzl6RP0rkWo; CONSENT=YES+FR.ar+20150628-20-0; NID=148=oavRSmnuopClGYJH3CZF_4_tJWDXHvJyiv_jd1q1oZz1crxSbwqzphcHqtv35RnLZ7kbpECLov1O8SAYlp5j2pDomP_mA1XBK1mSPIg3DbNEeeUarE5M3d_M-hAm-AcMALrXBszpx2AVaqz2901H3BcubXOpkqlCeHiET3FLGszCeWLbvx-PnM98DHCpqSPDbfE-8h56hGeS5t4L4lhVvIW3IjXRsAKgye0OHeCSOlpgCqEdcDYlw6nF42QlhAEDvU1CWUG91XNe; DV=E6KlS8jXJiZXcBeTcarDqyHOksZLdhb9f7ObE3W79wEAACBIUPz5XM4iwgAAALCmhOrd16P7MQAAAMKB-zMlQ7QOEwAAAA; 1P_JAR=2018-11-30-12; SIDCC=ABtHo-GWvJDuPBN3b0j5YJJZ-m8KVC7Y7IjYtvi3OrWs3zyjYNSw0-g7uBLebLy-7a9o_tvZsA",
            ["referer"] = "https://www.google.com/"
        })
        splash:on_request(function (request)
            request:set_header('X-Crawlera-Cookies', 'disable')
            request:set_header(session_header, session_id)
            request:set_proxy{host, port, username=user, password=password}
        end)
        splash:go(splash.args.url)
        return splash:html()
    end
    '''
    def start_requests(self):
        # The gist defines `script` but never sends it to Splash; presumably
        # it was meant to run via the 'execute' endpoint, so it is passed as
        # lua_source here (and in the follow-up requests below).
        yield SplashRequest(
            url='https://www.google.com/search?q=CBP1825&tbm=shop',
            endpoint='execute',
            args={'lua_source': self.script, 'wait': 5, 'timeout': 3600},
        )
    def parse(self, response):
        # Each result card on the shopping results page links to a product page.
        all_products = response.xpath("//div[@class='eIuuYe']")
        for product in all_products:
            relative_path = product.xpath(".//a/@href").extract_first()
            # response.urljoin resolves relative hrefs safely (no doubled slashes).
            absolute_url = response.urljoin(relative_path)
            yield SplashRequest(
                url=absolute_url,
                endpoint='execute',
                callback=self.view_all,
                args={'lua_source': self.script, 'wait': 3, 'timeout': 3600},
                dont_filter=True,
            )
    def view_all(self, response):
        # Follow the "view all offers" link on the product detail page.
        view_all_href = response.xpath("//a[@class='pag-detail-link']/@href").extract_first()
        view_all_url = response.urljoin(view_all_href)
        yield SplashRequest(
            url=view_all_url,
            endpoint='execute',
            callback=self.get_all_prices,
            args={'lua_source': self.script, 'wait': 3, 'timeout': 3600},
            dont_filter=True,
        )
    def get_all_prices(self, response):
        # Each offer row carries a seller name and its total price.
        for row in response.xpath("//tr[@class='os-row']"):
            yield {
                'seller_name': row.xpath(".//span[@class='os-seller-name-primary']/a/text()").extract_first(),
                'total_price': row.xpath(".//td[@class='os-total-col']/text()").extract_first(),
            }
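
For this spider to run, scrapy-splash also has to be wired into the project settings. Below is a minimal sketch following the scrapy-splash README; the SPLASH_URL value is an assumption (a local Splash container), so point it at wherever your Splash instance actually runs.

# settings.py -- minimal scrapy-splash wiring, per the scrapy-splash README.
SPLASH_URL = 'http://localhost:8050'  # assumption: local Splash container

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

One more note: the spider passes timeout=3600, but Splash caps the allowed render timeout (90 seconds by default), so the Splash container would need to be started with --max-timeout 3600 for that argument to be accepted.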