import scrapy
from scrapy_splash import SplashRequest


class MySpider(scrapy.Spider):
    name = 'google'

    # Lua script executed by Splash: routes every request through the
    # Crawlera proxy, pins a Crawlera session, and sends the cookie and
    # referer headers the Google Shopping pages expect.
    script = '''
    function main(splash)
        local host = "proxy.crawlera.com"
        local port = 8010
        local user = "key"  -- Crawlera API key goes here
        local password = ""
        local session_header = "X-Crawlera-Session"
        local session_id = "create"
        splash:set_custom_headers({
            ["cookie"] = "CGIC=IlV0ZXh0L2h0bWwsYXBwbGljYXRpb24veGh0bWwreG1sLGFwcGxpY2F0aW9uL3htbDtxPTAuOSxpbWFnZS93ZWJwLGltYWdlL2FwbmcsKi8qO3E9MC44; SID=rwbrhPKXYexaoYx0YyXbupoGuREl9jMZcEFWLPWl397xMVmXPHNx5HFmSRIkRoZCrl7x9Q.; HSID=A06BH88RNcCUqOiMu; SSID=A2_xl7JZKHdN6Dh6S; APISID=Thfk1fkQ9j_QnxIF/AE3QvS_wthipPksbm; SAPISID=fsRwg4m6Hdjxf42f/AAi2iqQzl6RP0rkWo; CONSENT=YES+FR.ar+20150628-20-0; NID=148=oavRSmnuopClGYJH3CZF_4_tJWDXHvJyiv_jd1q1oZz1crxSbwqzphcHqtv35RnLZ7kbpECLov1O8SAYlp5j2pDomP_mA1XBK1mSPIg3DbNEeeUarE5M3d_M-hAm-AcMALrXBszpx2AVaqz2901H3BcubXOpkqlCeHiET3FLGszCeWLbvx-PnM98DHCpqSPDbfE-8h56hGeS5t4L4lhVvIW3IjXRsAKgye0OHeCSOlpgCqEdcDYlw6nF42QlhAEDvU1CWUG91XNe; DV=E6KlS8jXJiZXcBeTcarDqyHOksZLdhb9f7ObE3W79wEAACBIUPz5XM4iwgAAALCmhOrd16P7MQAAAMKB-zMlQ7QOEwAAAA; 1P_JAR=2018-11-30-12; SIDCC=ABtHo-GWvJDuPBN3b0j5YJJZ-m8KVC7Y7IjYtvi3OrWs3zyjYNSw0-g7uBLebLy-7a9o_tvZsA",
            ["referer"] = "https://www.google.com/"
        })
        splash:on_request(function (request)
            request:set_header('X-Crawlera-Cookies', 'disable')
            request:set_header(session_header, session_id)
            request:set_proxy{host, port, username=user, password=password}
        end)
        splash:go(splash.args.url)
        splash:wait(splash.args.wait)  -- honor the 'wait' argument passed from the spider
        return splash:html()
    end
    '''

    def start_requests(self):
        # The 'execute' endpoint runs the Lua script above; 'render.html'
        # would ignore lua_source entirely.
        yield SplashRequest(
            url='https://www.google.com/search?q=CBP1825&tbm=shop',
            endpoint='execute',
            args={'lua_source': self.script, 'wait': 5, 'timeout': 3600},
        )

    def parse(self, response):
        # One <div class="eIuuYe"> per product card on the Shopping results page.
        all_products = response.xpath("//div[@class='eIuuYe']")
        for product in all_products:
            relative_path = product.xpath(".//a/@href").extract_first()
            absolute_url = response.urljoin(relative_path)
            yield SplashRequest(
                url=absolute_url,
                endpoint='execute',
                callback=self.view_all,
                args={'lua_source': self.script, 'wait': 3, 'timeout': 3600},
                dont_filter=True,
            )

    def view_all(self, response):
        # Follow the "compare prices" link to the page listing every seller.
        view_all_path = response.xpath("//a[@class='pag-detail-link']/@href").extract_first()
        yield SplashRequest(
            url=response.urljoin(view_all_path),
            endpoint='execute',
            callback=self.get_all_prices,
            args={'lua_source': self.script, 'wait': 3, 'timeout': 3600},
            dont_filter=True,
        )

    def get_all_prices(self, response):
        # One <tr class="os-row"> per seller in the offers table.
        for row in response.xpath("//tr[@class='os-row']"):
            yield {
                'seller_name': row.xpath(".//span[@class='os-seller-name-primary']/a/text()").extract_first(),
                'total_price': row.xpath(".//td[@class='os-total-col']/text()").extract_first(),
            }
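
Note that scrapy-splash only takes effect once its middlewares are enabled in the Scrapy project. A minimal settings.py sketch with the standard wiring from the scrapy-splash README; SPLASH_URL here assumes a local Splash instance on the default port 8050 and should be adjusted to your setup:

# settings.py -- minimal scrapy-splash configuration
SPLASH_URL = 'http://localhost:8050'  # assumption: local Splash instance

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

# Deduplicate on the rendered request, not just the URL
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'

With those settings in place, the spider runs as usual, e.g. scrapy crawl google -o prices.json.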