Skip to content

Instantly share code, notes, and snippets.

@rafikahmed
Created September 30, 2019 17:26
Show Gist options
  • Save rafikahmed/4b07c23a676c0164be307707dba94898 to your computer and use it in GitHub Desktop.
Save rafikahmed/4b07c23a676c0164be307707dba94898 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
from scrapy_splash import SplashRequest
import logging
import json
from w3lib.http import basic_auth_header
class ListingsSpider(scrapy.Spider):
    """Spider that collects property listings from www.centris.ca.

    The crawl starts with a JSON POST to the ``GetInscriptions`` endpoint.
    The HTML fragment embedded in that JSON response is parsed for listing
    summaries, and every listing's detail page is then rendered through
    Splash so its description can be extracted.
    """

    name = 'listings'
    allowed_domains = ['www.centris.ca']

    # Lua script executed by Splash's ``execute`` endpoint for each detail
    # page. It disables image loading, aborts any request whose URL matches
    # one of the listed substrings, loads ``args.url``, waits 0.5 seconds and
    # returns the rendered HTML.
    script = '''
function main(splash, args)
splash.images_enabled = false
splash:on_request(function(request)
if request.url:find('ResponsiveWebService.asmx') or request.url:find('recaptcha') or request.url:find('css') or request.url:find('linkedin') or request.url:find('cdn') then
request:abort()
end
end)
assert(splash:go(args.url))
assert(splash:wait(0.5))
return splash:html()
end
'''

    def start_requests(self):
        """Yield the initial POST request for listings at start position 0.

        No callback is given, so Scrapy routes the response to ``parse``.
        """
        yield scrapy.Request(
            url="https://www.centris.ca/Mvc/Property/GetInscriptions",
            method="POST",
            headers={
                'Content-Type': 'application/json'
            },
            body='{"startPosition": "0"}'
        )

    def parse(self, response):
        """Extract listing summaries and schedule a Splash request per listing.

        The response body is JSON; the listing markup is read from
        ``d -> Result -> html``. Each listing's category, features, price and
        address are forwarded to ``parse_detail`` through the request meta.

        Yields:
            SplashRequest: one request per listing that has a detail link.
        """
        payload = json.loads(response.body)
        # Any level of the envelope may be missing or null; guard each step
        # instead of failing with AttributeError on a chained ``.get``.
        html = ((payload.get("d") or {}).get("Result") or {}).get("html")
        if not html:
            self.logger.warning(
                "No listing HTML found in response from %s", response.url
            )
            return

        sel = Selector(text=html)
        for listing in sel.xpath("//div[@class='row templateListItem']"):
            category = ' '.join(listing.xpath(
                ".//div[@class='description']/h2[@itemprop='category']/span/text()"
            ).getall())
            features = ' '.join(listing.xpath(
                ".//div[@class='description']/p[@class='features border']/span/span/text()"
            ).getall())
            price = listing.xpath(
                ".//div[@class='description']/p[@class='price']/span[@id='BuyPrice']/text()"
            ).get()
            address = listing.xpath(
                ".//div[@class='description']/p[@class='address']/span/text()"
            ).get()

            rel_url = listing.xpath(".//a[@class='btn a-more-detail']/@href").get()
            if not rel_url:
                # Without this guard the URL would become
                # "https://www.centris.caNone" and be sent to Splash.
                self.logger.warning(
                    "Skipping listing without a detail link (address=%r)", address
                )
                continue
            abs_url = f"https://www.centris.ca{rel_url}"

            yield SplashRequest(
                url=abs_url,
                callback=self.parse_detail,
                endpoint='execute',
                args={
                    'lua_source': self.script
                },
                meta={
                    'category': category,
                    'features': features,
                    'price': price,
                    'address': address
                },
            )

    def parse_detail(self, response):
        """Yield the final item for one listing.

        Combines the summary fields carried in the request meta with the
        whitespace-normalized description text from the rendered detail page.
        """
        description = response.xpath(
            "normalize-space(//div[@itemprop='description']/text())"
        ).get()
        # ``response.meta`` is Scrapy's shortcut for ``response.request.meta``.
        meta = response.meta
        yield {
            'category': meta['category'],
            'features': meta['features'],
            'price': meta['price'],
            'address': meta['address'],
            'description': description
        }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment