rafikahmed · September 30, 2019 17:26
diff --git a/centris.py b/centris.py
 # -*- coding: utf-8 -*-
 import scrapy
 from scrapy.selector import Selector
 from scrapy_splash import SplashRequest
 import logging
 import json
 from w3lib.http import basic_auth_header

 class ListingsSpider(scrapy.Spider):
    """Scrape property listings from centris.ca.

    The spider POSTs to the site's ``GetInscriptions`` endpoint, reads the
    listing summaries out of the HTML fragment embedded in the JSON reply,
    and then renders every listing's detail page through Splash to pick up
    its description.

    Only the batch starting at position 0 is requested; no follow-up
    (pagination) requests are issued.
    """

    name = 'listings'
    allowed_domains = ['www.centris.ca']

    # Lua script run by Splash's ``execute`` endpoint for each detail page.
    # It disables image loading, aborts sub-requests whose URL matches one of
    # the listed fragments, loads the page, waits 0.5 s and returns the HTML.
    script = '''
        function main(splash, args)
        splash.images_enabled = false
        splash:on_request(function(request)
            if request.url:find('ResponsiveWebService.asmx') or request.url:find('recaptcha') or request.url:find('css') or request.url:find('linkedin') or request.url:find('cdn') then
                request:abort()
            end
            end)
        assert(splash:go(args.url))
        assert(splash:wait(0.5))
        return splash:html()
        end
    '''

    def start_requests(self):
        """Yield the initial JSON POST that asks for listings from position 0."""
        yield scrapy.Request(
            url="https://www.centris.ca/Mvc/Property/GetInscriptions",
            method="POST",
            headers={
                'Content-Type': 'application/json'
            },
            body='{"startPosition": "0"}'
        )

    def parse(self, response):
        """Extract listing summaries and schedule a Splash request per listing.

        The JSON reply is expected to carry the listing markup under
        ``d -> Result -> html``. Each listing's summary fields travel to
        ``parse_detail`` through the request ``meta``.

        Yields:
            SplashRequest: one per listing that has a detail link.
        """
        # Walk d -> Result -> html one level at a time so that a missing or
        # null level ends the lookup instead of raising AttributeError.
        node = json.loads(response.body)
        for key in ("d", "Result", "html"):
            node = node.get(key) if isinstance(node, dict) else None
        html = node if isinstance(node, str) else None
        if not html:
            self.logger.warning("No listing HTML in response from %s", response.url)
            return

        for listing in Selector(text=html).xpath("//div[@class='row templateListItem']"):
            category = ' '.join(listing.xpath(".//div[@class='description']/h2[@itemprop='category']/span/text()").getall())
            features = ' '.join(listing.xpath(".//div[@class='description']/p[@class='features border']/span/span/text()").getall())
            price = listing.xpath(".//div[@class='description']/p[@class='price']/span[@id='BuyPrice']/text()").get()
            address = listing.xpath(".//div[@class='description']/p[@class='address']/span/text()").get()

            # The href is site-relative; without it there is no detail page
            # to fetch (and the URL would otherwise end in the text "None").
            detail_path = listing.xpath(".//a[@class='btn a-more-detail']/@href").get()
            if not detail_path:
                self.logger.warning("Skipping listing without a detail link: %s", address)
                continue

            yield SplashRequest(
                url=f"https://www.centris.ca{detail_path}",
                callback=self.parse_detail,
                endpoint='execute',
                args={
                    'lua_source': self.script
                },
                meta={
                    'category': category,
                    'features': features,
                    'price': price,
                    'address': address
                },
            )

    def parse_detail(self, response):
        """Combine the summary fields from ``meta`` with the page description.

        Yields:
            dict: ``category``, ``features``, ``price``, ``address`` and the
            whitespace-normalized ``description`` text of the detail page.
        """
        description = response.xpath("normalize-space(//div[@itemprop='description']/text())").get()
        summary = response.request.meta
        yield {
            'category': summary['category'],
            'features': summary['features'],
            'price': summary['price'],
            'address': summary['address'],
            'description': description
        }
	# -*- coding: utf-8 -*-
	import scrapy
	from scrapy.selector import Selector
	from scrapy_splash import SplashRequest
	import logging
	import json
	from w3lib.http import basic_auth_header

	class ListingsSpider(scrapy.Spider):
	    """Scrape property listings from centris.ca.

	    The spider POSTs to the site's ``GetInscriptions`` endpoint, reads the
	    listing summaries out of the HTML fragment embedded in the JSON reply,
	    and then renders every listing's detail page through Splash to pick up
	    its description.

	    Only the batch starting at position 0 is requested; no follow-up
	    (pagination) requests are issued.
	    """

	    name = 'listings'
	    allowed_domains = ['www.centris.ca']

	    # Lua script run by Splash's ``execute`` endpoint for each detail page.
	    # It disables image loading, aborts sub-requests whose URL matches one of
	    # the listed fragments, loads the page, waits 0.5 s and returns the HTML.
	    # The script text is kept exactly as it was; Lua ignores its indentation.
	    script = '''
	function main(splash, args)
	splash.images_enabled = false
	splash:on_request(function(request)
	if request.url:find('ResponsiveWebService.asmx') or request.url:find('recaptcha') or request.url:find('css') or request.url:find('linkedin') or request.url:find('cdn') then
	request:abort()
	end
	end)
	assert(splash:go(args.url))
	assert(splash:wait(0.5))
	return splash:html()
	end
	'''

	    def start_requests(self):
	        """Yield the initial JSON POST that asks for listings from position 0."""
	        yield scrapy.Request(
	            url="https://www.centris.ca/Mvc/Property/GetInscriptions",
	            method="POST",
	            headers={
	                'Content-Type': 'application/json'
	            },
	            body='{"startPosition": "0"}'
	        )

	    def parse(self, response):
	        """Extract listing summaries and schedule a Splash request per listing.

	        The JSON reply is expected to carry the listing markup under
	        ``d -> Result -> html``. Each listing's summary fields travel to
	        ``parse_detail`` through the request ``meta``.

	        Yields:
	            SplashRequest: one per listing that has a detail link.
	        """
	        # Walk d -> Result -> html one level at a time so that a missing or
	        # null level ends the lookup instead of raising AttributeError.
	        node = json.loads(response.body)
	        for key in ("d", "Result", "html"):
	            node = node.get(key) if isinstance(node, dict) else None
	        html = node if isinstance(node, str) else None
	        if not html:
	            self.logger.warning("No listing HTML in response from %s", response.url)
	            return

	        for listing in Selector(text=html).xpath("//div[@class='row templateListItem']"):
	            category = ' '.join(listing.xpath(".//div[@class='description']/h2[@itemprop='category']/span/text()").getall())
	            features = ' '.join(listing.xpath(".//div[@class='description']/p[@class='features border']/span/span/text()").getall())
	            price = listing.xpath(".//div[@class='description']/p[@class='price']/span[@id='BuyPrice']/text()").get()
	            address = listing.xpath(".//div[@class='description']/p[@class='address']/span/text()").get()

	            # The href is site-relative; without it there is no detail page
	            # to fetch (and the URL would otherwise end in the text "None").
	            detail_path = listing.xpath(".//a[@class='btn a-more-detail']/@href").get()
	            if not detail_path:
	                self.logger.warning("Skipping listing without a detail link: %s", address)
	                continue

	            yield SplashRequest(
	                url=f"https://www.centris.ca{detail_path}",
	                callback=self.parse_detail,
	                endpoint='execute',
	                args={
	                    'lua_source': self.script
	                },
	                meta={
	                    'category': category,
	                    'features': features,
	                    'price': price,
	                    'address': address
	                },
	            )

	    def parse_detail(self, response):
	        """Combine the summary fields from ``meta`` with the page description.

	        Yields:
	            dict: ``category``, ``features``, ``price``, ``address`` and the
	            whitespace-normalized ``description`` text of the detail page.
	        """
	        description = response.xpath("normalize-space(//div[@itemprop='description']/text())").get()
	        summary = response.request.meta
	        yield {
	            'category': summary['category'],
	            'features': summary['features'],
	            'price': summary['price'],
	            'address': summary['address'],
	            'description': description
	        }