Created
July 4, 2017 03:27
-
-
Save nuklea/9bf765ae0b8ef95e84fed56900fa5b89 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class GeocoderPipeline(object):
    """Item pipeline that fills in ``address`` and ``coordinates`` on an item.

    Items whose ``remote_address`` matches an already-geocoded ``Advert``
    reuse the stored values.  All other items are resolved through the
    Yandex geocoder HTTP API, and items whose address cannot be resolved to
    a house-level object are dropped.
    """

    # XML namespace prefixes registered on the response selector so the
    # XPath expressions in ``parse_response`` can use them.
    namespaces = {'gml': 'http://www.opengis.net/gml',
                  'ymaps': 'http://maps.yandex.ru/ymaps/1.x',
                  'geocoder': 'http://maps.yandex.ru/geocoder/1.x'}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and attach ``crawler`` to it.

        Uses ``cls.from_settings(crawler.settings)`` when the class defines
        a ``from_settings`` hook, otherwise calls ``cls()`` with no
        arguments.  The crawler is stored on the instance because
        ``process_item`` needs ``crawler.engine`` to issue requests.
        """
        # Look the hook up explicitly instead of wrapping the call in
        # ``except AttributeError``: that would also hide an AttributeError
        # raised *inside* ``from_settings`` and silently fall back to
        # ``cls()``.
        from_settings = getattr(cls, 'from_settings', None)
        if from_settings is not None:
            pipe = from_settings(crawler.settings)
        else:
            pipe = cls()
        pipe.crawler = crawler
        return pipe

    def process_item(self, item, spider):
        """Populate ``item['address']`` and ``item['coordinates']``.

        Returns the item directly when a previously stored ``Advert`` with
        the same ``source``/``remote_id`` has coordinates and an identical
        ``remote_address``.  Otherwise returns the result of chaining
        ``parse_response`` onto ``crawler.engine.download(...)``
        (presumably a Twisted Deferred that fires with the item -- confirm
        against the Scrapy version in use).
        """
        previous = (
            Advert.objects
            .filter(source=item['source'], remote_id=item['remote_id'])
            .exclude(coordinates__isnull=True)
            .values('coordinates', 'address', 'remote_address')
            .first()
        )
        # The raw address did not change since the last crawl, so the stored
        # geocoding result is reused and no request is made.
        if previous and item['remote_address'] == previous['remote_address']:
            item['address'] = previous['address']
            item['coordinates'] = previous['coordinates']
            return item
        dfd = self.crawler.engine.download(self.get_request(item), spider)
        return dfd.addCallback(self.parse_response, item)

    def get_request(self, item):
        """Return the GET request that geocodes ``item['remote_address']``.

        The query is the item's ``remote_address`` prefixed with a fixed
        country/region/city string.
        """
        qs = {'geocode': 'Россия, Красноярский край, Красноярск, {remote_address}'.format(**item),
              'lang': 'ru-RU'}
        return FormRequest('https://geocode-maps.yandex.ru/1.x/', method='GET', formdata=qs)

    def parse_response(self, response, item):
        """Copy the geocoder result from ``response`` into ``item``.

        Always sets ``item['address']`` to the name of the first
        house-level GeoObject.  ``item['coordinates']`` is built from the
        item's own ``longitude``/``latitude`` when both are truthy, and
        from the GeoObject position otherwise.

        Raises:
            DropItem: when the response has no house-level GeoObject, or
                that object lacks the name or position that is needed.
        """
        s = response.selector
        for namespace, schema in self.namespaces.items():
            s.register_namespace(namespace, schema)

        # We need the coordinates of a house, not of a street and so on.
        geo_object = s.xpath('(//geocoder:kind[text()="house"]//ancestor::ymaps:GeoObject)[1]')
        if not geo_object:
            # Skip adverts with a malformed address.
            raise DropItem('Location "{remote_address}" does not exist'.format(**item))

        # Always store the normalized address.
        names = geo_object.xpath('./gml:name/text()').extract()
        if not names:
            # Drop cleanly instead of failing with IndexError on ``[0]``.
            raise DropItem('Location "{remote_address}" has no normalized address'.format(**item))
        item['address'] = names[0]

        # NOTE(review): truthiness means a coordinate equal to 0 (or an empty
        # string) is treated as missing and the geocoder position is used.
        if all(item.get(key) for key in ('longitude', 'latitude')):
            item['coordinates'] = Point(item['longitude'], item['latitude'])
        else:
            positions = geo_object.xpath('./gml:Point/gml:pos/text()').extract()
            if not positions:
                # Drop cleanly instead of failing with IndexError on ``[0]``.
                raise DropItem('Location "{remote_address}" has no coordinates'.format(**item))
            # The position text is whitespace-separated numbers, passed to
            # Point in document order (presumably "longitude latitude" --
            # confirm against the geocoder documentation).
            item['coordinates'] = Point(*map(float, positions[0].split()))
        return item
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment