Last active
July 20, 2017 15:23
-
-
Save Saigesp/457c5601112775309ad47c1073a0c350 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: UTF-8 -*-
import json
import math
import time
import urllib

from pymongo import MongoClient
# Usage: python -c "from crawler2 import *; iterar_sobre_area(44.00766,-1.083201,40.544736,-1.75312)"
def deg_to_rad(deg):
    """Convert an angle expressed in degrees to radians."""
    radians_per_degree = math.pi / 180
    return deg * radians_per_degree
def get_distance_from_latlon_km(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points, in kilometres.

    The distance is computed with the haversine formula on a sphere of
    radius 6371 km and rounded to 4 decimal places.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        The distance in km as a float rounded to 4 decimals.
    """
    r = 6371  # Radius of the earth in km
    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    sin_half_dlat = math.sin(dlat / 2)
    sin_half_dlon = math.sin(dlon / 2)
    a = (sin_half_dlat * sin_half_dlat
         + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2))
         * sin_half_dlon * sin_half_dlon)
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return round(r * c, 4)  # Distance in km
def iterar_sobre_area(lat_ne, lon_ne, lat_so, lon_so):
    """Crawl Airbnb listings inside a bounding box and store them in MongoDB.

    The box is queried through the Airbnb ``explore_tabs`` endpoint. When a
    box reports ``max_resultado`` (16 * 17 = 272) listings or more, it is
    split into four quadrants which are queued for the next pass; otherwise
    every result page of the box is downloaded and its listings are inserted
    into the ``crawler`` collection of the ``local`` database on
    ``mongodb://localhost:27017/``.

    Each stored document has the form
    ``{'_id': 'airbnb<id>', 'url': <listing url>,
    'lnglat': {'type': 'Point', 'coordinates': [lng, lat]}}``.

    Boxes whose NE-SW corner distance is below 0.01 km are skipped. The
    function returns once no boxes are left to analyse, or after an
    unexpected error in the main loop.

    Args:
        lat_ne, lon_ne: Latitude/longitude of the north-east corner (degrees).
        lat_so, lon_so: Latitude/longitude of the south-west corner (degrees).
    """
    # Local import: Python 3 exposes urlopen in urllib.request, whereas the
    # original code relied on the Python 2 location (urllib.urlopen).
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    areas_por_analizar = [[lat_ne, lon_ne, lat_so, lon_so]]
    areas_divididas = []
    resultados_por_pagina = 16
    max_paginas = 17
    max_resultado = resultados_por_pagina * max_paginas
    airbnb_base_url = 'https://www.airbnb.es/rooms/'
    airbnb_base_id = 'airbnb'
    connection = MongoClient('mongodb://localhost:27017/')
    db = connection['local']
    collection = db['crawler']

    def fetch_page(lat_noreste, lon_noreste, lat_suroeste, lon_suroeste, pagenum):
        """Request one result page for the box and return the decoded JSON."""
        api_url = (
            'https://www.airbnb.es/api/v2/explore_tabs?version=1.2.1'
            '&_format=for_explore_search_web&supports_for_you_v3=true'
            '&screen_size=large&timezone_offset=120&auto_ib=true'
            '&tab_id=home_tab&allow_override%5B%5D='
            '&ne_lat=' + str(lat_noreste) +
            '&ne_lng=' + str(lon_noreste) +
            '&s_tag=YGeKtnKW&search_by_map=true'
            '&section_offset=' + str(pagenum) +
            '&sw_lat=' + str(lat_suroeste) +
            '&sw_lng=' + str(lon_suroeste) +
            '&key=d306zoyjsyarp7ifhu67rjxn52tv0t20&currency=&locale=es'
        )
        response = urlopen(api_url)
        try:
            raw = response.read()
        finally:
            # Always release the HTTP connection, even if read() fails.
            response.close()
        return json.loads(raw.decode('utf-8'))

    def save_listings(jsondata):
        """Insert the listings of one result page into the collection."""
        items = jsondata['explore_tabs'][0]['sections'][0]['listings']
        items_para_guardar = [
            {
                '_id': airbnb_base_id + str(item['listing']['id']),
                'url': airbnb_base_url + str(item['listing']['id']),
                'lnglat': {
                    'type': 'Point',
                    'coordinates': [item['listing']['lng'], item['listing']['lat']],
                },
            }
            for item in items
        ]
        if not items_para_guardar:
            # insert_many rejects an empty list; nothing to store.
            return
        try:
            # ordered=False keeps inserting the remaining documents when some
            # fail (e.g. a listing already stored under the same _id).
            collection.insert_many(items_para_guardar, ordered=False)
        except Exception as e:
            print('Error guardando documentos')
            print(e)

    def iterate_area(lat_noreste, lon_noreste, lat_suroeste, lon_suroeste):
        """Download one box, or queue its four quadrants if it is too big."""
        pagenum = 0
        try:
            time.sleep(3)  # Throttle requests to the API
            try:
                # Call the API
                jsondata = fetch_page(lat_noreste, lon_noreste,
                                      lat_suroeste, lon_suroeste, pagenum)
            except Exception as e:
                print('No se pudo parsear el json')
                print(e)
                return False

            tab = jsondata['explore_tabs'][0]
            resultados = tab['pagination_metadata']['items_offset']
            tiene_siguiente_pagina = tab['pagination_metadata']['has_next_page']
            total_query = tab['home_tab_metadata']['listings_count']

            # Check the result
            if total_query == 0 or resultados == 0:
                print('Sin resultados')
            # Fewer results than the maximum: walk through the pages
            elif total_query < max_resultado:
                print('Menos de 300 resultados')
                while True:
                    print('Pagina ' + str(pagenum))
                    # Page 0 was already downloaded above, so it is reused
                    # instead of being requested a second time.
                    save_listings(jsondata)
                    if not tiene_siguiente_pagina:
                        break
                    pagenum += 1
                    time.sleep(3)
                    jsondata = fetch_page(lat_noreste, lon_noreste,
                                          lat_suroeste, lon_suroeste, pagenum)
                    tiene_siguiente_pagina = (
                        jsondata['explore_tabs'][0]['pagination_metadata']['has_next_page']
                    )
            # Otherwise split the box into quadrants for the next pass
            else:
                print('Area muy grande')
                lon_central = lon_suroeste + ((lon_noreste - lon_suroeste) / 2)
                lat_central = lat_suroeste + ((lat_noreste - lat_suroeste) / 2)
                areas_divididas.append([round(lat_central, 5), round(lon_central, 5), round(lat_suroeste, 5), round(lon_suroeste, 5)])  # SW quadrant
                areas_divididas.append([round(lat_central, 5), round(lon_noreste, 5), round(lat_suroeste, 5), round(lon_central, 5)])  # SE quadrant
                areas_divididas.append([round(lat_noreste, 5), round(lon_central, 5), round(lat_central, 5), round(lon_suroeste, 5)])  # NW quadrant
                areas_divididas.append([round(lat_noreste, 5), round(lon_noreste, 5), round(lat_central, 5), round(lon_central, 5)])  # NE quadrant
        except Exception as e:
            print('Error en iteracción')
            print(e)

    # Area iteration: each pass processes the queued boxes and collects the
    # quadrants produced by the boxes that were too large.
    try:
        # Stop once a pass produces no new boxes; the original `while True`
        # kept looping forever over an empty list.
        while areas_por_analizar:
            try:
                for contador, area in enumerate(areas_por_analizar, 1):
                    dist = get_distance_from_latlon_km(area[0], area[1], area[2], area[3])
                    if dist < 0.0100:
                        continue
                    print('Area '+str(contador)+' de '+str(len(areas_por_analizar)),'(' +str(dist)+ 'km)',area)
                    iterate_area(area[0], area[1], area[2], area[3])
                areas_por_analizar = areas_divididas
                areas_divididas = []
                print('Reset arrays')
            except Exception as e:
                print('No se pudo iterar en el área\n', e)
                break
    finally:
        connection.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment