Scrape a few pages on Gumtree to find a place that matches your criteria
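Rough usage, inferred from the code below: install the third-party dependencies (beautifulsoup4, requests and tqdm), adjust QUERY_PARAMS, FROM_DATE, TO_DATE and DEPTH to your own criteria, and run the script with Python 3.4 or newer. Matching listings are written to /tmp/flats.csv as a semicolon-separated file.
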
import asyncio
import re
from csv import writer
from datetime import datetime
from urllib import parse

import bs4
import requests
import tqdm

# Listings posted outside this date window are discarded.
FROM_DATE = datetime(2014, 9, 25)
TO_DATE = datetime(2014, 10, 5)

# The two {} placeholders take the page number and the urlencoded query string.
BASE_URL = (
    'http://www.gumtree.com/flats-and-houses-for-rent-offered/london/page{}?{}'
)
QUERY_PARAMS = {
    'min_property_number_beds': 2,
    'max_property_number_beds': 2,
    'min_price': 280,
    'max_price': 340,
    'photos_filter': 'Y',
    'search_location': 'London',
    'category': 'flats-and-houses-for-rent-offered'
}
# Number of search result pages to crawl.
DEPTH = 150

# At most five pages are processed at the same time.
sem = asyncio.Semaphore(5)


class CSV:
    """Thin wrapper around csv.writer for the result rows."""

    def __init__(self, csvfile):
        self.writer = writer(csvfile, delimiter=';')
        self.writer.writerow(
            [
                'date',
                'pcm',
                'pw',
                'location',
                'lat',
                'lon',
                'link',
                'description'
            ]
        )

    def write(self, date, pw, location, lat, lon, link, description):
        # Keep only the digits of the weekly price and derive the price
        # per calendar month (52 weeks / 12 months) from it.
        pw = re.search(r'\d+', pw).group(0)
        pcm = float(pw) * 52 / 12 if pw else None
        location = location.replace('"', '')
        self.writer.writerow(
            [date, '%.2f' % pcm, pw, location, lat, lon, link, description]
        )
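
# Illustrative only (the values are made up, not taken from a real listing):
# with a price text like '£330pw' (the exact format is an assumption),
# re.search keeps '330', pcm is 330 * 52 / 12 == 1430.00, and the row comes
# out roughly as
#   02/10/14;1430.00;330;Some Area, London;51.5;-0.1;http://...;2 bed flat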


def _get_lat_lon(url):
    """Kinda ugly, but it does the job."""
    response = requests.get(url)
    content = str(response.content)
    ltlng = re.search(r'ltlng":"(-?\d+\.\d+);(-?\d+\.\d+)', content)
    if ltlng:
        return ltlng.group(1), ltlng.group(2)
    return 0, 0
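
# Illustrative only: the regex above assumes the listing page embeds a
# fragment such as
#   ltlng":"51.5074;-0.1278
# from which it captures ('51.5074', '-0.1278'); if that markup is missing
# the function falls back to (0, 0) and the listing is skipped later on.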


@asyncio.coroutine
def write_filtered_flats(page, csv):
    url = BASE_URL.format(page, parse.urlencode(QUERY_PARAMS))
    with (yield from sem):
        response = requests.get(url)
        soup = bs4.BeautifulSoup(response.content)
        for li in soup.select('li.offer-sale'):
            a = li.select('> a.description')[0]
            link = a.attrs['href']
            description = a.attrs['title']
            pw = li.select('span.price')[0].text
            location = li.select('span.location')[0].text
            raw_date = li.select(
                '> div.location-and-date .displayed-date'
            )[0].text
            date = datetime.strptime(raw_date, '%d/%m/%y')
            if FROM_DATE <= date <= TO_DATE:
                # Only keep listings whose coordinates can be extracted.
                lat, lon = _get_lat_lon(link)
                if lat and lon:
                    csv.write(
                        raw_date, pw, location, lat, lon, link, description
                    )


@asyncio.coroutine
def wait_with_progress(coros):
    for f in tqdm.tqdm(asyncio.as_completed(coros), total=len(coros)):
        yield from f


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    with open('/tmp/flats.csv', 'w', newline='') as csvfile:
        csv = CSV(csvfile)
        f = wait_with_progress(
            [
                write_filtered_flats(page, csv)
                for page in range(1, DEPTH + 1)
            ]
        )
        loop.run_until_complete(f)
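
Note on concurrency (not part of the original gist): requests.get() blocks the event loop, so even with the semaphore the coroutines above effectively download one page at a time. A minimal sketch of one way around that, keeping the same old-style coroutine syntax, is to push the blocking call onto the loop's thread pool with run_in_executor(); the fetch() helper below is hypothetical and is not used by the script as written.

@asyncio.coroutine
def fetch(url):
    # Hypothetical helper: run the blocking requests.get in a worker thread
    # so the event loop stays responsive; the semaphore still caps the number
    # of requests in flight at five.
    loop = asyncio.get_event_loop()
    with (yield from sem):
        # run_in_executor returns a future; yield from waits for the thread.
        return (yield from loop.run_in_executor(None, requests.get, url))

write_filtered_flats() could then use response = yield from fetch(url) instead of calling requests.get() directly.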