@agonzalezro
Last active August 29, 2015 14:04
Scrape a few pages on Gumtree to find a place that matches your criteria
import asyncio
import bs4
import requests
import re
import tqdm
from csv import writer
from datetime import datetime
from urllib import parse

FROM_DATE = datetime(2014, 9, 25)
TO_DATE = datetime(2014, 10, 5)

BASE_URL = (
    'http://www.gumtree.com/flats-and-houses-for-rent-offered/london/page{}?{}'
)

QUERY_PARAMS = {
    'min_property_number_beds': 2,
    'max_property_number_beds': 2,
    'min_price': 280,
    'max_price': 340,
    'photos_filter': 'Y',
    'search_location': 'London',
    'category': 'flats-and-houses-for-rent-offered',
}

DEPTH = 150

sem = asyncio.Semaphore(5)


class CSV:

    def __init__(self, csvfile):
        self.writer = writer(csvfile, delimiter=';')
        self.writer.writerow(
            [
                'date',
                'pcm',
                'pw',
                'location',
                'lat',
                'lon',
                'link',
                'description',
            ]
        )

    def write(self, date, pw, location, lat, lon, link, description):
        # The price comes in as e.g. "£320pw"; keep just the number.
        match = re.search(r'\d+', pw)
        pw = match.group(0) if match else None
        pcm = float(pw) * 52 / 12 if pw else None
        location = location.replace('"', '')
        self.writer.writerow(
            [date, '%.2f' % pcm if pcm is not None else '', pw, location,
             lat, lon, link, description]
        )


def _get_lat_lon(url):
    """Kinda ugly, but it does the job."""
    response = requests.get(url)
    content = str(response.content)
    # The ad page embeds its coordinates as e.g. ltlng":"51.50;-0.12
    ltlng = re.search(r'ltlng":"(-?\d+\.\d+);(-?\d+\.\d+)', content)
    if ltlng:
        return ltlng.group(1), ltlng.group(2)
    return 0, 0


@asyncio.coroutine
def write_filtered_flats(page, csv):
    url = BASE_URL.format(page, parse.urlencode(QUERY_PARAMS))
    with (yield from sem):
        response = requests.get(url)
    soup = bs4.BeautifulSoup(response.content, 'html.parser')
    for li in soup.select('li.offer-sale'):
        a = li.select('> a.description')[0]
        link = a.attrs['href']
        description = a.attrs['title']
        pw = li.select('span.price')[0].text
        location = li.select('span.location')[0].text
        raw_date = li.select(
            '> div.location-and-date .displayed-date'
        )[0].text
        date = datetime.strptime(raw_date, '%d/%m/%y')
        if FROM_DATE <= date <= TO_DATE:
            # Only keep ads whose page exposes proper coordinates.
            lat, lon = _get_lat_lon(link)
            if lat and lon:
                csv.write(
                    raw_date, pw, location, lat, lon, link, description
                )


@asyncio.coroutine
def wait_with_progress(coros):
    for f in tqdm.tqdm(asyncio.as_completed(coros), total=len(coros)):
        yield from f


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    with open('/tmp/flats.csv', 'w', newline='') as csvfile:
        csv = CSV(csvfile)
        f = wait_with_progress(
            [
                write_filtered_flats(page, csv)
                for page in range(1, DEPTH + 1)
            ]
        )
        loop.run_until_complete(f)
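
Note that requests.get is a blocking call, so even though each page is scheduled as an asyncio coroutine, every download stalls the event loop while it runs; the semaphore only caps how many pages are in flight at once. A minimal sketch of how to keep the loop responsive, assuming the same sem and requests import as above (the fetch helper is illustrative, not part of the gist):

@asyncio.coroutine
def fetch(url):
    # Hypothetical helper: run the blocking requests.get in the default
    # thread pool so the event loop can keep scheduling other coroutines.
    loop = asyncio.get_event_loop()
    with (yield from sem):
        response = yield from loop.run_in_executor(None, requests.get, url)
    return response

write_filtered_flats could then do response = yield from fetch(url) instead of calling requests.get under the semaphore itself.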