Fix Craigslist
from bs4 import BeautifulSoup
import logging
try:
    from Queue import Queue  # PY2
except ImportError:
    from queue import Queue  # PY3
import requests
from requests.exceptions import RequestException
from six import iteritems
from threading import Thread
try:
    from urlparse import urljoin  # PY2
except ImportError:
    from urllib.parse import urljoin  # PY3

from .sites import get_all_sites

# Logging
logger = logging.getLogger('python-craigslist')
handler = logging.StreamHandler()
logger.addHandler(handler)

# Globals
all_sites = get_all_sites()  # All the Craigslist sites
results_per_request = 100  # Craigslist returns 100 results per request


def requests_get(*args, **kwargs):
    """
    Retries once if a RequestException is raised (could be a connection
    error or a timeout).
    """
    try:
        return requests.get(*args, **kwargs)
    except RequestException as exc:
        logger.warning('Request failed (%s). Retrying ...', exc)
        return requests.get(*args, **kwargs)
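
# Minimal usage sketch (the URL and timeout below are illustrative
# assumptions, not part of the original gist): requests_get() forwards all
# arguments to requests.get(), so it is a drop-in replacement that retries
# once on failure.
#
#     response = requests_get('http://sfbay.craigslist.org', timeout=10)
#     response.raise_for_status()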
class CraigslistBase(object):
    """ Base class for all Craigslist wrappers. """

    url_templates = {
        'base': 'http://%(site)s.craigslist.org',
        'no_area': 'http://%(site)s.craigslist.org/search/%(category)s',
        'area': 'http://%(site)s.craigslist.org/search/%(area)s/%(category)s'
    }

    default_category = None

    base_filters = {
        'query': {'url_key': 'query', 'value': None},
        'search_titles': {'url_key': 'srchType', 'value': 'T'},
        'has_image': {'url_key': 'hasPic', 'value': 1},
        'posted_today': {'url_key': 'postedToday', 'value': 1},
    }
    extra_filters = {}

    sort_by_options = {
        'newest': 'date',
        'price_asc': 'priceasc',
        'price_desc': 'pricedsc',
    }
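
    # How a filter maps to a URL parameter (the filter values below are
    # illustrative assumptions): filters={'query': 'bike', 'has_image': True}
    # produces ?query=bike&hasPic=1. Filters with a fixed 'value' (e.g. 1 or
    # 'T') ignore the caller-supplied value; filters with 'value': None pass
    # the caller's value through unchanged.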
    def __init__(self, site='sfbay', area=None, category=None, filters=None,
                 log_level=logging.WARNING):
        self.set_logger(log_level)

        # Site validation is disabled; re-enable if needed:
        # if site not in all_sites:
        #     msg = "'%s' is not a valid site" % site
        #     logger.error(msg)
        #     logger.error(all_sites)  # log the valid sites for reference
        #     raise ValueError(msg)
        self.site = site

        if area:
            base_url = self.url_templates['base']
            response = requests_get(base_url % {'site': self.site})
            soup = BeautifulSoup(response.content, 'html.parser')
            sublinks = soup.find('ul', {'class': 'sublinks'})
            if not sublinks or not sublinks.find('a', text=area):
                msg = "'%s' is not a valid area for site '%s'" % (area, site)
                logger.error(msg)
                raise ValueError(msg)
        self.area = area

        self.category = category or self.default_category

        url_template = self.url_templates['area' if area else 'no_area']
        self.url = url_template % {'site': self.site, 'area': self.area,
                                   'category': self.category}

        self.filters = {}
        for key, value in iteritems(filters or {}):
            try:
                filter_ = self.base_filters.get(key) or self.extra_filters[key]
                self.filters[filter_['url_key']] = filter_['value'] or value
            except KeyError:
                logger.warning("'%s' is not a valid filter", key)

    def set_logger(self, log_level):
        logger.setLevel(log_level)
        handler.setLevel(log_level)
    def get_results(self, limit=None, sort_by=None, geotagged=False):
        """
        Get results from Craigslist based on the specified filters.

        If geotagged=True, the results will include the (lat, lng) in the
        'geotag' attrib (this will make the process a little bit longer).
        """
        if sort_by:
            try:
                self.filters['sort'] = self.sort_by_options[sort_by]
            except KeyError:
                msg = ("'%s' is not a valid sort_by option, "
                       "use: 'newest', 'price_asc' or 'price_desc'" % sort_by)
                logger.error(msg)
                raise ValueError(msg)

        start = 0
        total_so_far = 0
        total = 0

        while True:
            self.filters['s'] = start
            response = requests_get(self.url, params=self.filters)
            logger.info('GET %s', response.url)
            logger.info('Response code: %s', response.status_code)
            response.raise_for_status()  # Something failed?

            soup = BeautifulSoup(response.content, 'html.parser')
            if not total:
                totalcount = soup.find('span', {'class': 'totalcount'})
                total = int(totalcount.text) if totalcount else 0

            for row in soup.find_all('p', {'class': 'row'}):
                if limit is not None and total_so_far >= limit:
                    break
                logger.debug('Processing %s of %s results ...',
                             total_so_far + 1, total)

                link = row.find('a', {'class': 'hdrlnk'})
                id = link.attrs['data-id']
                name = link.text
                url = urljoin(self.url, link.attrs['href'])

                time = row.find('time')
                if time:
                    datetime = time.attrs['datetime']
                else:
                    pl = row.find('span', {'class': 'pl'})
                    datetime = pl.text.split(':')[0].strip() if pl else None
                price = row.find('span', {'class': 'price'})
                where = row.find('small')
                p_text = row.find('span', {'class': 'p'}).text

                result = {'id': id,
                          'name': name,
                          'url': url,
                          'datetime': datetime,
                          'price': price.text if price else None,
                          'where': where.text.strip('() ') if where else None,
                          'has_image': 'pic' in p_text,
                          'has_map': 'map' in p_text,
                          'geotag': None}

                if geotagged:
                    self.geotag_result(result)
                yield result
                total_so_far += 1

            if total_so_far == limit:
                break
            if (total_so_far - start) < results_per_request:
                break  # Last page, no more results
            start = total_so_far
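
    # A minimal usage sketch (site, category, and filter values below are
    # illustrative assumptions, not from the original gist):
    #
    #     cl = CraigslistForSale(site='sfbay', category='sss',
    #                            filters={'query': 'fixie', 'has_image': True})
    #     for result in cl.get_results(limit=20, sort_by='newest'):
    #         print(result['name'], result['price'])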
    def geotag_result(self, result):
        """ Adds (lat, lng) to result. """
        logger.debug('Geotagging result ...')

        if result['has_map']:
            response = requests_get(result['url'])
            logger.info('GET %s', response.url)
            logger.info('Response code: %s', response.status_code)

            if response.ok:
                soup = BeautifulSoup(response.content, 'html.parser')
                map = soup.find('div', {'id': 'map'})
                if map:
                    result['geotag'] = (float(map.attrs['data-latitude']),
                                        float(map.attrs['data-longitude']))
        return result

    def geotag_results(self, results, workers=8):
        """
        Add (lat, lng) to each result. This process is done using N threads,
        where N is the number of workers defined (default: 8).
        """
        results = list(results)
        queue = Queue()
        for result in results:
            queue.put(result)

        def geotagger():
            while not queue.empty():
                logger.debug('%s results left to geotag ...', queue.qsize())
                self.geotag_result(queue.get())
                queue.task_done()

        threads = []
        for _ in range(workers):  # range, not xrange: PY3 has no xrange()
            thread = Thread(target=geotagger)
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
        return results
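
# Usage sketch for threaded geotagging (hedged; `cl` stands for any wrapper
# instance and is an assumption, not a name from the original gist):
#
#     results = cl.geotag_results(cl.get_results(limit=50), workers=4)
#     # Each result now carries a (lat, lng) tuple in result['geotag'] when
#     # the posting exposes a map, and None otherwise.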
class CraigslistCommunity(CraigslistBase):
    """ Craigslist community wrapper. """

    default_category = 'ccc'


class CraigslistEvents(CraigslistBase):
    """ Craigslist events wrapper. """

    default_category = 'eee'

    extra_filters = {
        'art': {'url_key': 'event_art', 'value': 1},
        'athletics': {'url_key': 'event_athletics', 'value': 1},
        'career': {'url_key': 'event_career', 'value': 1},
        'dance': {'url_key': 'event_dance', 'value': 1},
        'festival': {'url_key': 'event_festical', 'value': 1},
        'fitness': {'url_key': 'event_fitness_wellness', 'value': 1},
        'health': {'url_key': 'event_fitness_wellness', 'value': 1},
        'food': {'url_key': 'event_food', 'value': 1},
        'drink': {'url_key': 'event_food', 'value': 1},
        'free': {'url_key': 'event_free', 'value': 1},
        'fundraiser': {'url_key': 'event_fundraiser_vol', 'value': 1},
        'tech': {'url_key': 'event_geek', 'value': 1},
        'kid_friendly': {'url_key': 'event_kidfriendly', 'value': 1},
        'literacy': {'url_key': 'event_literacy', 'value': 1},
        'music': {'url_key': 'event_music', 'value': 1},
        'outdoor': {'url_key': 'event_outdoor', 'value': 1},
        'sale': {'url_key': 'event_sale', 'value': 1},
        'singles': {'url_key': 'event_singles', 'value': 1},
    }


class CraigslistForSale(CraigslistBase):
    """ Craigslist for sale wrapper. """

    default_category = 'sss'

    extra_filters = {
        'min_price': {'url_key': 'minAsk', 'value': None},
        'max_price': {'url_key': 'maxAsk', 'value': None},
        'make': {'url_key': 'autoMakeModel', 'value': None},
        'model': {'url_key': 'autoMakeModel', 'value': None},
        'min_year': {'url_key': 'autoMinYear', 'value': None},
        'max_year': {'url_key': 'autoMaxYear', 'value': None},
        'min_miles': {'url_key': 'autoMilesMin', 'value': None},
        'max_miles': {'url_key': 'autoMilesMax', 'value': None},
    }


class CraigslistGigs(CraigslistBase):
    """ Craigslist gigs wrapper. """

    default_category = 'ggg'

    extra_filters = {
        'is_paid': {'url_key': 'is_paid', 'value': None},
    }

    def __init__(self, *args, **kwargs):
        try:
            is_paid = kwargs['filters']['is_paid']
            kwargs['filters']['is_paid'] = 'yes' if is_paid else 'no'
        except KeyError:
            pass
        super(CraigslistGigs, self).__init__(*args, **kwargs)
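
# Note the boolean-to-string translation above: passing
# filters={'is_paid': True} sends is_paid=yes to Craigslist, and passing
# False sends is_paid=no.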
class CraigslistHousing(CraigslistBase):
    """ Craigslist housing wrapper. """

    default_category = 'hhh'

    extra_filters = {
        'private_room': {'url_key': 'private_room', 'value': 1},
        'private_bath': {'url_key': 'private_bath', 'value': 1},
        'cats_ok': {'url_key': 'pets_cat', 'value': 1},
        'dogs_ok': {'url_key': 'pets_dog', 'value': 1},
        'min_price': {'url_key': 'minAsk', 'value': None},
        'max_price': {'url_key': 'maxAsk', 'value': None},
        'min_ft2': {'url_key': 'minSqft', 'value': None},
        'max_ft2': {'url_key': 'maxSqft', 'value': None},
        'search_distance': {'url_key': 'search_distance', 'value': None},
        'zip_code': {'url_key': 'postal', 'value': None},
    }


class CraigslistJobs(CraigslistBase):
    """ Craigslist jobs wrapper. """

    default_category = 'jjj'

    extra_filters = {
        'is_contract': {'url_key': 'is_contract', 'value': 1},
        'is_internship': {'url_key': 'is_internship', 'value': 1},
        'is_nonprofit': {'url_key': 'is_nonprofit', 'value': 1},
        'is_parttime': {'url_key': 'is_parttime', 'value': 1},
        'is_telecommuting': {'url_key': 'is_telecommuting', 'value': 1},
    }


class CraigslistPersonals(CraigslistBase):
    """ Craigslist personals wrapper. """

    default_category = 'ppp'

    extra_filters = {
        'min_age': {'url_key': 'minAsk', 'value': None},
        'max_age': {'url_key': 'maxAsk', 'value': None},
    }


class CraigslistResumes(CraigslistBase):
    """ Craigslist resumes wrapper. """

    default_category = 'rrr'


class CraigslistServices(CraigslistBase):
    """ Craigslist services wrapper. """

    default_category = 'bbb'
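
# A minimal end-to-end sketch (everything below is illustrative: the site,
# area, filters, and printed fields are assumptions, not part of the original
# gist). Because this module does a relative import of `.sites`, it must be
# run from within its package (e.g. `python -m package.module`).
if __name__ == '__main__':
    housing = CraigslistHousing(site='sfbay', area='sfc',
                                filters={'max_price': 3000, 'cats_ok': True})
    for listing in housing.get_results(limit=10, sort_by='price_asc'):
        print(listing['name'], listing['price'], listing['where'])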