Last active
November 17, 2021 22:00
-
-
Save moloch--/ce04f5623ec3161bb1fd to your computer and use it in GitHub Desktop.
Craigslist Bot/Parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
###################################### | |
# | |
# Author: Moloch | |
# | |
# Required libs: | |
# pip install requests | |
# pip install beautifulsoup4 | |
# pip install PyRSS2Gen | |
# pip install python-dateutil | |
###################################### | |
import sys | |
import logging | |
import requests | |
import argparse | |
import platform | |
import PyRSS2Gen | |
import xml.dom.minidom | |
from urllib import urlencode | |
from urlparse import urljoin | |
from datetime import datetime | |
from bs4 import BeautifulSoup | |
from dateutil import parser as dateparser | |
# Base URL for Craigslist SF-bay apartment ("apa") search results
SEARCH_URL = 'https://sfbay.craigslist.org/search/apa/'

# ANSI escape sequences for colored status prefixes; only used on platforms
# whose terminals commonly support them (Linux / macOS).
if platform.system().lower() in ['linux', 'darwin']:
    INFO = "\033[1m\033[36m[*]\033[0m "   # bold cyan [*]
    WARN = "\033[1m\033[31m[!]\033[0m "   # bold red [!]
    BOLD = "\033[1m"
else:
    # Plain-text fallback (e.g. Windows consoles without ANSI support)
    INFO = "[*] "
    WARN = "[!] "
    BOLD = ""
def print_info(msg):
    ''' Erase the current terminal line, then print an info-prefixed message '''
    # ESC[2K clears the whole line; '\r' returns the cursor to column 0 so
    # successive calls overwrite each other (progress-style output).
    clear_line = chr(27) + '[2K'
    sys.stdout.write(clear_line)
    sys.stdout.write('\r' + INFO + msg)
    sys.stdout.flush()
class Advertisement(object):
    ''' A single Craigslist apartment listing.

    Constructed from a search-results <p> tag; fetches and parses the full
    ad page once in __init__ (network I/O) and exposes its fields as
    properties.
    '''

    headers = {
        'User-Agent': "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)"
    }
    _soup = None  # parsed ad page, populated by _make_soup()

    def __init__(self, area_code, p):
        '''
        @param area_code: Craigslist area code used in ad URLs (e.g. 'sfc')
        @param p: BeautifulSoup tag for one search result (carries the
                  data-pid / data-latitude / data-longitude attrs and the
                  price <span>)
        '''
        self.area_code = area_code
        self.pid = p.attrs['data-pid']
        self.geo = p     # setter extracts lat/long from the tag
        self.price = p   # setter extracts the price from the tag
        self._make_soup()

    @property
    def title(self):
        ''' Ad page <title>, stripped of surrounding whitespace '''
        return self._soup.title.text.strip()

    @property
    def created(self):
        ''' Posting timestamp parsed from the ad page's <time> element '''
        posted = self._soup.find('time').attrs['datetime']
        return dateparser.parse(posted)

    @property
    def geo(self):
        ''' Returns a tuple of (latitude, longitude); (0.0, 0.0) if absent '''
        return (self.latitude, self.longitude,)

    @geo.setter
    def geo(self, p):
        # Missing coordinates default to 0 rather than raising
        self.latitude = float(p.attrs.get('data-latitude', 0))
        self.longitude = float(p.attrs.get('data-longitude', 0))

    @property
    def price(self):
        ''' Monthly rent in whole dollars; 0 when the ad lists no price '''
        return self._price

    @price.setter
    def price(self, p):
        tag = p.find('span', attrs={"class": "price"})
        # BUG FIX: prices may be formatted like "$1,200" — strip the comma
        # and any stray whitespace too, not just the dollar sign, so int()
        # does not raise ValueError.
        if tag is not None:
            self._price = int(tag.text.replace('$', '').replace(',', '').strip())
        else:
            self._price = 0

    @property
    def href(self):
        ''' Canonical URL of the full ad page '''
        return "https://sfbay.craigslist.org/%s/apa/%s.html" % (self.area_code, self.pid)

    @property
    def images(self):
        ''' Returns URLs for related images (empty list if no thumbnails) '''
        thumbs = self._soup.find('div', attrs={'id': 'thumbs'})
        return [
            a.attrs['href'] for a in thumbs.find_all('a', attrs={'href': True})
        ] if thumbs else []

    @property
    def description(self):
        ''' Full posting body text, concatenated from its child tags '''
        body = self._soup.find('section', attrs={'id': 'postingbody'})
        text = [tag.text for tag in body.children if hasattr(tag, 'text')]
        return ''.join(text)

    def _make_soup(self):
        ''' Fetch the ad page and parse it (network I/O) '''
        resp = requests.get(self.href, headers=self.headers)
        self._soup = BeautifulSoup(resp.text, "html5lib")

    def __cmp__(self, other):
        ''' Order ads by price (Python 2 rich-comparison fallback) '''
        if other.price < self.price:
            return 1
        elif self.price == other.price:
            return 0
        else:
            return -1

    def __lt__(self, other):
        ''' Price ordering for Python 3 (where __cmp__ is ignored) '''
        return self.price < other.price

    def __eq__(self, other):
        ''' Ads are identified by Craigslist posting id '''
        return self.pid == other.pid

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        # BUG FIX: __eq__ without __hash__ leaves identity-based hashing in
        # py2 (and unhashability in py3); hash on pid to match __eq__.
        return hash(self.pid)

    def __str__(self):
        return self.title

    def __repr__(self):
        return '<Advertisement price: $%s, geo: %s, href: %s>' % (
            self.price, self.geo, self.href
        )
class Craigslist(object):
    ''' Scraper for Craigslist apartment search results in a given area.

    Builds the search URL from the constructor arguments, fetches every
    results page (100 ads per page), and yields Advertisement objects.
    '''

    headers = {
        'User-Agent': "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)"
    }

    # Human-friendly area name -> Craigslist area code used in URLs
    area_codes = {
        'sf': 'sfc',
        'east-bay': 'eby',
    }

    # Area -> {neighborhood name: Craigslist 'nh' query-parameter value}
    neigborhood_codes = {
        'sf': {
            'bayview': 2,
            'castro': 4,
            'upper market': 4,
            'mission district': 18,
            'nob hill': 19,
            'potrero hill': 25,
            'russian hill': 27,
        },
        'east-bay': {
            'berkeley': 48,
            'north berkeley': 49,
            'berkeley hills': 49,
        }
    }

    def __init__(self, area, min_rent='', max_rent='', beds='', neighborhoods=None, cats=False, dogs=False):
        ''' Build the search query and fetch every page of results.

        @param area: key into area_codes ('sf' or 'east-bay')
        @param min_rent: minimum monthly rent ('' = no bound)
        @param max_rent: maximum monthly rent ('' = no bound)
        @param beds: minimum number of bedrooms ('' = any)
        @param neighborhoods: iterable of neighborhood names (keys of
            neigborhood_codes[area]); None means no neighborhood filter.
            BUG FIX: was a shared mutable default ([]).
        @param cats: restrict to cat-friendly listings
        @param dogs: restrict to dog-friendly listings
        @raises NotImplementedError: when the area is not supported
        '''
        params = {
            'minAsk': min_rent,
            'maxAsk': max_rent,
            'bedrooms': beds,
        }
        if area not in self.area_codes:
            raise NotImplementedError("That area is not implemented yet")
        else:
            self.area_code = self.area_codes[area]
        self.query = SEARCH_URL + self.area_code + '?' + urlencode(params)
        for hood in (neighborhoods or []):
            self.query += '&nh=%d' % self.neigborhood_codes[area][hood]
        # Magic values Craigslist uses for its pet-filter checkboxes
        if cats:
            self.query += '&addTwo=purrr'
        if dogs:
            self.query += '&addThree=wooof'
        self._make_soup(self.query)
        self._pages()

    @classmethod
    def neighborhoods(cls, area):
        ''' List the neighborhood names known for an area.

        @raises NotImplementedError: when the area is not supported
        '''
        if area not in cls.neigborhood_codes:
            raise NotImplementedError("That area is not implemented yet")
        return cls.neigborhood_codes[area].keys()

    def _make_soup(self, url):
        ''' Fetch a results page and parse it (network I/O).

        Uses the "html5lib" parser explicitly, consistent with
        Advertisement, instead of bs4's unspecified default.
        '''
        self._response = requests.get(url, headers=self.headers)
        self._soup = BeautifulSoup(self._response.text, "html5lib")

    def _pages(self):
        ''' Collect result <p> tags, following pagination 100 ads at a time '''
        self._paragraphs = self._soup.find_all('p', attrs={'data-pid': True})
        if 100 < len(self):
            for index in range(100, len(self), 100):
                # 's' is Craigslist's results-offset query parameter
                self._make_soup(self.query + '&s=%d' % index)
                self._paragraphs += self._soup.find_all(
                    'p', attrs={'data-pid': True})

    def __iter__(self):
        ''' Yield one Advertisement per result (each fetches its ad page) '''
        for p in self._paragraphs:
            yield Advertisement(self.area_code, p)

    def __len__(self):
        ''' Total result count as reported by the search page; 0 if absent '''
        count = self._soup.find('span', attrs={'class': 'resultcount'})
        return int(count.text) if count else 0
def create_rss(craigslist, title, link, description):
    ''' Build and return a PyRSS2Gen.RSS2 feed from every ad in craigslist.

    @param craigslist: iterable of Advertisement objects (also supports len())
    @param title/link/description: feed-level RSS metadata
    '''
    rss_items = []
    for count, ad in enumerate(craigslist, start=1):
        print_info("Retrieving RSS item %d of %d" %
                   (count, len(craigslist)))
        rss_items.append(PyRSS2Gen.RSSItem(
            title=ad.title,
            link=ad.href,
            description=ad.description,
            guid=PyRSS2Gen.Guid(ad.href),
            pubDate=ad.created,
        ))
    print_info("Successfully retrieved all items\n")
    return PyRSS2Gen.RSS2(
        title=title,
        link=link,
        description=description,
        lastBuildDate=datetime.now(),
        items=rss_items,
    )
if __name__ == '__main__':

    def _cli(args):
        ''' Run a search from parsed CLI arguments; print results or write RSS '''
        try:
            craig = Craigslist(
                area=args.area,
                min_rent=args.min,
                max_rent=args.max,
                beds=args.beds,
                cats=args.cats,
                dogs=args.dogs,
                # NOTE(review): always searches every known neighborhood for
                # the area — there is no CLI flag to narrow this list.
                neighborhoods=Craigslist.neighborhoods(args.area),
            )
            print_info('Found %d results ...\n' % len(craig))
            if not args.rss:
                # Console mode: one numbered "N) $price - title" line per ad,
                # optionally followed by the ad link and image URLs.
                for index, ad in enumerate(craig):
                    print "%d) $%d - %s" % (index + 1, ad.price, unicode(ad))
                    if args.links:
                        print "\t%s" % ad.href
                    if args.images:
                        for image in ad.images:
                            print "\t%s" % image
            else:
                # RSS mode: build the feed, pretty-print the XML, write to file
                rss = create_rss(craig, "Craigslist2Rss", "", "")
                print_info("Writing data to file ...")
                with open(args.rss, "w") as fp:
                    doc = xml.dom.minidom.parseString(rss.to_xml())
                    rss_xml = doc.toprettyxml().encode('utf-8')
                    fp.write(rss_xml)
                print_info("Wrote %d byte(s) to %s\n" %
                           (len(rss_xml), args.rss))
        except KeyboardInterrupt:
            # chr(27) + '[2K\r' clears the in-progress status line first
            print chr(27) + '[2K\r' + WARN + 'Stopping ...'
        except NotImplementedError as error:
            # Raised by Craigslist/neighborhoods for unsupported areas
            print chr(27) + '[2K\r' + WARN + str(error)

    parser = argparse.ArgumentParser(
        description='Craigslist parser/scraper',
    )
    parser.add_argument('--version',
        action='version',
        version='%(prog)s v0.0.1'
    )
    parser.add_argument('--verbose', '-v',
        help='display verbose output (default: false)',
        action='store_true',
        dest='verbose',
    )
    parser.add_argument('--area', '-a',
        help='specify a search area %s' % Craigslist.area_codes.keys(),
        dest='area',
        required=True,
    )
    parser.add_argument('--beds', '-b',
        help='min number of beds',
        type=int,
        dest='beds',
    )
    parser.add_argument('--min-rent', '-mn',
        help='min monthly rent',
        type=int,
        default=0,
        dest='min',
    )
    parser.add_argument('--max-rent', '-mx',
        help='max monthly rent',
        type=int,
        default=100000,
        dest='max',
    )
    parser.add_argument('--dogs', '-d',
        help='allows dogs',
        action='store_true',
        dest='dogs',
    )
    parser.add_argument('--cats', '-c',
        help='allows cats',
        action='store_true',
        dest='cats',
    )
    parser.add_argument('--images', '-i',
        help='display image links',
        action='store_true',
        dest='images',
    )
    parser.add_argument('--links', '-l',
        help='display links to ad',
        action='store_true',
        dest='links',
    )
    parser.add_argument('--rss', '-r',
        help='output results to an rss formatted xml file',
        dest='rss',
    )
    args = parser.parse_args()
    if args.verbose:
        # Root logger at DEBUG so library (e.g. requests) logging shows too
        logger = logging.getLogger()
        logging.basicConfig(
            format='[%(levelname)s] %(asctime)s - %(message)s',
            level=logging.DEBUG
        )
    _cli(args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment