
@gunyarakun
Created February 28, 2015 05:45
Craigslist crawling with Slack notifications
#!/usr/bin/env python
# encoding: utf-8
import json
import urllib
import urllib2
import codecs
import time
from bs4 import BeautifulSoup

token = 'xoxp-xxxxxxxx-xxxxxxxx-xxxxxxxx-xxxxxx'
#crawl_url = 'http://sfbay.craigslist.org/search/cto'
crawl_url = 'http://sfbay.craigslist.org/search/cto?sort=priceasc&minAsk=3000&maxAsk=10000&auto_transmission=2&query=mini+cooper+convertible'
bot_name = 'craigs bot'
channel_name = 'craigs-check'


class SlackClient:
    """Minimal Slack Web API client using a legacy token."""

    def __init__(self, token, bot_name='gunyabot'):
        self.token = token
        self.bot_name = bot_name
        self.channels = self.get_channels()

    @staticmethod
    def encoded_dict(in_dict):
        # Normalize all values to UTF-8 encoded byte strings.
        out_dict = {}
        for k, v in in_dict.iteritems():
            if isinstance(v, unicode):
                v = v.encode('utf8')
            elif isinstance(v, str):
                # Must be encoded in UTF-8; raises if it is not.
                v.decode('utf8')
            out_dict[k] = v
        return out_dict

    def request(self, request='?', post_data={}, domain='slack.com'):
        # POST to https://slack.com/api/<method> and return the decoded JSON.
        post_data['token'] = self.token
        enc_post_data = urllib.urlencode(self.encoded_dict(post_data))
        url = 'https://{}/api/{}'.format(domain, request)
        response = urllib2.urlopen(url, enc_post_data)
        response_str = response.read()
        response_obj = json.loads(response_str)
        if not response_obj['ok']:
            raise ValueError
        return response_obj

    def get_channels(self):
        # Map channel name -> channel object via channels.list.
        d = {}
        for channel in self.request('channels.list')['channels']:
            d[channel['name']] = channel
        return d

    def post_message(self, channel_name, text, attachments=[]):
        post_data = {
            'channel': self.channels[channel_name]['id'],
            'text': text,
            'username': self.bot_name,
            'attachments': json.dumps(attachments),
        }
        return self.request('chat.postMessage', post_data)


def parse_item_page(item_url):
    # Scrape a single Craigslist listing and build a Slack attachment dict.
    f = codecs.getreader('utf-8')(urllib.urlopen(item_url))
    html = f.read()
    soup = BeautifulSoup(html)
    posting_title_h2 = soup.find('h2', class_='postingtitle')
    title = posting_title_h2.find_next(True).next_sibling
    price = title.next_sibling
    info = {
        'title': title.string + price.string,
        'title_link': item_url,
    }
    photo_img = soup.find('img')
    if photo_img:
        info['image_url'] = photo_img['src']
    return info


# Poll the search results every 30 seconds and post new listings to Slack.
sc = SlackClient(token, bot_name)
first_run = True
checked_item_hrefs = set()
while True:
    f = codecs.getreader('utf-8')(urllib.urlopen(crawl_url))
    html = f.read()
    soup = BeautifulSoup(html)
    item_links = soup.find_all('a', class_='hdrlnk')
    for item_link in item_links:
        href = item_link['href']
        if href in checked_item_hrefs:
            break
        if not first_run:
            item_url = 'http://sfbay.craigslist.org' + href
            item_info = parse_item_page(item_url)
            item_info['fallback'] = item_info['title']
            sc.post_message(channel_name, u'Found ' + item_url, [item_info])
        checked_item_hrefs.add(href)
    first_run = False
    time.sleep(30)
Dependency: pip install beautifulsoup4
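
The script above targets Python 2 (urllib2, iteritems, unicode). A rough Python 3 sketch of the same poll-and-notify loop is shown below; the token, channel ID, and search URL are placeholders, and the Slack call mirrors chat.postMessage with the token passed as a Bearer header rather than in the form body. It is an illustration of the technique, not a drop-in replacement.

#!/usr/bin/env python3
# Sketch: Python 3 equivalent of the polling loop above.
import json
import time
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

TOKEN = 'xoxb-your-token'       # placeholder: a bot token with chat:write
CHANNEL_ID = 'C0123456789'      # placeholder: target channel ID
CRAWL_URL = 'https://sfbay.craigslist.org/search/cto?query=mini+cooper+convertible'


def post_message(text):
    """Post a plain text message via chat.postMessage."""
    data = urllib.parse.urlencode({'channel': CHANNEL_ID, 'text': text}).encode('utf-8')
    req = urllib.request.Request(
        'https://slack.com/api/chat.postMessage',
        data=data,
        headers={'Authorization': 'Bearer ' + TOKEN},
    )
    with urllib.request.urlopen(req) as resp:
        body = json.loads(resp.read().decode('utf-8'))
    if not body.get('ok'):
        raise ValueError(body.get('error', 'Slack API call failed'))
    return body


seen = set()
first_run = True
while True:
    with urllib.request.urlopen(CRAWL_URL) as resp:
        soup = BeautifulSoup(resp.read(), 'html.parser')
    # The 'hdrlnk' class matches the 2015-era Craigslist result markup
    # used by the original script; the current markup may differ.
    for link in soup.find_all('a', class_='hdrlnk'):
        href = link['href']
        if href in seen:
            continue
        seen.add(href)
        if not first_run:
            post_message('Found ' + urllib.parse.urljoin(CRAWL_URL, href))
    first_run = False
    time.sleep(30)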