Last active
August 29, 2015 14:27
-
-
Save arthurdarcet/e9e7d8c80afe1c985467 to your computer and use it in GitHub Desktop.
Le Bon Coin watcher
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import argparse
import email.mime.text
import html
import logging
import logging.config
import re
import smtplib
import time
import urllib.parse

import bs4
import requests
# Pause between two polling rounds of the watcher loop.
SLEEP = 30  # seconds

# Host of the SMTP server used to deliver alert emails.
SMTP_SERVER = 'localhost'

# Subject line of alert emails; ``{q}`` is replaced by the search string.
EMAIL_SUBJECT = 'lbc-watcher: new match for search {q}'

# NOTE(review): this value looks like an email-obfuscation placeholder left by
# a web page rather than a real address -- confirm and set a valid sender.
EMAIL_FROM = '[email protected]'

# HTML body of an alert email. ``{q}`` is the search string and ``{items}``
# the concatenation of the rendered EMAIL_ITEM_TEMPLATE fragments.
EMAIL_TEMPLATE = """
<div style="background-color: #F8F8F8; padding-top: 40px; padding-bottom: 60px;">
<div style="max-width: 550px; margin: auto; background-color: #FFF; padding: 20px 50px;">
<p>The following items matched your search for <code>{q}</code>:</p>
<ul style="list-style: none; margin-top: 30px;">{items}</ul>
</div>
</div>
"""

# HTML fragment for one matched item; expects the ``image``, ``url``,
# ``title`` and ``price`` fields.
# ``max-width`` is written as a regular CSS declaration: the previous
# ``max-width="280px"`` closed the style attribute early and left a stray
# quote in the markup.
EMAIL_ITEM_TEMPLATE = """
<li style="border-bottom: 1px solid #ddd; margin: 20px 0; clear: both; height: 120px">
<img src="{image}" style="max-height: 100px; max-width: 180px; margin-right: 40px; float: left;">
<a href="{url}" style="float: left; padding-top: 40px; display: inline-block; max-width: 280px;">{title}</a>
<span style="margin-left: 30px; color: #999; float: right;">{price}</span>
</li>
"""

# All messages go through the root logger, which the script configures with
# ``logging.config.dictConfig`` at startup.
logger = logging.root
class BC:
    """Scraper for the leboncoin.fr search result pages of one region."""

    _url = 'http://www.leboncoin.fr'
    # Matches an absolute ad URL and captures its numeric id. The category
    # segment is accepted but not captured: it is not needed, not even to
    # reach the details page. The base URL is escaped so that its dots only
    # match literal dots.
    _link = re.compile(
        r'{}/[a-z0-9]+/([0-9]{{8,12}})\.htm'.format(re.escape(_url))
    )
    # Seconds to wait on the HTTP request; without a timeout ``requests``
    # may block indefinitely on an unresponsive server.
    _timeout = 30

    def __init__(self, region):
        """Remember the region slug used to build search URLs."""
        self.region = region

    @staticmethod
    def _first(node, selector, method, *args, **kwargs):
        """Call ``method`` on the first match of ``selector`` under ``node``.

        Returns ``None`` when the selector matches nothing.
        """
        found = node.select(selector)
        if not found:
            return None
        return getattr(found[0], method)(*args, **kwargs)

    def search(self, q, until=None):
        """Yield the ads listed on the first result page for search ``q``.

        Args:
            q: Free-text search string; it is url-encoded before use.
            until: Optional ad id. Iteration stops, without yielding that
                ad, as soon as an ad with this id is met.

        Yields:
            One dict per ad link, in page order, with the keys ``id``,
            ``url``, ``title``, ``price`` and ``image``. The last three are
            ``None`` when the matching element is absent.
        """
        query = urllib.parse.urlencode({'q': q})
        soup = self._req('annonces/offres/{}/?{}'.format(self.region, query))
        for a in soup.select('div.list-lbc a'):
            m = self._link.match(a.get('href', ''))
            if m is None:
                # Not a link to an ad.
                continue
            item_id = m.group(1)
            if until is not None and until == item_id:
                return
            logger.debug('Found id %r for search %r', item_id, q)
            yield {
                'id': item_id,
                'url': a.get('href', None),
                'title': self._first(a, '.title', 'get_text', strip=True),
                'price': self._first(a, '.price', 'get_text', strip=True),
                'image': self._first(a, '.image img', 'get', 'src', None),
            }

    def _req(self, p):
        """Fetch path ``p`` below the site root and return the parsed page.

        Raises:
            requests.RequestException: on connection failure or timeout.
        """
        url = '{}/{}'.format(self._url, p)
        logger.debug('LeBonCoin request: %r', url)
        response = requests.get(url, timeout=self._timeout)
        return bs4.BeautifulSoup(response.content, 'html.parser')
class RollingBC(BC):
    """BC variant that only reports ads not returned by a previous search.

    For each search string it remembers the id of the first ad returned by
    the latest non-empty search and uses it as the stop marker next time.
    """

    def __init__(self, *args, **kwargs):
        # Maps a search string to the id of the first ad of its last
        # non-empty result.
        self.seen = {}
        super().__init__(*args, **kwargs)

    def search(self, q):
        """Return the ads listed before the remembered one, or ``None``.

        Returns:
            A non-empty list of ad dicts, or ``None`` when there is nothing
            new for ``q``.
        """
        fresh = list(super().search(q, until=self.seen.get(q)))
        if fresh:
            self.seen[q] = fresh[0]['id']
            return fresh
        return None
def send_alert(to, q, items):
    """Email an HTML digest of the ads that matched search ``q``.

    Args:
        to: Recipient address, used as the ``To`` header.
        q: The search string; shown in the subject and in the body.
        items: Sequence of ad dicts providing the ``image``, ``url``,
            ``title`` and ``price`` fields used by ``EMAIL_ITEM_TEMPLATE``.

    Raises:
        smtplib.SMTPException, OSError: when the message cannot be sent.
    """
    logger.debug('Notifying %s with %s items for search %r', to, len(items), q)

    def esc(value):
        # Field values are scraped from a web page, so they must not be
        # able to inject markup; missing fields render as empty text.
        return '' if value is None else html.escape(str(value), quote=True)

    rendered = ''.join(
        EMAIL_ITEM_TEMPLATE.format(**{k: esc(v) for k, v in it.items()})
        for it in items
    )
    content = EMAIL_TEMPLATE.format(q=esc(q), items=rendered)

    msg = email.mime.text.MIMEText(content, 'html')
    msg['Subject'] = EMAIL_SUBJECT.format(q=q)
    msg['From'] = EMAIL_FROM
    msg['To'] = to

    # The context manager closes the connection even if sending fails.
    with smtplib.SMTP(SMTP_SERVER) as smtp:
        smtp.send_message(msg)
# Command line: options plus one or more search strings to watch.
parser = argparse.ArgumentParser()
parser.add_argument('-d', '--debug', action='store_true', help='Log debug messages')
parser.add_argument('-r', '--region', nargs='?', help='Region', default='ile_de_france')
parser.add_argument('-e', '--email', nargs='?', help='Email to send the alert to', default=None)
parser.add_argument('searches', nargs='+', help='Searches to perform')
args = parser.parse_args()

# Log to the console through the root logger; --debug lowers the level.
logging.config.dictConfig({
    'version': 1,
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'clean',
        },
    },
    'formatters': {
        'clean': {
            'format': '{asctime} | {levelname:^8} | {message}',
            'datefmt': '%Y-%m-%d %H:%M:%S',
            'style': '{',
        },
    },
    'root': {
        'handlers': ['console'],
        'level': 'DEBUG' if args.debug else 'INFO',
    },
})

bc = RollingBC(args.region)

# Priming pass: the results are discarded, it only records the current first
# ad of each search so that the loop below reports later ads only. Errors
# are deliberately not caught here so a broken setup fails at startup.
for q in args.searches:
    bc.search(q)

while True:
    time.sleep(SLEEP)
    logger.debug('Wakeup')
    for q in args.searches:
        try:
            new = bc.search(q)
        except requests.RequestException:
            # A transient network failure must not stop the watcher; the
            # search is simply attempted again on the next wakeup.
            logger.exception('Search %r failed, retrying on next wakeup', q)
            continue
        if not new:
            continue
        for it in new:
            logger.info('Found new result for search %r: %s', q, it)
        if args.email:
            try:
                send_alert(args.email, q, new)
            except (smtplib.SMTPException, OSError):
                # The matches were already logged above; keep watching.
                logger.exception('Could not send alert for search %r', q)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment