Last active
August 29, 2015 14:16
-
-
Save starenka/d3fce9886dc67bc48124 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# coding=utf-8
# pip install requests beautifulsoup4
# TODO: handle category selection (broken server-side)
#       fix occasional wrong price detection
from __future__ import division

import argparse
import datetime
import itertools
import re
import smtplib
from email.header import Header
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

import requests
from bs4 import BeautifulSoup
# Number of ads requested per result page (sent as the "i" query parameter).
PER_PAGE = 50
URL_BASE = 'http://hudebnibazar.cz'
# Query-string template; {page}, {term} and {ad_type} are filled in per request
# with str.format(), while the page size is baked in here.
URL_SEARCH = '?is=1&p={page}&f={term}&n={ad_type}&r=&i=%d&o=datum' % PER_PAGE
# Category name -> URL path appended to URL_BASE.
URL_CATEGORIES = {
    'guitarother': '/kytarova-pouzdra-a-prislusenstvi/110600/',
    'guitarfx': '/kytarove-efekty/110500/',
    'any': '/vsechny-kategorie/0/',
}
# Ad type name -> value of the "n" query parameter.
AD_TYPES = dict(sell='nabidka', buy='poptavka', other='ruzne')
# Czech month names in calendar order, in the form looked up by parse_date().
MONTHS = (
    u'ledna', u'února', u'března', u'dubna', u'května', u'června',
    u'července', u'srpna', u'září', u'října', u'listopadu', u'prosince',
)
# Month name -> month number (1-12).
MONTHS_MAP = dict(zip(MONTHS, range(1, 13)))
def parse_date(date_str):
    """Return the posting date described by an ad's contact line.

    An explicit "<day>. <month name>" wins; the matched text carries no year,
    so the most recent such date that is not in the future is used.  Without
    an explicit date the result is today when the text contains 'dnes'
    (Czech for "today") and yesterday otherwise.

    :param date_str: text that may contain a date
    :return: a ``datetime.date``
    """
    today = datetime.date.today()
    # Scan every "<number>. <word>" pair: the first one need not be the date
    # (the word may not be a month name at all).
    for found in re.finditer(r'(\d+)\.\s+(\w+)', date_str, re.UNICODE):
        month_num = MONTHS_MAP.get(found.group(2).lower())
        if month_num is None:
            continue
        day = int(found.group(1))
        # Try this year first so dates earlier in the current month are not
        # pushed a year back; fall back to last year for dates still ahead.
        for year in (today.year, today.year - 1):
            try:
                candidate = datetime.date(year, month_num, day)
            except ValueError:
                # Impossible day for that month/year (e.g. 31st, Feb 29th).
                continue
            if candidate <= today:
                return candidate
    if 'dnes' in date_str:
        return today
    return today - datetime.timedelta(days=1)
# A price such as "1500 Kč", "1 500 Kč", "1.500 Kč", "1500,- Kč" or "1500Kč".
# Group 1 is the whole-crown part, possibly with thousands separators
# (space, no-break space or dot); a trailing ",NN" / ",-" is ignored.
_PRICE_RE = re.compile(
    u'(\\d{1,3}(?:[ \u00a0.]\\d{3})+|\\d+)(?:,(?:\\d+|-))?\\s*Kč', re.UNICODE)


def _parse_price_czk(price):
    """Return the whole-crown amount found in *price*, or None if there is none."""
    found = _PRICE_RE.search(price)
    if found is None:
        return None
    # Drop the thousands separators before converting.
    return int(re.sub(r'\D', '', found.group(1)))


def parse_doc(resp, days, max_price, **kwargs):
    """Yield the ads found in one result page.

    :param resp: response object whose ``content`` holds the page HTML
    :param days: if truthy, skip ads older than this many days
    :param max_price: if truthy, skip ads without a recognisable CZK price
        and ads priced above this value
    :param kwargs: accepted and ignored
    :return: generator of ``(title, link, text, img, price, loc_data)``
        tuples; ``img`` is None when the ad has no image link and ``price``
        is the raw price text when no CZK amount was recognised
    """
    # NOTE(review): no parser is named, so bs4 picks whichever is installed.
    doc = BeautifulSoup(resp.content)
    today = datetime.date.today()
    for ad in doc.select('td.InzeratBody'):
        contact = ad.select('div.InzeratKontakt')
        loc_data = contact[0].text if contact else ''
        if days and (today - parse_date(loc_data)).days > days:
            continue

        heading = ad.select('div.InzeratNadpisSmall')
        anchor = heading[0].find('a') if heading else None
        if anchor is None or not anchor.get('href'):
            # Without a title link there is nothing useful to report.
            continue
        title, link = heading[0].text, URL_BASE + anchor['href']

        images = ad.select('a.fancybox')
        img = images[0].get('href') if images else None

        body = ad.select('div.InzeratTextSmall')
        text = body[0].text if body else ''

        price_node = ad.select('div.InzeratCenaSmall')
        price = price_node[0].text if price_node else '?'
        price_czk = _parse_price_czk(price)
        if max_price and (price_czk is None or price_czk > max_price):
            continue

        shown_price = (u'%d Kč' % price_czk) if price_czk is not None else price
        yield title, link, text, img, shown_price, loc_data
def search(term, category='any', ad_type='sell', days=0, max_price=None, timeout=30, **kwargs):
    """Search the bazaar for *term* and return an iterator over matching ads.

    Fetches the first result page, reads the total hit count from it and then
    fetches the remaining pages (at most 10 in total).  Fetching stops at the
    first follow-up page that does not return HTTP 200.

    :param term: search term; inserted into the URL as given
    :param category: key of ``URL_CATEGORIES``
    :param ad_type: key of ``AD_TYPES``
    :param days: passed to ``parse_doc``
    :param max_price: passed to ``parse_doc``
    :param timeout: seconds to wait for each HTTP request
    :param kwargs: passed through to ``parse_doc``
    :return: iterator of the tuples yielded by ``parse_doc`` for every page
    :raises KeyError: for an unknown *category* or *ad_type*
    """
    max_pages = 10  # 500 items should be enough
    base_url = URL_BASE + URL_CATEGORIES[category]
    site_ad_type = AD_TYPES[ad_type]

    def fetch(page):
        query = URL_SEARCH.format(term=term, ad_type=site_ad_type, page=page)
        return requests.get(base_url + query, timeout=timeout)

    docs = [fetch(1)]
    # Match on the raw bytes so the lookup works on Python 2 and 3 alike.
    found = re.search(br'Celkem\s(\d+)', docs[0].content)
    # No total on the page: only the first page is parsed.
    count = int(found.group(1)) if found else 0
    # Round up, so an exact multiple of PER_PAGE does not request an extra page.
    pages = min((count + PER_PAGE - 1) // PER_PAGE, max_pages)
    for page in range(2, pages + 1):
        next_ = fetch(page)
        if next_.status_code != requests.codes.ok:
            break
        docs.append(next_)
    return itertools.chain.from_iterable(
        parse_doc(doc, days=days, max_price=max_price, **kwargs) for doc in docs)
def mail(subj, mess, to_, from_='[email protected]'):
    """Send *mess* as a UTF-8 plain-text e-mail via the SMTP server on localhost.

    :param subj: subject line; may contain non-ASCII text
    :param mess: message body
    :param to_: recipient address
    :param from_: sender address.  NOTE(review): the default looks like a
        redacted placeholder rather than a usable address -- confirm, or
        always pass ``from_`` explicitly.
    """
    msg = MIMEMultipart('alternative')
    # Encode the subject explicitly instead of relying on implicit fallback.
    msg['Subject'] = Header(subj, 'utf-8')
    msg['From'] = from_
    msg['To'] = to_
    msg.attach(MIMEText(mess, 'plain', 'utf-8'))

    server = smtplib.SMTP('localhost')
    try:
        server.sendmail(from_, [to_], msg.as_string())
    finally:
        # Close the session even when sending fails.
        server.quit()
if __name__ == '__main__':
    # Command-line entry point: search every term, then mail or print the hits.
    parser = argparse.ArgumentParser()
    parser.add_argument('terms', nargs="+", help='search terms')
    # Category and ad-type selection are disabled (see the TODO at the top of
    # the file); search() falls back to its own defaults.
    # parser.add_argument('-c', '--category', default='any', choices=URL_CATEGORIES.keys(),
    #                     help='category to search (defaults to all)')
    # parser.add_argument('-t', '--ad_type', default='sell', choices=AD_TYPES.keys(), help='ad type')
    parser.add_argument('-p', '--max_price', type=int, default=0, help='max price')
    parser.add_argument('-m', '--mail', default=False, help='mail to',)
    parser.add_argument('-s', '--short', action='store_true', default=False, help='short listing',)
    parser.add_argument('-d', '--days', type=int, default=0, help='just ads within [days]')
    args = parser.parse_args()

    # Collect the pieces and join once instead of growing a string in the loop.
    chunks = []
    for term in args.terms:
        # Every parsed option is handed to search(); the ones it does not name
        # travel on in its **kwargs.
        for title, link, text, img, price, loc_data in search(term, **vars(args)):
            chunks.append(' '.join((title, price, loc_data)))
            if not args.short:
                chunks.append('\n%s' % text)
            chunks.append('\n%s\n\n' % ' '.join((link, img if img else '')))
    body = ''.join(chunks)

    # Nothing found: stay silent (no mail, no output).
    if body:
        if args.mail:
            subj = u'bazarbot "%s"' % ' OR '.join(args.terms)
            if args.max_price:
                subj += u' < %d Kč' % args.max_price
            mail(subj, body, to_=args.mail)
        else:
            # Function-call form works on both Python 2 and Python 3.
            print(body)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
hbazar.py 'dunlop mxr' 'boss' 'whammy' -p1600 -d1 [email protected]
# mail me any dunlop/boss/whammy hits cheaper than 1600 CZK added since yesterday (put this into your crontab)
hbazar.py 'fender jazzmaster'
# show me any ads selling a jazzmaster