Last active
August 29, 2015 14:03
-
-
Save mauromarano/69808698f39ae1e33f88 to your computer and use it in GitHub Desktop.
Controlla su subito se è stata inserita una moto che risponde a determinati criteri. Se la trova ti notifica via email
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python
#-*- coding: utf-8 -*-
# HOW TO:
# 1. Set the class variables at the top of the Scraper class (e-mail credentials, search URL, patterns).
# 2. Run `crontab -e` and add this entry to launch the script automatically every hour (at minute 1):
#        1 * * * * python spider.py
import re
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

import requests
from bs4 import BeautifulSoup
class Scraper:
    """Watch a subito.it motorbike search and e-mail every new matching ad.

    Instantiating the class performs one complete run: it downloads the
    search page, makes sure the data file exists, and then sends one e-mail
    for each listed ad whose title matches one of ``patterns`` and whose URL
    is not yet recorded in ``data_file``.

    All settings are class attributes so they can be edited in place or
    overridden in a subclass.
    """

    # Search-results page downloaded on every run.
    min_750_url = 'http://www.subito.it/annunci-calabria/vendita/moto-e-scooter/?ccs=750'
    # Regular expressions searched case-insensitively in each ad title.
    patterns = ['r1', 'gsxr', 'gsx-r', 'cbr', 'ninja', 'fz1', 'z1000', 'speed']
    # Text file holding one already-seen ad URL per line.
    data_file = 'database.txt'
    # Not used by any method of this class; kept for backward compatibility.
    bikes = []
    # Prefix joined to the href found in each search result.
    base_url = 'http://www.subito.it'
    # SMTP login credentials.
    gmail_user = ''
    gmail_password = ''
    # Recipient: a single address string or a list/tuple/set of addresses.
    to_email = ''
    from_email = '[email protected]'
    # SMTP endpoint; the connection is upgraded with STARTTLS before login.
    smtp_host = 'smtp.gmail.com'
    smtp_port = 587
    # Seconds to wait for the web server before giving up on a request.
    request_timeout = 30
    # Parser handed to BeautifulSoup; naming it keeps the parse tree
    # independent of which optional parsers happen to be installed.
    html_parser = 'html.parser'

    def __init__(self):
        """Download the search page and process it immediately.

        Note that construction performs network requests, creates the data
        file when it is missing, and may send e-mails.
        """
        self.soup = self.get_source(self.min_750_url)
        if not self.check_for_data():
            self.create_data_file()
        self.get_bikes()

    def get_source(self, url):
        """Download ``url`` and return its HTML parsed as a BeautifulSoup tree."""
        # Without a timeout a stalled server would block the cron job forever.
        response = requests.get(url, timeout=self.request_timeout)
        return BeautifulSoup(response.text, self.html_parser)

    def check_for_data(self):
        """Return True if the data file can be opened for reading, else False."""
        try:
            with open(self.data_file):
                return True
        except (IOError, OSError):
            return False

    def create_data_file(self):
        """Create (or truncate) the data file.

        Returns True on success and False if the file cannot be written.
        """
        try:
            with open(self.data_file, 'w') as handle:
                handle.write('')
        except (IOError, OSError):
            return False
        return True

    def _matches(self, title):
        """Return True if ``title`` matches any of ``patterns`` (ignoring case)."""
        return any(re.search(pattern, title, re.I) for pattern in self.patterns)

    @staticmethod
    def _first_group(pattern, text, default=''):
        """Return group 1 of the first ``pattern`` match in ``text``.

        ``default`` is returned when the pattern does not match, so one ad
        with an unexpected layout cannot abort the whole run.
        """
        found = re.search(pattern, text)
        if found is None:
            return default
        return found.group(1)

    def get_bikes(self):
        """Scan the search results and e-mail every new matching ad.

        Each matching ad is handled once, no matter how many patterns its
        title matches. List items without a title or a link are skipped.
        """
        for item in self.soup.select('.list > li'):
            title_tag = item.find('strong')
            title = title_tag.string if title_tag is not None else None
            if not title or not self._matches(title):
                continue

            link = item.find('a')
            if link is None or not link.get('href'):
                continue
            url = self.base_url + link['href']

            city_tag = item.find(class_='city')
            city_text = None
            if city_tag is not None and city_tag.p is not None:
                city_text = city_tag.p.string

            obj = {
                'url': url,
                # Price and time are extracted from the element's markup;
                # a missing element renders as 'None' and yields ''.
                'price': self._first_group(r'(.+)\s€', str(item.find(class_='price'))),
                'title': title,
                'city': self._first_group(r'(\w+)', city_text or ''),
                'time': self._first_group(r'(\d\d:\d\d)', str(item.find(class_='date'))),
            }

            # is_old() also records the URL, so an ad is notified at most once.
            if self.is_old(url):
                continue

            self._add_details(obj, self.get_source(url))
            self.send_email(obj)

    def _add_details(self, obj, page):
        """Fill ``obj`` with body text, comune, km and image from the ad page."""
        obj['body_text'] = page.find(id='body_txt')

        info_box = page.find(class_='annuncio_info')
        entries = info_box.find_all('li') if info_box is not None else []
        obj['comune'] = ''
        obj['km'] = ''
        field_patterns = (
            ('comune', r'comune.+>(.+)<'),
            ('km', r'km.+>(.+)<'),
        )
        for entry in entries:
            markup = str(entry)
            for key, pattern in field_patterns:
                found = re.search(pattern, markup, re.I)
                if found is not None:
                    obj[key] = found.group(1)

        try:
            style = page.find(id='display_image')['style']
            obj['image'] = re.search(r"url\('(http.+)'\)", style, re.I).group(1)
        except (TypeError, KeyError, AttributeError):
            # Element missing, no style attribute, or no url(...) inside it.
            obj['image'] = 'No image'
            print('[*] No image found for: ' + obj['title'])

    def _recipients(self):
        """Return ``to_email`` as a list of non-empty addresses."""
        target = self.to_email
        if isinstance(target, (list, tuple, set)):
            return [address for address in target if address]
        return [target] if target else []

    @staticmethod
    def _to_text(value):
        """Decode byte strings as UTF-8; return any other value unchanged."""
        if isinstance(value, bytes):
            return value.decode('utf-8', 'replace')
        return value

    def _build_message(self, obj, recipients):
        """Return the multipart (plain text + HTML) message describing ``obj``.

        ``obj`` must provide the keys url, title, km, comune, city, price
        and image; a missing key raises KeyError.
        """
        keys = ('url', 'title', 'km', 'comune', 'city', 'price', 'image')
        text = dict((key, self._to_text(obj[key])) for key in keys)

        html_body = u''.join([
            u'<a href="', text['url'], u'"><h1>', text['title'], u'</h1></a><br>',
            u'<ul><li> km ', text['km'], u'</li>',
            u'<li> luogo ', text['comune'], u'(', text['city'], u')</li>',
            u'<li>prezzo ', text['price'], u' euro</li></ul><br>',
            u'<img src="', text['image'], u'">',
        ])
        plain_body = u'Please visit ' + text['url']

        message = MIMEMultipart('alternative')
        # Joining a list (not a bare string) keeps each address intact.
        message['To'] = ", ".join(recipients)
        message['From'] = self.from_email
        message['Subject'] = text['title']
        # An explicit UTF-8 charset lets accented ad text through.
        message.attach(MIMEText(plain_body, 'plain', 'utf-8'))
        message.attach(MIMEText(html_body, 'html', 'utf-8'))
        return message

    def send_email(self, obj):
        """Send the notification e-mail for the ad described by ``obj``.

        Connects to ``smtp_host``:``smtp_port``, upgrades the connection with
        STARTTLS, logs in with ``gmail_user``/``gmail_password`` and sends the
        message to ``to_email``. smtplib exceptions propagate to the caller;
        the connection is closed in every case.
        """
        recipients = self._recipients()
        message = self._build_message(obj, recipients)

        server = smtplib.SMTP(self.smtp_host, self.smtp_port)
        try:
            server.ehlo()
            server.starttls()
            # The server must be greeted again once TLS is active.
            server.ehlo()
            server.login(self.gmail_user, self.gmail_password)
            server.sendmail(self.from_email, recipients, message.as_string())
            print("Email sent.")
            server.quit()
        finally:
            # Harmless after quit(); releases the socket if a step failed.
            server.close()

    def is_old(self, url):
        """Return True if ``url`` is already recorded in the data file.

        Side effect: an unknown ``url`` is appended to the data file before
        False is returned, so the next call with the same URL returns True.
        """
        with open(self.data_file) as handle:
            for line in handle:
                if line.strip() == url:
                    print('[*] Already crawled')
                    return True
        with open(self.data_file, 'a') as handle:
            handle.write(url + '\n')
        return False
# Run one scrape-and-notify pass: constructing Scraper downloads the search
# page and processes it (see Scraper.__init__).
scraper = Scraper()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment