Skip to content

Instantly share code, notes, and snippets.

@mauromarano
Last active August 29, 2015 14:03
Show Gist options
  • Save mauromarano/69808698f39ae1e33f88 to your computer and use it in GitHub Desktop.
Save mauromarano/69808698f39ae1e33f88 to your computer and use it in GitHub Desktop.
Controlla su subito se è stata inserita una moto che risponde a determinati criteri. Se la trova ti notifica via email
#! /usr/bin/env python
#-*- coding: utf-8 -*-
# HOW TO:
# set the top class variables
# run "crontab -e" and add this entry to auto-launch the script once every hour: 1 * * * * python spider.py
from bs4 import BeautifulSoup
import requests
import re
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
class Scraper:
    """Scrape a subito.it search page and e-mail every new matching listing.

    Instantiating the class runs the whole job: the search page is
    downloaded, the data file is created when it cannot be opened, and every
    listing whose title matches one of ``patterns`` and whose URL is not yet
    in the data file is e-mailed and then recorded in that file.

    Configure the class attributes below before running.
    """

    # Search-result page that is downloaded on every run.
    min_750_url = 'http://www.subito.it/annunci-calabria/vendita/moto-e-scooter/?ccs=750'
    # Regular expressions (case-insensitive); a listing is reported when its
    # title matches at least one of them.
    patterns = ['r1', 'gsxr', 'gsx-r', 'cbr', 'ninja', 'fz1', 'z1000', 'speed']
    # Text file holding one already-notified listing URL per line.
    data_file = 'database.txt'
    # Not read or written by any method of this class; kept so existing
    # references to the attribute keep working.
    bikes = []
    # Prefix prepended to the relative links found in the result list.
    base_url = 'http://www.subito.it'
    gmail_user = ''
    gmail_password = ''
    # Either one address, a comma-separated string of addresses, or a
    # list/tuple of addresses.
    to_email = ''
    from_email = '[email protected]'
    # Seconds to wait for the web server before giving up on a request.
    request_timeout = 30

    def __init__(self):
        """Download the search page, ensure the data file exists, process listings."""
        self.soup = self.get_source(self.min_750_url)
        if not self.check_for_data():
            self.create_data_file()
        self.get_bikes()

    def get_source(self, url):
        """Download ``url`` and return its HTML parsed as a BeautifulSoup tree."""
        # A timeout keeps a stalled connection from blocking the job forever.
        response = requests.get(url, timeout=self.request_timeout)
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        return BeautifulSoup(response.text, 'html.parser')

    def check_for_data(self):
        """Return True when the data file can be opened for reading."""
        try:
            with open(self.data_file):
                return True
        except (IOError, OSError):
            return False

    def create_data_file(self):
        """Create (or truncate to) an empty data file; return True on success."""
        try:
            with open(self.data_file, 'w'):
                pass
        except (IOError, OSError):
            return False
        return True

    def get_bikes(self):
        """E-mail every matching listing of the search page that is not yet known.

        Each listing is handled once, however many patterns its title
        matches. Its URL is written to the data file only after the e-mail
        has been sent, so a failure while fetching or mailing is retried on
        the next run instead of silently dropping the notification.
        """
        for item in self.soup.select('.list > li'):
            listing = self._parse_listing(item)
            if listing is None:
                continue
            if self._is_known(listing['url']):
                print('[*] Already crawled')
                continue
            self._add_details(listing)
            self.send_email(listing)
            self._remember(listing['url'])

    def _matches(self, title):
        """Return True when ``title`` matches at least one of ``patterns``."""
        return any(re.search(pattern, title, re.I) for pattern in self.patterns)

    @staticmethod
    def _first_match(pattern, text, group=0, flags=0):
        """Return ``group`` of the first match of ``pattern`` in ``text``, or ''."""
        found = re.search(pattern, text, flags)
        return found.group(group) if found else ''

    def _parse_listing(self, item):
        """Build the summary dict for one result-list entry.

        Returns None when the entry has no title, no link, or a title that
        matches none of ``patterns``. Missing price, city or time become ''.
        """
        heading = item.find('strong')
        if heading is None:
            return None
        title = heading.string
        if title is None:
            # ``.string`` is None when the tag has several children.
            title = heading.get_text()
        if not title or not self._matches(title):
            return None

        link = item.find('a')
        href = link.get('href') if link is not None else None
        if not href:
            return None

        city_tag = item.find(class_='city')
        city_text = None
        if city_tag is not None and city_tag.p is not None:
            city_text = city_tag.p.string

        return {
            'url': self.base_url + href,
            # Price and time are searched in the tag's markup; str(None) is
            # 'None', which matches neither pattern and so yields ''.
            'price': self._first_match(r'(.+)\s€', str(item.find(class_='price')), 1),
            'title': title,
            'city': self._first_match(r'\w+', city_text or ''),
            'time': self._first_match(r'(\d\d:\d\d)', str(item.find(class_='date')), 1),
        }

    def _add_details(self, listing):
        """Fetch the listing's own page and add body_text, comune, km and image."""
        page = self.get_source(listing['url'])
        listing['body_text'] = page.find(id='body_txt')

        info_box = page.find(class_='annuncio_info')
        infos = info_box.find_all('li') if info_box is not None else []
        comune = ''
        km = ''
        for info in infos:
            markup = str(info)
            # Keep the previous value when this <li> holds no match.
            comune = self._first_match(r'comune.+>(.+)<', markup, 1, re.I) or comune
            km = self._first_match(r'km.+>(.+)<', markup, 1, re.I) or km
        listing['comune'] = comune
        listing['km'] = km

        image_tag = page.find(id='display_image')
        style = image_tag.get('style', '') if image_tag is not None else ''
        image = self._first_match(r"url\('(http.+)'\)", style, 1, re.I)
        if not image:
            image = 'No image'
            print('[*] No image found for: ' + listing['title'])
        listing['image'] = image

    def _recipients(self):
        """Return ``to_email`` as a list of stripped, non-empty addresses."""
        configured = self.to_email
        if isinstance(configured, (list, tuple)):
            candidates = configured
        else:
            candidates = configured.split(',')
        return [address.strip() for address in candidates if address.strip()]

    def send_email(self, obj):
        """Send one notification e-mail through Gmail's SMTP server.

        ``obj`` is a listing dict with the keys 'url', 'title', 'km',
        'comune', 'city', 'price' and 'image'.

        Raises ValueError when ``to_email`` holds no address; errors raised
        by smtplib propagate to the caller.
        """
        from xml.sax.saxutils import escape, quoteattr

        recipients = self._recipients()
        if not recipients:
            raise ValueError('to_email is not configured')
        sender = self.from_email

        message = MIMEMultipart('alternative')
        message['To'] = ', '.join(recipients)
        message['From'] = sender
        # Collapse whitespace so a scraped line break cannot end up in a header.
        message['Subject'] = ' '.join(obj['title'].split())

        # Title and URL are plain text taken from the page, so they are
        # escaped. km, comune and price are cut out of the page markup by
        # regular expressions and are inserted unchanged.
        html_body = ''.join([
            '<a href=', quoteattr(obj['url']), '><h1>', escape(obj['title']), '</h1></a><br>',
            '<ul><li> km ', obj['km'], '</li>',
            '<li> luogo ', obj['comune'], '(', obj['city'], ')</li>',
            '<li>prezzo ', obj['price'], ' euro</li></ul><br>',
            '<img src=', quoteattr(obj['image']), '>',
        ])
        plain_body = 'Please visit ' + obj['url']

        # Plain text first, HTML second: the last attached part is the
        # preferred alternative.
        message.attach(MIMEText(plain_body, 'plain', 'utf-8'))
        message.attach(MIMEText(html_body, 'html', 'utf-8'))

        # Port 587 with STARTTLS.
        server = smtplib.SMTP("smtp.gmail.com", 587)
        try:
            server.ehlo()
            server.starttls()
            server.ehlo()
            server.login(self.gmail_user, self.gmail_password)
            server.sendmail(sender, recipients, message.as_string())
            print("Email sent.")
            server.quit()
        finally:
            # Releases the socket on failure too; harmless after quit().
            server.close()

    def _is_known(self, url):
        """Return True when ``url`` is already a line of the data file."""
        with open(self.data_file) as f:
            return any(line.strip() == url for line in f)

    def _remember(self, url):
        """Append ``url`` to the data file."""
        with open(self.data_file, 'a') as f:
            f.write(url + '\n')

    def is_old(self, url):
        """Check whether a listing has been crawled before.

        Returns True when ``url`` is already in the data file; otherwise
        appends it to the file and returns False.
        """
        if self._is_known(url):
            print('[*] Already crawled')
            return True
        self._remember(url)
        return False
# Script entry point: constructing a Scraper runs the whole job, because its
# __init__ downloads the search page, creates the data file when it cannot be
# opened, and then calls get_bikes().
scraper = Scraper()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment