mauromarano · August 29, 2015 14:03
diff --git a/spider.py b/spider.py
 #! /usr/bin/env python
 #-*- coding: utf-8 -*-

 # HOW TO:
 # set the top class variables
 # run "crontab -e" and add this line to auto-launch the script once an hour (at minute 1): 1 * * * * python spider.py

 from bs4 import BeautifulSoup
 import requests
 import re
 import smtplib
 from email.mime.multipart import MIMEMultipart
 from email.mime.text import MIMEText


 class Scraper:
     """Crawl a subito.it search page and e-mail every new matching listing.

     Set the class attributes below, then instantiate the class: the
     constructor downloads the search page, makes sure the data file exists
     and processes the listings right away.
     """

     # Search-results page downloaded by the constructor.
     min_750_url = 'http://www.subito.it/annunci-calabria/vendita/moto-e-scooter/?ccs=750'

     # Regular expressions searched case-insensitively in each listing title.
     patterns = ['r1', 'gsxr', 'gsx-r', 'cbr', 'ninja', 'fz1', 'z1000', 'speed']

     # Text file storing one already-handled listing URL per line.
     data_file = 'database.txt'

     # Not used by any method of this class; kept for backward compatibility.
     bikes = []

     # Prepended to each listing's href to build its full URL.
     base_url = 'http://www.subito.it'

     # Gmail SMTP login and the addresses used for the notification;
     # to_email may be one address or a list/tuple of addresses.
     gmail_user = ''
     gmail_password = ''
     to_email = ''
     from_email = 'notification@gmail.com'

     # Seconds to wait for an HTTP response, so that a stalled server cannot
     # hang the run forever.
     request_timeout = 30

     # Parser handed to BeautifulSoup; naming one keeps the parsing
     # independent of which optional parsers happen to be installed.
     html_parser = 'html.parser'

     def __init__(self):
         """Download the search page, ensure the data file exists and crawl."""
         self.soup = self.get_source(self.min_750_url)

         if not self.check_for_data():
             self.create_data_file()

         self.get_bikes()

     def get_source(self, url):
         """Download *url* and return its HTML parsed by BeautifulSoup."""
         response = requests.get(url, timeout=self.request_timeout)
         return BeautifulSoup(response.text, self.html_parser)

     def check_for_data(self):
         """Return True when the data file can be opened for reading."""
         try:
             with open(self.data_file):
                 return True
         except (IOError, OSError):
             return False

     def create_data_file(self):
         """Create (or truncate) the data file; return True on success."""
         try:
             # Opening in 'w' mode is enough to leave an empty file behind.
             with open(self.data_file, 'w'):
                 pass
         except (IOError, OSError):
             return False
         return True

     def _matches(self, title):
         """Return True when *title* matches at least one of the patterns."""
         return any(re.search(pattern, title, re.I) for pattern in self.patterns)

     def _extract(self, pattern, node, group=0, flags=0):
         """Return regex *group* of *pattern* found in str(node), or ''.

         An empty string is returned when *node* is None or nothing matches,
         so a missing field never aborts the crawl.
         """
         if node is None:
             return ''
         found = re.search(pattern, str(node), flags)
         return found.group(group) if found else ''

     def _recipients(self):
         """Return to_email as a list, wrapping a single address string."""
         emailto = self.to_email
         # type(u'') also covers Python 2 unicode strings.
         if isinstance(emailto, (str, type(u''))):
             return [emailto]
         return list(emailto)

     def _add_details(self, obj, page):
         """Fill *obj* with the fields read from the listing's own page."""
         obj['body_text'] = page.find(id='body_txt')

         comune = ''
         km = ''
         details = page.find(class_='annuncio_info')
         for info in (details.find_all('li') if details is not None else []):
             # Keep the previous value when this <li> does not carry the field.
             comune = self._extract('comune.+>(.+)<', info, 1, re.I) or comune
             km = self._extract('km.+>(.+)<', info, 1, re.I) or km
         obj['comune'] = comune
         obj['km'] = km

         image_tag = page.find(id='display_image')
         style = image_tag.get('style') if image_tag is not None else None
         image = self._extract(r"url\('(http.+)'\)", style, 1, re.I)
         if not image:
             image = 'No image'
             print('[*] No image found for: ' + obj['title'])
         obj['image'] = image

     def get_bikes(self):
         """Send an e-mail for each new listing whose title matches a pattern.

         Each listing is handled at most once per run, however many patterns
         its title matches. Listings without a title or a link are skipped,
         and fields that cannot be found are reported as empty strings.
         """
         for bike in self.soup.select('.list > li'):
             heading = bike.find('strong')
             title = heading.string if heading is not None else None
             if not title or not self._matches(title):
                 continue

             link = bike.find('a')
             href = link.get('href') if link is not None else None
             if not href:
                 continue
             url = self.base_url + href

             city_tag = bike.find(class_='city')
             paragraph = city_tag.p if city_tag is not None else None
             obj = {
                 'url': url,
                 'price': self._extract(r'(.+)\s€', bike.find(class_='price'), 1),
                 'title': title,
                 'city': self._extract(
                     r'\w+', paragraph.string if paragraph is not None else None),
                 'time': self._extract(
                     r'(\d\d:\d\d)', bike.find(class_='date'), 1),
             }

             # is_old() also records the URL, so call it exactly once.
             if self.is_old(url):
                 continue

             # The detail page gets its own name so it never shadows `bike`.
             self._add_details(obj, self.get_source(url))
             self.send_email(obj)

     def send_email(self, obj):
         """Send an HTML e-mail (with a plain-text alternative) about *obj*.

         *obj* is the dict built by get_bikes(); the keys read here are
         'title', 'url', 'km', 'comune', 'city', 'price' and 'image'.
         """
         recipients = self._recipients()
         emailfrom = self.from_email

         message = MIMEMultipart('alternative')
         message['To'] = ", ".join(recipients)
         message['From'] = emailfrom
         message['Subject'] = obj['title']

         body = '<a href="' + obj['url'] + '"><h1>' + obj['title'] + '</h1></a><br>'
         body += '<ul><li> km ' + obj['km'] + '</li>'
         body += '<li> luogo ' + obj['comune'] + '(' + obj['city'] + ')</li>'
         body += '<li>prezzo ' + obj['price'] + ' euro</li></ul><br>'
         body += '<img src="' + obj['image'] + '">'

         # unicode() only exists on Python 2, which this script was written for.
         htmlemailmessage = unicode(body)
         plaintextemailmessage = unicode('Please visit ' + obj['url'])

         # Attach plain text first and HTML second, as in a multipart/alternative.
         message.attach(MIMEText(plaintextemailmessage, 'plain'))
         message.attach(MIMEText(htmlemailmessage, 'html'))

         server = smtplib.SMTP("smtp.gmail.com", 587)
         try:
             # Upgrade the connection to TLS before sending the credentials.
             server.ehlo()
             server.starttls()
             server.ehlo()
             server.login(self.gmail_user, self.gmail_password)

             server.sendmail(emailfrom, recipients, message.as_string())
             print("Email sent.")
             server.quit()
         finally:
             # Releases the socket when a step above failed; harmless after quit().
             server.close()

     def is_old(self, url):
         """Return True if *url* is already in the data file.

         Otherwise append it to the data file and return False.
         """
         with open(self.data_file) as f:
             for line in f:
                 if line.strip() == url:
                     print('[*] Already crawled')
                     return True

         with open(self.data_file, 'a') as f:
             f.write(url + '\n')

         return False




 # Instantiating the class runs the whole crawl: __init__ downloads the search
 # page, creates the data file when it is missing and calls get_bikes().
 scraper = Scraper()
	#! /usr/bin/env python
	#-*- coding: utf-8 -*-

	# HOW TO:
	# set the top class variables
	# run "crontab -e" and add this line to auto-launch the script once an hour (at minute 1): 1 * * * * python spider.py

	from bs4 import BeautifulSoup
	import requests
	import re
	import smtplib
	from email.mime.multipart import MIMEMultipart
	from email.mime.text import MIMEText


	class Scraper:
	    """Crawl a subito.it search page and e-mail every new matching listing.

	    Set the class attributes below, then instantiate the class: the
	    constructor downloads the search page, makes sure the data file exists
	    and processes the listings right away.
	    """

	    # Search-results page downloaded by the constructor.
	    min_750_url = 'http://www.subito.it/annunci-calabria/vendita/moto-e-scooter/?ccs=750'

	    # Regular expressions searched case-insensitively in each listing title.
	    patterns = ['r1', 'gsxr', 'gsx-r', 'cbr', 'ninja', 'fz1', 'z1000', 'speed']

	    # Text file storing one already-handled listing URL per line.
	    data_file = 'database.txt'

	    # Not used by any method of this class; kept for backward compatibility.
	    bikes = []

	    # Prepended to each listing's href to build its full URL.
	    base_url = 'http://www.subito.it'

	    # Gmail SMTP login and the addresses used for the notification;
	    # to_email may be one address or a list/tuple of addresses.
	    gmail_user = ''
	    gmail_password = ''
	    to_email = ''
	    from_email = 'notification@gmail.com'

	    # Seconds to wait for an HTTP response, so that a stalled server cannot
	    # hang the run forever.
	    request_timeout = 30

	    # Parser handed to BeautifulSoup; naming one keeps the parsing
	    # independent of which optional parsers happen to be installed.
	    html_parser = 'html.parser'

	    def __init__(self):
	        """Download the search page, ensure the data file exists and crawl."""
	        self.soup = self.get_source(self.min_750_url)

	        if not self.check_for_data():
	            self.create_data_file()

	        self.get_bikes()

	    def get_source(self, url):
	        """Download *url* and return its HTML parsed by BeautifulSoup."""
	        response = requests.get(url, timeout=self.request_timeout)
	        return BeautifulSoup(response.text, self.html_parser)

	    def check_for_data(self):
	        """Return True when the data file can be opened for reading."""
	        try:
	            with open(self.data_file):
	                return True
	        except (IOError, OSError):
	            return False

	    def create_data_file(self):
	        """Create (or truncate) the data file; return True on success."""
	        try:
	            # Opening in 'w' mode is enough to leave an empty file behind.
	            with open(self.data_file, 'w'):
	                pass
	        except (IOError, OSError):
	            return False
	        return True

	    def _matches(self, title):
	        """Return True when *title* matches at least one of the patterns."""
	        return any(re.search(pattern, title, re.I) for pattern in self.patterns)

	    def _extract(self, pattern, node, group=0, flags=0):
	        """Return regex *group* of *pattern* found in str(node), or ''.

	        An empty string is returned when *node* is None or nothing matches,
	        so a missing field never aborts the crawl.
	        """
	        if node is None:
	            return ''
	        found = re.search(pattern, str(node), flags)
	        return found.group(group) if found else ''

	    def _recipients(self):
	        """Return to_email as a list, wrapping a single address string."""
	        emailto = self.to_email
	        # type(u'') also covers Python 2 unicode strings.
	        if isinstance(emailto, (str, type(u''))):
	            return [emailto]
	        return list(emailto)

	    def _add_details(self, obj, page):
	        """Fill *obj* with the fields read from the listing's own page."""
	        obj['body_text'] = page.find(id='body_txt')

	        comune = ''
	        km = ''
	        details = page.find(class_='annuncio_info')
	        for info in (details.find_all('li') if details is not None else []):
	            # Keep the previous value when this <li> does not carry the field.
	            comune = self._extract('comune.+>(.+)<', info, 1, re.I) or comune
	            km = self._extract('km.+>(.+)<', info, 1, re.I) or km
	        obj['comune'] = comune
	        obj['km'] = km

	        image_tag = page.find(id='display_image')
	        style = image_tag.get('style') if image_tag is not None else None
	        image = self._extract(r"url\('(http.+)'\)", style, 1, re.I)
	        if not image:
	            image = 'No image'
	            print('[*] No image found for: ' + obj['title'])
	        obj['image'] = image

	    def get_bikes(self):
	        """Send an e-mail for each new listing whose title matches a pattern.

	        Each listing is handled at most once per run, however many patterns
	        its title matches. Listings without a title or a link are skipped,
	        and fields that cannot be found are reported as empty strings.
	        """
	        for bike in self.soup.select('.list > li'):
	            heading = bike.find('strong')
	            title = heading.string if heading is not None else None
	            if not title or not self._matches(title):
	                continue

	            link = bike.find('a')
	            href = link.get('href') if link is not None else None
	            if not href:
	                continue
	            url = self.base_url + href

	            city_tag = bike.find(class_='city')
	            paragraph = city_tag.p if city_tag is not None else None
	            obj = {
	                'url': url,
	                'price': self._extract(r'(.+)\s€', bike.find(class_='price'), 1),
	                'title': title,
	                'city': self._extract(
	                    r'\w+', paragraph.string if paragraph is not None else None),
	                'time': self._extract(
	                    r'(\d\d:\d\d)', bike.find(class_='date'), 1),
	            }

	            # is_old() also records the URL, so call it exactly once.
	            if self.is_old(url):
	                continue

	            # The detail page gets its own name so it never shadows `bike`.
	            self._add_details(obj, self.get_source(url))
	            self.send_email(obj)

	    def send_email(self, obj):
	        """Send an HTML e-mail (with a plain-text alternative) about *obj*.

	        *obj* is the dict built by get_bikes(); the keys read here are
	        'title', 'url', 'km', 'comune', 'city', 'price' and 'image'.
	        """
	        recipients = self._recipients()
	        emailfrom = self.from_email

	        message = MIMEMultipart('alternative')
	        message['To'] = ", ".join(recipients)
	        message['From'] = emailfrom
	        message['Subject'] = obj['title']

	        body = '<a href="' + obj['url'] + '"><h1>' + obj['title'] + '</h1></a><br>'
	        body += '<ul><li> km ' + obj['km'] + '</li>'
	        body += '<li> luogo ' + obj['comune'] + '(' + obj['city'] + ')</li>'
	        body += '<li>prezzo ' + obj['price'] + ' euro</li></ul><br>'
	        body += '<img src="' + obj['image'] + '">'

	        # unicode() only exists on Python 2, which this script was written for.
	        htmlemailmessage = unicode(body)
	        plaintextemailmessage = unicode('Please visit ' + obj['url'])

	        # Attach plain text first and HTML second, as in a multipart/alternative.
	        message.attach(MIMEText(plaintextemailmessage, 'plain'))
	        message.attach(MIMEText(htmlemailmessage, 'html'))

	        server = smtplib.SMTP("smtp.gmail.com", 587)
	        try:
	            # Upgrade the connection to TLS before sending the credentials.
	            server.ehlo()
	            server.starttls()
	            server.ehlo()
	            server.login(self.gmail_user, self.gmail_password)

	            server.sendmail(emailfrom, recipients, message.as_string())
	            print("Email sent.")
	            server.quit()
	        finally:
	            # Releases the socket when a step above failed; harmless after quit().
	            server.close()

	    def is_old(self, url):
	        """Return True if *url* is already in the data file.

	        Otherwise append it to the data file and return False.
	        """
	        with open(self.data_file) as f:
	            for line in f:
	                if line.strip() == url:
	                    print('[*] Already crawled')
	                    return True

	        with open(self.data_file, 'a') as f:
	            f.write(url + '\n')

	        return False




	# Instantiating the class runs the whole crawl: __init__ downloads the search
	# page, creates the data file when it is missing and calls get_bikes().
	scraper = Scraper()
No results found