huogerac · August 17, 2017 20:24
diff --git a/get_contacts.py b/get_contacts.py
 #!/usr/bin/env python
 from __future__ import unicode_literals

 import os
 import codecs
 import argparse
 import inspect
 import sys
 import re
 import csv
 import json
 import urllib
 import io

 from datetime import date, datetime, timedelta

 import mechanize
 import xlrd

 from bs4 import BeautifulSoup

 #zap (using pyside - ghost)
 from ghost import Ghost



 '''
    USAGE:
    ./get_contacts.py -s=zap
    This will download a json in the current directory, but you can do this:
    ./get_contacts.py -s=zap -d=/my/dir

    CREATING A NEW SPIDER:
    A spider is a subclass of BaseSpider and must implement the method retrieve
    retrieve must return a JSON in this format:
    [
        {ref: unicode_obj, created: strptime, title: unicode,
         phone: unicode, customer: unicode, email: unicode, message: unicode}
    ]
 '''

 # Handle to this very module; get_spider() looks spider classes up on it by name.
 thismodule = sys.modules[__name__]


 class BaseSpider(object):
    """Base class for contact spiders.

    Subclasses set ``spider_name`` and implement ``retrieve``; ``save``
    writes whatever ``retrieve`` returns to ``<spider_name>.json``.
    """

    # Used by save() to build the output file name.
    spider_name = 'unknown_spider'

    def retrieve(self):
        """Return the collected contacts as a JSON string.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    def save(self, output_dir=None):
        """Run ``retrieve`` and write its result to ``<spider_name>.json``.

        Args:
            output_dir: directory that receives the file; it is created
                when missing. ``None`` writes to the current directory.

        Raises:
            OSError: if ``output_dir`` cannot be created.
        """
        data = self.retrieve()

        filename = '{0}.json'.format(self.spider_name)

        if output_dir is not None:
            # EAFP instead of exists()+makedirs(): no race when the directory
            # appears between the check and the creation.
            try:
                os.makedirs(output_dir)
            except OSError:
                if not os.path.isdir(output_dir):
                    raise
            output_path = os.path.join(output_dir, filename)
        else:
            output_path = filename

        # Encode explicitly so the file content does not depend on the
        # platform default encoding (and non-ASCII text cannot fail on write).
        if isinstance(data, bytes):
            payload = data
        else:
            payload = data.encode('utf-8')

        with open(output_path, 'wb') as buff:
            buff.write(payload)


 '''
 Zap uses: PySide and Ghost

 http://pyside.readthedocs.io/en/latest/building/linux.html

 sudo apt-get install python-bs4 python-pyside

 pip install pyside
 pip install ghost.py --pre

 Ubuntu 14
 sudo apt-get install build-essential git cmake libqt4-dev libphonon-dev 
     python2.7-dev libxml2-dev libxslt1-dev qtmobility-dev libqtwebkit-dev

 links:
 http://pyside.readthedocs.io/en/latest/building/linux.html
 https://github.com/jeanphix/Ghost.py

 '''
 class ZapSpider(BaseSpider):
    """Spider that exports contact messages from the Zap extranet.

    Drives a Ghost session: logs in, opens the messages page, triggers the
    CSV export and converts its rows into the JSON returned by ``retrieve``.
    """

    spider_name = 'zap'
    # (email, password) used by login(); the values here look like
    # placeholders -- TODO confirm how real credentials are supplied.
    credentials = ('email', 'passw*rd')

    def __init__(self, *args, **kws):
        """Start the Ghost browser session used by every request."""
        super(ZapSpider, self).__init__(*args, **kws)
        self.browser = Ghost(log_level=0)
        self.session = self.browser.start(wait_timeout=15)

    def login(self):
        """Open the extranet login page and submit ``credentials``.

        Raises:
            Exception: re-raised by ``handle_exception`` when filling or
                submitting the form fails.
        """
        # Access the login page
        self.page, extra_resources = self.session.open(
            'https://extranet.zap.com.br/default.aspx', wait=True)
        # Fill the form and evaluate a click on the submit button
        try:
            self.session.set_field_value('#txtEmail', self.credentials[0], expect_loading=False)
            self.session.set_field_value('#txtSenha', self.credentials[1], expect_loading=False)
            self.page, extra_resources = self.session.evaluate(
                "document.getElementById('lnkOk').click();",
                expect_loading=True)
        except Exception as e:
            # Dumps diagnostics and re-raises
            self.handle_exception(e)

    def get_messages(self, max_rows=20):
        """Trigger the CSV export and return its rows as message dicts.

        Args:
            max_rows: maximum number of messages to return (default 20,
                the previously hard-coded limit); ``None`` returns all.

        Returns:
            list of dicts with the keys ``ref``, ``created``, ``title``,
            ``phone``, ``customer``, ``email`` and ``message``.

        Raises:
            Exception: re-raised by ``handle_exception`` when the export
                click fails.
        """
        # Access the page containing the download link and emulate a click event on it
        self.page, extra_resources = self.session.open(
            "https://extranet.zap.com.br/mensagem/mensagens.aspx", wait=True)
        try:
            self.page, extra_resources = self.session.evaluate(
                "document.getElementById('ctl00_ContentPlaceHolder1_imgExportarCSV').click();",
                expect_loading=True)
        except Exception as e:
            # Dumps diagnostics and re-raises
            self.handle_exception(e)
        # We always use the last resource file, since it is the complete CSV
        # This export feature actually registers 4 files with the same name in the RESPONSE,
        # being the first three truncated segments of the fourth file, which is complete.
        csv_data = unicode(extra_resources[-1].content, encoding='latin1')

        return self._parse_messages(csv_data, max_rows)

    def _parse_messages(self, csv_data, max_rows):
        """Convert the exported ';'-separated text into message dicts."""
        # Skip the header line and drop blank lines: the text left after the
        # final '\r\n' is empty and used to crash the column lookups whenever
        # the export held fewer rows than the limit.
        lines = [line for line in csv_data.split('\r\n')[1:] if line.strip()]

        messages = []
        for line in lines[:max_rows]:
            # The last split item is discarded (presumably the empty text
            # after a trailing ';' -- TODO confirm against a real export).
            row = line.split(';')[:-1]

            # row[3] is split as day/month/year-time (spaces removed) and
            # re-assembled as "year-month-day time".
            parts = [part.replace(' ', '') for part in row[3].split('/')]
            day, month = parts[:2]
            year, hour = parts[2].split('-')
            created = '{0}-{1}-{2} {3}'.format(year, month, day, hour)

            data = {
                'ref': row[0],
                'created': created,
                'title': row[7],
                'phone': row[4],
                'customer': row[2],
                'email': row[5],
                'message': row[6],
            }
            print("--> {0} {1} {2}".format(data['created'], data['email'], data['phone']))  # noqa
            messages.append(data)

        return messages

    def retrieve(self):
        """Log in, fetch the messages and return them as a JSON string."""
        self.login()
        messages = self.get_messages()
        return json.dumps(messages, encoding='iso-8859-1')

    # Custom exception handling
    def handle_exception(self, error):
        """Save a screenshot and a page dump, then raise a wrapping Exception.

        Args:
            error: the exception caught by the caller.

        Raises:
            Exception: always; its message embeds ``repr(error)``.
        """
        self.session.capture_to('screenshot.png')
        # The page may be missing (nothing loaded yet, or Ghost returned no
        # page); the handler must not fail on it and hide the original error.
        page = getattr(self, 'page', None)
        content = getattr(page, 'content', None)
        with open('ghost_page_dump.log', 'w') as f:
            f.write('URL: %s\nError:%s\n'%(str(getattr(page, 'url', None)), error))
            if content is not None:
                f.write(content)
        raise Exception("%s\n Screenshot captured to screenshot.png, page content dumped to ghost_page_dump.log \n"%repr(error))





 def get_args(argv=None):
    """Parse the command-line options of the script.

    Args:
        argv: list of argument strings to parse. ``None`` (the default)
            lets argparse read ``sys.argv[1:]``, as before.

    Returns:
        argparse.Namespace with the attributes ``spider`` and
        ``output_dir``; each is ``None`` when its option is omitted.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', action='store', dest='spider', default=None,
                        help='name of the spider to run, e.g. zap')
    parser.add_argument('-d', action='store', dest='output_dir', default=None,
                        help='directory that receives the JSON file')
    return parser.parse_args(argv)


 def get_spider(spider_name):
    """Return the spider class registered in this module for ``spider_name``.

    The class is looked up as ``<Title-cased name>Spider`` and must be a
    subclass of ``BaseSpider``.

    Raises:
        ValueError: if no such class exists in this module.
    """
    class_name = spider_name.title() + 'Spider'
    candidate = getattr(thismodule, class_name, None)
    if inspect.isclass(candidate) and issubclass(candidate, BaseSpider):
        return candidate
    raise ValueError('invalid spider name: ' + spider_name.title() )


 if __name__ == '__main__':
    # Script entry point: pick the spider named by -s and save its output.
    cli = get_args()
    if cli.spider is None:
        raise Exception('you must provide a spider name with -s param')
    spider_class = get_spider(cli.spider)
    runner = spider_class()
    print('  Spider is running...')
    runner.save(cli.output_dir)
    print('done.')
	#!/usr/bin/env python
	from __future__ import unicode_literals

	import os
	import codecs
	import argparse
	import inspect
	import sys
	import re
	import csv
	import json
	import urllib
	import io

	from datetime import date, datetime, timedelta

	import mechanize
	import xlrd

	from bs4 import BeautifulSoup

	#zap (using pyside - ghost)
	from ghost import Ghost



	'''
	USAGE:
	./get_contacts.py -s=zap
	This will download a json in the current directory, but you can do this:
	./get_contacts.py -s=zap -d=/my/dir

	CREATING A NEW SPIDER:
	A spider is a subclass of BaseSpider and must implement the method retrieve
	retrieve must return a JSON in this format:
	[
	{ref: unicode_obj, created: strptime, title: unicode,
	phone: unicode, customer: unicode, email: unicode, message: unicode}
	]
	'''

	# Handle to this very module; get_spider() looks spider classes up on it by name.
	thismodule = sys.modules[__name__]


	class BaseSpider(object):
	    """Base class for contact spiders.

	    Subclasses set ``spider_name`` and implement ``retrieve``; ``save``
	    writes whatever ``retrieve`` returns to ``<spider_name>.json``.
	    """

	    # Used by save() to build the output file name.
	    spider_name = 'unknown_spider'

	    def retrieve(self):
	        """Return the collected contacts as a JSON string.

	        Raises:
	            NotImplementedError: always, in this base class.
	        """
	        raise NotImplementedError

	    def save(self, output_dir=None):
	        """Run ``retrieve`` and write its result to ``<spider_name>.json``.

	        Args:
	            output_dir: directory that receives the file; it is created
	                when missing. ``None`` writes to the current directory.

	        Raises:
	            OSError: if ``output_dir`` cannot be created.
	        """
	        data = self.retrieve()

	        filename = '{0}.json'.format(self.spider_name)

	        if output_dir is not None:
	            # EAFP instead of exists()+makedirs(): no race when the
	            # directory appears between the check and the creation.
	            try:
	                os.makedirs(output_dir)
	            except OSError:
	                if not os.path.isdir(output_dir):
	                    raise
	            output_path = os.path.join(output_dir, filename)
	        else:
	            output_path = filename

	        # Encode explicitly so the file content does not depend on the
	        # platform default encoding (and non-ASCII text cannot fail on write).
	        if isinstance(data, bytes):
	            payload = data
	        else:
	            payload = data.encode('utf-8')

	        with open(output_path, 'wb') as buff:
	            buff.write(payload)


	'''
	Zap uses: PySide and Ghost

	http://pyside.readthedocs.io/en/latest/building/linux.html

	sudo apt-get install python-bs4 python-pyside

	pip install pyside
	pip install ghost.py --pre

	Ubuntu 14
	sudo apt-get install build-essential git cmake libqt4-dev libphonon-dev
	python2.7-dev libxml2-dev libxslt1-dev qtmobility-dev libqtwebkit-dev

	links:
	http://pyside.readthedocs.io/en/latest/building/linux.html
	https://github.com/jeanphix/Ghost.py

	'''
	class ZapSpider(BaseSpider):
	    """Spider that exports contact messages from the Zap extranet.

	    Drives a Ghost session: logs in, opens the messages page, triggers the
	    CSV export and converts its rows into the JSON returned by ``retrieve``.
	    """

	    spider_name = 'zap'
	    # (email, password) used by login(); the values here look like
	    # placeholders -- TODO confirm how real credentials are supplied.
	    credentials = ('email', 'passw*rd')

	    def __init__(self, *args, **kws):
	        """Start the Ghost browser session used by every request."""
	        # Star-args restored: the mangled "(self, args, *kws)" signature
	        # made a plain ZapSpider() call fail with TypeError.
	        super(ZapSpider, self).__init__(*args, **kws)
	        self.browser = Ghost(log_level=0)
	        self.session = self.browser.start(wait_timeout=15)

	    def login(self):
	        """Open the extranet login page and submit ``credentials``.

	        Raises:
	            Exception: re-raised by ``handle_exception`` when filling or
	                submitting the form fails.
	        """
	        # Access the login page
	        self.page, extra_resources = self.session.open(
	            'https://extranet.zap.com.br/default.aspx', wait=True)
	        # Fill the form and evaluate a click on the submit button
	        try:
	            self.session.set_field_value('#txtEmail', self.credentials[0], expect_loading=False)
	            self.session.set_field_value('#txtSenha', self.credentials[1], expect_loading=False)
	            self.page, extra_resources = self.session.evaluate(
	                "document.getElementById('lnkOk').click();",
	                expect_loading=True)
	        except Exception as e:
	            # Dumps diagnostics and re-raises
	            self.handle_exception(e)

	    def get_messages(self, max_rows=20):
	        """Trigger the CSV export and return its rows as message dicts.

	        Args:
	            max_rows: maximum number of messages to return (default 20,
	                the previously hard-coded limit); ``None`` returns all.

	        Returns:
	            list of dicts with the keys ``ref``, ``created``, ``title``,
	            ``phone``, ``customer``, ``email`` and ``message``.

	        Raises:
	            Exception: re-raised by ``handle_exception`` when the export
	                click fails.
	        """
	        # Access the page containing the download link and emulate a click event on it
	        self.page, extra_resources = self.session.open(
	            "https://extranet.zap.com.br/mensagem/mensagens.aspx", wait=True)
	        try:
	            self.page, extra_resources = self.session.evaluate(
	                "document.getElementById('ctl00_ContentPlaceHolder1_imgExportarCSV').click();",
	                expect_loading=True)
	        except Exception as e:
	            # Dumps diagnostics and re-raises
	            self.handle_exception(e)
	        # We always use the last resource file, since it is the complete CSV
	        # This export feature actually registers 4 files with the same name in the RESPONSE,
	        # being the first three truncated segments of the fourth file, which is complete.
	        csv_data = unicode(extra_resources[-1].content, encoding='latin1')

	        return self._parse_messages(csv_data, max_rows)

	    def _parse_messages(self, csv_data, max_rows):
	        """Convert the exported ';'-separated text into message dicts."""
	        # Skip the header line and drop blank lines: the text left after the
	        # final '\r\n' is empty and used to crash the column lookups whenever
	        # the export held fewer rows than the limit.
	        lines = [line for line in csv_data.split('\r\n')[1:] if line.strip()]

	        messages = []
	        for line in lines[:max_rows]:
	            # The last split item is discarded (presumably the empty text
	            # after a trailing ';' -- TODO confirm against a real export).
	            row = line.split(';')[:-1]

	            # row[3] is split as day/month/year-time (spaces removed) and
	            # re-assembled as "year-month-day time".
	            parts = [part.replace(' ', '') for part in row[3].split('/')]
	            day, month = parts[:2]
	            year, hour = parts[2].split('-')
	            created = '{0}-{1}-{2} {3}'.format(year, month, day, hour)

	            data = {
	                'ref': row[0],
	                'created': created,
	                'title': row[7],
	                'phone': row[4],
	                'customer': row[2],
	                'email': row[5],
	                'message': row[6],
	            }
	            print("--> {0} {1} {2}".format(data['created'], data['email'], data['phone'])) # noqa
	            messages.append(data)

	        return messages

	    def retrieve(self):
	        """Log in, fetch the messages and return them as a JSON string."""
	        self.login()
	        messages = self.get_messages()
	        return json.dumps(messages, encoding='iso-8859-1')

	    # Custom exception handling
	    def handle_exception(self, error):
	        """Save a screenshot and a page dump, then raise a wrapping Exception.

	        Args:
	            error: the exception caught by the caller.

	        Raises:
	            Exception: always; its message embeds ``repr(error)``.
	        """
	        self.session.capture_to('screenshot.png')
	        # The page may be missing (nothing loaded yet, or Ghost returned no
	        # page); the handler must not fail on it and hide the original error.
	        page = getattr(self, 'page', None)
	        content = getattr(page, 'content', None)
	        with open('ghost_page_dump.log', 'w') as f:
	            f.write('URL: %s\nError:%s\n'%(str(getattr(page, 'url', None)), error))
	            if content is not None:
	                f.write(content)
	        raise Exception("%s\n Screenshot captured to screenshot.png, page content dumped to ghost_page_dump.log \n"%repr(error))





	def get_args(argv=None):
	    """Parse the command-line options of the script.

	    Args:
	        argv: list of argument strings to parse. ``None`` (the default)
	            lets argparse read ``sys.argv[1:]``, as before.

	    Returns:
	        argparse.Namespace with the attributes ``spider`` and
	        ``output_dir``; each is ``None`` when its option is omitted.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument('-s', action='store', dest='spider', default=None,
	                        help='name of the spider to run, e.g. zap')
	    parser.add_argument('-d', action='store', dest='output_dir', default=None,
	                        help='directory that receives the JSON file')
	    return parser.parse_args(argv)


	def get_spider(spider_name):
	    """Return the spider class registered in this module for ``spider_name``.

	    The class is looked up as ``<Title-cased name>Spider`` and must be a
	    subclass of ``BaseSpider``.

	    Raises:
	        ValueError: if no such class exists in this module.
	    """
	    # Body re-indented: the flattened original was not valid Python.
	    class_name = spider_name.title() + 'Spider'
	    candidate = getattr(thismodule, class_name, None)
	    if inspect.isclass(candidate) and issubclass(candidate, BaseSpider):
	        return candidate
	    raise ValueError('invalid spider name: ' + spider_name.title() )


	if __name__ == '__main__':
	    # Script entry point: pick the spider named by -s and save its output.
	    # Body re-indented: the flattened original was not valid Python.
	    args = get_args()
	    spider_name = args.spider
	    if spider_name is None:
	        raise Exception('you must provide a spider name with -s param')
	    spider_instance = get_spider(spider_name)()
	    print(' Spider is running...')
	    spider_instance.save(args.output_dir)
	    print('done.')