Last active
August 17, 2017 20:24
-
-
Save huogerac/f8b9c55efe1ce5435ec553d3bc43ea3a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from __future__ import unicode_literals | |
import os | |
import codecs | |
import argparse | |
import inspect | |
import sys | |
import re | |
import csv | |
import json | |
import urllib | |
import io | |
from datetime import date, datetime, timedelta | |
import mechanize | |
import xlrd | |
from bs4 import BeautifulSoup | |
#zap (using pyside - ghost) | |
from ghost import Ghost | |
''' | |
USAGE: | |
./get_contacts.py -s=zap | |
This will download a json in the current directory, but you can do this: | |
./get_contacts.py -s=zap -d=/my/dir | |
CREATING A NEW SPIDER: | |
A spider is a subclass of BaseSpider and must implement the retrieve method.
retrieve must return a JSON string encoding a list of objects in this format:
[ | |
{ref: unicode_obj, created: strptime, title: unicode, | |
phone: unicode, customer: unicode, email: unicode, message: unicode} | |
] | |
''' | |
# Reference to this module object; get_spider() looks spider classes up on it
# by name with getattr().
thismodule = sys.modules[__name__]
class BaseSpider(object):
    """Base class for spiders: fetch data and persist it to a JSON file.

    Subclasses set ``spider_name`` (used as the output file stem) and
    implement ``retrieve``.
    """

    spider_name = 'unknown_spider'

    def retrieve(self):
        """Return the serialized payload to save; must be overridden.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        raise NotImplementedError

    def save(self, output_dir=None):
        """Write the result of ``retrieve()`` to ``<spider_name>.json``.

        Args:
            output_dir: directory that receives the file. It is created
                (including parents) when missing. ``None`` writes the file
                relative to the current working directory.

        Raises:
            OSError: if ``output_dir`` cannot be created and does not already
                exist as a directory.
        """
        data = self.retrieve()
        filename = '{0}.json'.format(self.spider_name)
        if output_dir is not None:
            # Create first and inspect the failure afterwards: this avoids the
            # window between an exists() check and makedirs().
            try:
                os.makedirs(output_dir)
            except OSError:
                if not os.path.isdir(output_dir):
                    raise
            output_path = os.path.join(output_dir, filename)
        else:
            output_path = filename

        if isinstance(data, bytes):
            # Already-encoded payloads are written unchanged.
            with open(output_path, 'wb') as buff:
                buff.write(data)
        else:
            # Text payloads get an explicit encoding so non-ASCII characters
            # do not depend on the interpreter's default codec.
            with codecs.open(output_path, 'w', encoding='utf-8') as buff:
                buff.write(data)
''' | |
Zap uses: PySide and Ghost
http://pyside.readthedocs.io/en/latest/building/linux.html | |
sudo apt-get install python-bs4 python-pyside | |
pip install pyside | |
pip install ghost.py --pre | |
Ubuntu 14 | |
sudo apt-get install build-essential git cmake libqt4-dev libphonon-dev | |
python2.7-dev libxml2-dev libxslt1-dev qtmobility-dev libqtwebkit-dev | |
links: | |
http://pyside.readthedocs.io/en/latest/building/linux.html | |
https://github.com/jeanphix/Ghost.py | |
''' | |
class ZapSpider(BaseSpider):
    """Spider that logs in to the Zap extranet and exports contact messages.

    The site is driven through a Ghost browser session: ``login`` submits the
    credentials form, ``get_messages`` triggers the CSV export and converts
    its rows into dictionaries, and ``retrieve`` serializes them as JSON.
    """

    spider_name = 'zap'
    # (email, password) pair typed into the login form; placeholder values.
    credentials = ('email', 'passw*rd')
    # Maximum number of CSV data rows converted into messages per run.
    max_messages = 20

    def __init__(self, *args, **kws):
        super(ZapSpider, self).__init__(*args, **kws)
        self.browser = Ghost(log_level=0)
        self.session = self.browser.start(wait_timeout=15)
        # Last page returned by the session; read by handle_exception().
        self.page = None

    def login(self):
        """Open the login page, fill in the credentials and submit the form."""
        # Access the login page
        self.page, extra_resources = self.session.open(
            'https://extranet.zap.com.br/default.aspx', wait=True)
        # Fill the form and evaluate a click on the submit button
        try:
            self.session.set_field_value(
                '#txtEmail', self.credentials[0], expect_loading=False)
            self.session.set_field_value(
                '#txtSenha', self.credentials[1], expect_loading=False)
            self.page, extra_resources = self.session.evaluate(
                "document.getElementById('lnkOk').click();",
                expect_loading=True)
        except Exception as e:
            # Dump diagnostics, then re-raise (handle_exception always raises)
            self.handle_exception(e)

    def get_messages(self):
        """Trigger the CSV export and return the parsed messages.

        Returns:
            A list of dicts with the keys ``ref``, ``created``, ``title``,
            ``phone``, ``customer``, ``email`` and ``message``; at most
            ``max_messages`` entries.
        """
        # Access the page containing the download link and emulate a click
        # event on it
        self.page, extra_resources = self.session.open(
            "https://extranet.zap.com.br/mensagem/mensagens.aspx", wait=True)
        try:
            self.page, extra_resources = self.session.evaluate(
                "document.getElementById('ctl00_ContentPlaceHolder1_imgExportarCSV').click();",
                expect_loading=True)
        except Exception as e:
            # Dump diagnostics, then re-raise (handle_exception always raises)
            self.handle_exception(e)
        # We always use the last resource file, since it is the complete CSV.
        # This export feature actually registers 4 files with the same name in
        # the RESPONSE, being the first three truncated segments of the fourth
        # file, which is complete.
        csv_data = unicode(extra_resources[-1].content, encoding='latin1')
        return self._parse_messages(csv_data, self.max_messages)

    def _parse_messages(self, csv_data, limit):
        """Convert the exported CSV text into a list of message dicts.

        The first line is skipped as a header. Blank lines (such as the empty
        string left after a trailing line break) are ignored so that exports
        shorter than ``limit`` rows do not fail on an empty row.
        """
        body_lines = csv_data.split('\r\n')[1:]
        data_lines = [line for line in body_lines if line.strip()]
        messages = []
        for line in data_lines[:limit]:
            # The last ';'-separated piece is discarded, as in the original
            # export handling (presumably rows end with a trailing ';').
            # NOTE(review): a plain split does not honour quoted fields that
            # contain ';' -- confirm the export never produces them.
            row = line.split(';')[:-1]
            data = {'ref': row[0]}
            data['created'] = self._format_created(row[3])
            data['title'] = row[7]
            data['phone'] = row[4]
            data['customer'] = row[2]
            data['email'] = row[5]
            data['message'] = row[6]
            print("--> {0} {1} {2}".format(data['created'], data['email'], data['phone']))  # noqa
            messages.append(data)
        return messages

    @staticmethod
    def _format_created(raw_date):
        """Reorder a ``day/month/year - hour`` value as ``year-month-day hour``.

        Spaces are removed from every '/'-separated part, so an input shaped
        like ``17/08/2017 - 20:24`` becomes ``2017-08-17 20:24``.
        """
        parts = [part.replace(' ', '') for part in raw_date.split('/')]
        day, month = parts[:2]
        year, hour = parts[2].split('-')
        return '{0}-{1}-{2} {3}'.format(year, month, day, hour)

    def retrieve(self):
        """Log in, fetch the messages and return them as a JSON string."""
        self.login()
        messages = self.get_messages()
        return json.dumps(messages, encoding='iso-8859-1')

    # Custom exception handling
    def handle_exception(self, error):
        """Save diagnostics for ``error`` and raise a summarizing Exception.

        A screenshot is captured to ``screenshot.png`` and the current URL,
        the error and the page content are written to
        ``ghost_page_dump.log``.

        Raises:
            Exception: always, wrapping ``repr(error)``.
        """
        self.session.capture_to('screenshot.png')
        # The page may be missing; do not let that mask the original error.
        page = self.page
        page_url = getattr(page, 'url', None)
        page_content = getattr(page, 'content', None)
        with open('ghost_page_dump.log', 'w') as f:
            f.write('URL: %s\nError:%s\n'%(str(page_url), error))
            if page_content is not None:
                f.write(page_content)
        raise Exception("%s\n Screenshot captured to screenshot.png, page content dumped to ghost_page_dump.log \n"%repr(error))
def get_args():
    """Parse the command line and return the resulting namespace.

    Recognized options are ``-s`` (stored as ``spider``) and ``-d`` (stored
    as ``output_dir``); both default to ``None`` when omitted.
    """
    cli = argparse.ArgumentParser()
    for flag, destination in (('-s', 'spider'), ('-d', 'output_dir')):
        cli.add_argument(flag, action='store', dest=destination, default=None)
    return cli.parse_args()
def get_spider(spider_name):
    """Return the spider class registered in this module for ``spider_name``.

    The class name is built as ``spider_name.title() + 'Spider'`` (so ``zap``
    resolves to ``ZapSpider``) and looked up on this module.

    Args:
        spider_name: short spider name as given on the command line.

    Returns:
        The matching concrete subclass of ``BaseSpider`` (the class itself,
        not an instance).

    Raises:
        ValueError: if no such class exists, the attribute is not a
            ``BaseSpider`` subclass, or the name resolves to ``BaseSpider``
            itself, whose ``retrieve`` is not implemented.
    """
    class_name = spider_name.title() + 'Spider'
    candidate = getattr(thismodule, class_name, None)
    if (inspect.isclass(candidate)
            and issubclass(candidate, BaseSpider)
            and candidate is not BaseSpider):
        return candidate
    raise ValueError('invalid spider name: ' + spider_name.title() )
if __name__ == '__main__':
    # Script entry point: resolve the requested spider, run it and save the
    # result into the optional output directory.
    cli_options = get_args()
    if cli_options.spider is None:
        raise Exception('you must provide a spider name with -s param')
    spider_class = get_spider(cli_options.spider)
    runner = spider_class()
    print(' Spider is running...')
    runner.save(cli_options.output_dir)
    print('done.')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment