Skip to content

Instantly share code, notes, and snippets.

@huogerac
Last active August 17, 2017 20:24
Show Gist options
  • Save huogerac/f8b9c55efe1ce5435ec553d3bc43ea3a to your computer and use it in GitHub Desktop.
Save huogerac/f8b9c55efe1ce5435ec553d3bc43ea3a to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
from __future__ import unicode_literals
import os
import codecs
import argparse
import inspect
import sys
import re
import csv
import json
import urllib
import io
from datetime import date, datetime, timedelta
import mechanize
import xlrd
from bs4 import BeautifulSoup
#zap (using pyside - ghost)
from ghost import Ghost
'''
USAGE:
./get_contacts.py -s=zap
This will download a json in the current directory, but you can do this:
./get_contacts.py -s=zap -d=/my/dir
CREATING A NEW SPIDER:
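A spider is a subclass of BaseSpider that implements the retrieve() method.
The class must be named <Name>Spider so that -s=<name> can find it.
retrieve() must return a JSON string encoding a list of objects in this format:
[
{ref: unicode, created: unicode date-time string ('year-month-day time'), title: unicode,
phone: unicode, customer: unicode, email: unicode, message: unicode}
]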
'''
# Handle to this very module object; get_spider() uses it to look up
# spider classes by their name.
thismodule = sys.modules[__name__]
class BaseSpider(object):
    """Base class for spiders: fetches data and saves it as a JSON file.

    Subclasses set ``spider_name`` (used to build the output file name)
    and implement ``retrieve``.
    """

    spider_name = 'unknown_spider'

    def retrieve(self):
        """Return the serialized payload to be written by ``save``.

        Raises:
            NotImplementedError: always, until a subclass overrides it.
        """
        raise NotImplementedError

    def save(self, output_dir=None):
        """Write the result of ``retrieve`` to ``<spider_name>.json``.

        Args:
            output_dir: directory that receives the file; it is created
                when missing. ``None`` writes to the current directory.

        Returns:
            The path of the file that was written.
        """
        data = self.retrieve()
        filename = '{0}.json'.format(self.spider_name)
        if output_dir is not None:
            # Create-then-check instead of check-then-create, so a directory
            # that appears between the two steps is not reported as an error.
            try:
                os.makedirs(output_dir)
            except OSError:
                if not os.path.isdir(output_dir):
                    raise
            output_path = os.path.join(output_dir, filename)
        else:
            output_path = filename
        # The writer below is text-based; normalize byte payloads first so
        # that both byte strings and unicode text are written as UTF-8.
        if isinstance(data, bytes):
            data = data.decode('utf-8')
        with codecs.open(output_path, 'w', encoding='utf-8') as buff:
            buff.write(data)
        return output_path
'''
Zap uses: PySide and Ghost
http://pyside.readthedocs.io/en/latest/building/linux.html
sudo apt-get install python-bs4 python-pyside
pip install pyside
pip install ghost.py --pre
Ubuntu 14
sudo apt-get install build-essential git cmake libqt4-dev libphonon-dev
python2.7-dev libxml2-dev libxslt1-dev qtmobility-dev libqtwebkit-dev
links:
http://pyside.readthedocs.io/en/latest/building/linux.html
https://github.com/jeanphix/Ghost.py
'''
class ZapSpider(BaseSpider):
    """Spider that logs into the Zap extranet and exports its messages.

    It drives a Ghost browser session: ``login`` submits the credentials,
    ``get_messages`` triggers the CSV export and parses the downloaded rows,
    and ``retrieve`` returns those rows serialized as JSON.
    """

    spider_name = 'zap'
    # (email, password) pair typed into the login form; placeholder values.
    credentials = ('email', 'passw*rd')

    def __init__(self, *args, **kws):
        """Start the Ghost browser and open a session with a 15s wait timeout."""
        super(ZapSpider, self).__init__(*args, **kws)
        self.browser = Ghost(log_level=0)
        self.session = self.browser.start(wait_timeout=15)

    def login(self):
        """Open the extranet login page, fill in the credentials and submit.

        Raises:
            Exception: via ``handle_exception`` when filling or submitting
                the form fails.
        """
        # Access the login page
        self.page, extra_resources = self.session.open('https://extranet.zap.com.br/default.aspx', wait=True)
        # Fill the form and evaluate a click on the submit button
        try:
            self.session.set_field_value('#txtEmail', self.credentials[0], expect_loading=False)
            self.session.set_field_value('#txtSenha', self.credentials[1], expect_loading=False)
            self.page, extra_resources = self.session.evaluate(
                "document.getElementById('lnkOk').click();",
                expect_loading=True)
        except Exception as e:
            # Exception handling
            self.handle_exception(e)

    @staticmethod
    def _parse_created(raw):
        """Turn a 'day/month/year - time' value into 'year-month-day time'."""
        # Spaces are dropped from every piece, so 'year - time' becomes
        # 'year-time' and can be split on the dash.
        pieces = [piece.replace(' ', '') for piece in raw.split('/')]
        day, month = pieces[:2]
        year, hour = pieces[2].split('-')
        return '{0}-{1}-{2} {3}'.format(year, month, day, hour)

    @classmethod
    def _parse_row(cls, line):
        """Build one message dict from a ';'-separated line of the export."""
        # The last ';'-separated piece is discarded, as the export is read
        # here as having one extra trailing piece per line.
        # NOTE(review): a plain split cannot handle quoted fields containing
        # ';' - confirm the export never produces them.
        row = line.split(';')[:-1]
        data = {'ref': row[0]}
        data['created'] = cls._parse_created(row[3])
        data['title'] = row[7]
        data['phone'] = row[4]
        data['customer'] = row[2]
        data['email'] = row[5]
        data['message'] = row[6]
        return data

    def get_messages(self, number_rows_to_get=20):
        """Trigger the CSV export and return its first rows as dicts.

        Args:
            number_rows_to_get: maximum number of data rows (after the
                header line) to read from the export.

        Returns:
            A list of dicts with the keys ``ref``, ``created``, ``title``,
            ``phone``, ``customer``, ``email`` and ``message``.

        Raises:
            Exception: via ``handle_exception`` when the export click fails.
        """
        # Access the page containing the download link and emulate a click event on it
        self.page, extra_resources = self.session.open("https://extranet.zap.com.br/mensagem/mensagens.aspx", wait=True)
        try:
            self.page, extra_resources = self.session.evaluate(
                "document.getElementById('ctl00_ContentPlaceHolder1_imgExportarCSV').click();",
                expect_loading=True)
        except Exception as e:
            # Exception handling
            self.handle_exception(e)
        # We always use the last resource file, since it is the complete CSV
        # This export feature actually registers 4 files with the same name in the RESPONSE,
        # being the first three truncated segments of the fourth file, which is complete.
        csv_data = unicode(extra_resources[-1].content, encoding='latin1')
        messages = []
        # Iterate through result file and retrieve messages
        for line in csv_data.split('\r\n')[1:number_rows_to_get + 1]:
            # A file ending in '\r\n' yields a final empty element; with fewer
            # rows than the limit it would be parsed and raise IndexError.
            if not line.strip():
                continue
            data = self._parse_row(line)
            print("--> {0} {1} {2}".format(data['created'], data['email'], data['phone'])) # noqa
            messages.append(data)
        return messages

    def retrieve(self):
        """Log in, fetch the messages and return them as a JSON string."""
        self.login()
        messages = self.get_messages()
        return json.dumps(messages, encoding='iso-8859-1')

    # Custom exception handling
    def handle_exception(self, error):
        """Capture a screenshot and a page dump, then re-raise as Exception.

        Args:
            error: the exception that interrupted the browsing step.

        Raises:
            Exception: always; its message holds ``repr(error)`` and the
                names of the diagnostic files.
        """
        self.session.capture_to('screenshot.png')
        # Read the page defensively so that a missing or None page cannot
        # mask the error being reported.
        page = getattr(self, 'page', None)
        page_url = getattr(page, 'url', None)
        page_content = getattr(page, 'content', None) or ''
        if isinstance(page_content, bytes):
            page_content = page_content.decode('utf-8', 'replace')
        # Explicit UTF-8 so that non-ASCII page content can be written.
        with codecs.open('ghost_page_dump.log', 'w', encoding='utf-8') as f:
            f.write('URL: %s\nError:%s\n'%(str(page_url), error))
            f.write(page_content)
        raise Exception("%s\n Screenshot captured to screenshot.png, page content dumped to ghost_page_dump.log \n"%repr(error))
def get_args(argv=None):
    """Parse the command-line options of the script.

    Args:
        argv: list of argument strings to parse; ``None`` (the default)
            makes argparse read them from ``sys.argv``.

    Returns:
        The argparse namespace with ``spider`` (value of ``-s``) and
        ``output_dir`` (value of ``-d``); both are ``None`` when omitted.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', action='store', dest='spider', default=None,
                        help='name of the spider to run, e.g. zap')
    parser.add_argument('-d', action='store', dest='output_dir', default=None,
                        help='directory that receives the JSON output '
                             '(defaults to the current directory)')
    return parser.parse_args(argv)
def get_spider(spider_name):
    """Return the spider class registered in this module for ``spider_name``.

    The class is looked up as ``<spider_name.title()>Spider`` on this
    module, so ``'zap'`` resolves to ``ZapSpider``.

    Args:
        spider_name: spider name as given on the command line.

    Returns:
        The matching subclass of ``BaseSpider`` (the class, not an instance).

    Raises:
        ValueError: when no such class exists, when the name refers to
            something that is not a ``BaseSpider`` subclass, or when it
            refers to ``BaseSpider`` itself.
    """
    class_name = spider_name.title() + 'Spider'
    candidate = getattr(thismodule, class_name, None)
    # BaseSpider passes issubclass() against itself but cannot retrieve
    # anything, so the name "base" is rejected here rather than failing
    # later with NotImplementedError.
    if (inspect.isclass(candidate)
            and issubclass(candidate, BaseSpider)
            and candidate is not BaseSpider):
        return candidate
    raise ValueError('invalid spider name: ' + spider_name.title() )
if __name__ == '__main__':
    # Script entry point: pick the spider requested with -s, run it and
    # save its output into the directory given with -d (if any).
    cli_options = get_args()
    if cli_options.spider is None:
        raise Exception('you must provide a spider name with -s param')
    spider_class = get_spider(cli_options.spider)
    runner = spider_class()
    print(' Spider is running...')
    runner.save(cli_options.output_dir)
    print('done.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment