add encodingUtils to railgun
# coding: UTF-8
# User: haku
# Date: 14-5-22
# Time: 4:01
# Modifier: scq000
__author__ = 'haku-mac'

from pattern import Pattern
from logger import Logger
import re
import requests
import json
from bs4 import BeautifulSoup
class RailGun:
    def __init__(self, encodeUtils):
        self.global_data = {}
        self.shell_groups = {}
        self.logger = Logger.getLogger()
        self.encodeUtils = encodeUtils

    # set task data directly from a dict
    def setTaskData(self, task_data):
        self.task_data = dict(task_data)
    # set task data from a JSON task file
    def setTask(self, tfile, ext=None):
        assert isinstance(tfile, file), "task file should be a file instance, got " + str(type(tfile))
        if not ext:
            # ext is currently unused; only JSON task files are supported
            ext = tfile.name.split(".")[-1]
        task_data = json.load(tfile)
        assert task_data, "Task Data is Empty"
        self.task_data = dict(task_data)
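    # For illustration only (not part of the original gist), a task file
    # passed to setTask could look like:
    #   {"action": "main", "name": "example",
    #    "subaction": [{"action": "fetcher", "url": "http://example.com"}]}
    # fire() then walks this tree via __parserShells.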
    # set a value in the shared global data
    def setGlobalData(self, key, value):
        self.global_data[key] = value

    # run the task tree and collect shells
    def fire(self):
        self.__parserShells(self.task_data)
        return self.shell_groups

    # get parsed shells for a group
    def getShells(self, group_name='default'):
        return self.shell_groups.get(group_name)
    def __parserShells(self, task_entry):
        """
        Recursively walk a task entry: dispatch its action to the
        matching private handler, then recurse into its sub-actions.
        :param task_entry: dict describing one task node
        :return: None
        """
        if isinstance(task_entry, unicode):
            return
        # do the current action
        actionname = task_entry["action"].strip()
        if task_entry.get('shellid') is not None:
            self.logger.info("current shell [" + task_entry.get('shellgroup') + ":" +
                             str(task_entry.get('shellid')) + "]")
        actionMap = {
            'main': '__main',
            'shell': '__createShell',
            'faketask': '__faketask',
            'fetcher': '__fetch',
            'parser': '__parser',
        }
        if actionname in actionMap:
            # resolve the name-mangled private method for this action
            worker = getattr(self, '_RailGun{}'.format(actionMap[actionname]))
            if callable(worker):
                task_entry = worker(task_entry)
        if task_entry.get('subaction') is None:
            return
        for subtask in task_entry['subaction']:
            # skip the bare 'datas' key when subaction is a dict
            if 'datas' == str(subtask):
                continue
            # if the entry is not a faketask and has datas, copy them to the subtask
            if subtask['action'] != 'faketask' and task_entry.get('datas') is not None:
                subtask['datas'] = task_entry.get('datas')
            # pass shell identity down to the subtask
            if task_entry.get('shellgroup') is not None:
                subtask['shellgroup'] = task_entry.get('shellgroup')
            if task_entry.get('shellid') is not None:
                subtask['shellid'] = task_entry.get('shellid')
            self.__parserShells(subtask)
        return
    def __main(self, task_entry):
        self.logger.info(task_entry['name'] + " is now running")
        return task_entry
    # fetch urls with a webkit browser (for pages that need js rendering)
    def __fetch_webkit(self, task_entry):
        p = Pattern(task_entry, self.__getCurrentShell(task_entry), self.global_data)
        # imported lazily so the webkit dependency is only needed when used
        import cwebbrowser
        task_entry['datas'] = []
        urls = p.convertPattern('url')
        timeout = task_entry.get('timeout', 120)
        delay = task_entry.get('delay', 0)
        for url in urls:
            self.logger.info("fetching " + url)
            data = ""
            if not url:
                # do not fetch an empty url
                continue
            browser = cwebbrowser.CWebBrowser()
            browser.setHeaders(task_entry.get('headers', []))
            # browser.show()
            try:
                browser.load(url=url, load_timeout=timeout, delay=delay)
            except cwebbrowser.Timeout:
                self.logger.error("fetch " + url + " timeout")
            except Exception, exception:
                self.logger.error("fetch " + url + " error")
                print "Exception message:", exception
            else:
                html = browser.html()
                if html:
                    data = html.encode('utf-8')
                else:
                    self.logger.error("fetch " + url + " failed with no response")
            task_entry['datas'].append(data)
            browser.close()
        return task_entry
    # fetch urls with requests
    def __fetch_requests(self, task_entry):
        p = Pattern(task_entry, self.__getCurrentShell(task_entry), self.global_data)
        timeout = task_entry.get('timeout', 120)
        urls = p.convertPattern('url')
        s = requests.session()
        headers = task_entry.get('headers', [])
        task_entry['datas'] = []
        if not urls:
            return task_entry
        for url in urls:
            self.logger.info("fetching " + url)
            data = ""
            if not url:
                # do not fetch an empty url
                continue
            try:
                response = s.get(url, timeout=timeout, headers=headers)
                if 200 != response.status_code:
                    self.logger.error("fetch " + url + " failed with code " + str(response.status_code))
                    if self.encodeUtils:
                        # fall back to encodeUtils when the plain request fails
                        data = str(self.encodeUtils.start(url))
                        self.logger.debug("encodeUtils returned " + data)
                else:
                    data = response.text
            except requests.RequestException:
                self.logger.error("fetch " + url + " failed in sockets")
            task_entry['datas'].append(data)
        return task_entry
    # fetch urls, with webkit when requested, otherwise with requests
    def __fetch(self, task_entry):
        if task_entry.get("webkit", False):
            return self.__fetch_webkit(task_entry)
        return self.__fetch_requests(task_entry)

    # placeholder task used to carry data into sub-actions
    def __faketask(self, task_entry):
        return task_entry
    # parse fetched data with BeautifulSoup
    def __parser(self, task_entry):
        rule = task_entry['rule'].strip()
        self.logger.info("parsing with rule " + rule)
        strip = task_entry.get('strip')
        datas = task_entry.get('datas')
        pos = task_entry.get('pos')
        attr = task_entry.get('attr')
        parsed_datas = []
        for data in datas:
            self.logger.debug("parse from raw " + str(data))
            soup = BeautifulSoup(data)
            parsed_data_sps = soup.select(rule)
            # keep only the element at pos, if requested
            if pos is not None:
                if pos > len(parsed_data_sps) - 1:
                    parsed_data_sps = []
                else:
                    parsed_data_sps = [parsed_data_sps[pos]]
            for tag in parsed_data_sps:
                tag = unicode(tag)
                # extract a single attribute instead of the whole tag
                if attr is not None:
                    attr_data = BeautifulSoup(tag.encode("utf8"))
                    tag = attr_data.contents[0].get(attr)
                # strip comments, tags and line breaks from the result
                if strip == 'true':
                    tag = re.sub(r'<!--.*-->', '', tag)
                    tag = re.sub(r'<.*?>', '', tag)
                    tag = re.sub(r'[\r\n]', '', tag)
                parsed_datas.append(tag)
        self.logger.info("after parsing " + str(len(parsed_datas)))
        # store the result on the current shell if a field name is given
        current_shell = self.__getCurrentShell(task_entry)
        if current_shell is not None and task_entry.get('setField') is not None and len(parsed_datas) > 0:
            fieldname = task_entry.get('setField')
            self.logger.debug("set " + fieldname + " as " + str(parsed_datas))
            current_shell[fieldname] = parsed_datas
        task_entry['datas'] = parsed_datas
        return task_entry
    # split data into shells: every shell carries exactly one data item
    def __createShell(self, task_entry):
        datas = task_entry.get('datas')
        subacts = []
        self.logger.info(str(len(datas)) + " shells created")
        shellgroup = task_entry.get('group', 'default')
        shellid = 0
        self.shell_groups[shellgroup] = {}
        for data in datas:
            shellid += 1
            # init an empty shell for this id
            self.shell_groups[shellgroup][shellid] = {}
            # the task entry is split into pieces:
            # sub-action count = current sub-action count * shell count
            subact = {
                "action": "faketask",
                "shellgroup": shellgroup,
                "shellid": shellid,
                "datas": [data],
                "subaction": task_entry["subaction"]
            }
            subacts.append(subact)
        task_entry["subaction"] = subacts
        return task_entry
    # look up the shell a task entry belongs to, or None
    def __getCurrentShell(self, task_entry):
        if task_entry.get('shellgroup') is None:
            return None
        shellgroup = task_entry['shellgroup']
        if self.shell_groups.get(shellgroup) is None:
            return None
        shellid = task_entry['shellid']
        return self.shell_groups[shellgroup][shellid]
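For context, here is a minimal sketch of how the class might be driven end to end. It is illustrative only: EncodeUtils is a hypothetical stand-in (RailGun only requires an object exposing a start(url) method, which __fetch_requests falls back to on a non-200 response), the task dict simply exercises the main, fetcher, and parser actions that __parserShells dispatches on, and Pattern's expansion of the url field is assumed from context.

# A hypothetical fallback fetcher: only a start(url) method is required.
class EncodeUtils(object):
    def start(self, url):
        # e.g. refetch with explicit encoding handling; stubbed out here
        return ""


if __name__ == '__main__':
    task = {
        "action": "main",
        "name": "example task",
        "subaction": [{
            "action": "fetcher",
            "url": "http://example.com",
            "subaction": [{
                "action": "parser",
                "rule": "title",
                "strip": "true"
            }]
        }]
    }
    railgun = RailGun(EncodeUtils())
    railgun.setTaskData(task)
    # fire() returns the shell groups (empty here, since no 'shell' action is used)
    print railgun.fire()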