Last active
June 4, 2018 07:30
-
-
Save mydreambei-ai/21197d5e8192624513369f2f4236a00f to your computer and use it in GitHub Desktop.
page form identify
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
from __future__ import print_function

import functools
import re
from collections import OrderedDict

import lxml.html
def after_first_check(func):
    """Decorate a ``*_recheck_<field>`` method so it only runs when needed.

    The wrapped method is expected to be bound to an object exposing
    ``_pickup_flag``, ``_success``, ``form_dict`` and ``_first_check()``.
    On every call the wrapper:

    1. runs ``self._first_check()`` once if it has not been run yet and
       records that in ``self._pickup_flag``;
    2. returns ``None`` without calling ``func`` when ``self.form_dict``
       already holds a value for the field, or when ``self._success`` is set;
    3. otherwise returns ``func(*args, **kwargs)``.

    The field name is the text after the last underscore of ``func.__name__``
    (``input_recheck_username`` -> ``username``).
    """
    # The method name never changes, so derive the field once at decoration time.
    field = func.__name__.split('_')[-1]

    @functools.wraps(func)  # keep the wrapped method's __name__ / __doc__
    def newfunc(*args, **kwargs):
        self = args[0]
        if not self._pickup_flag:
            self._first_check()
            self._pickup_flag = True
        if self.form_dict.get(field) is not None:
            # The first pass already identified this field.
            return None
        if self._success:
            # Everything required has been found; nothing left to re-check.
            return None
        return func(*args, **kwargs)

    return newfunc
class FormIdentity(object):
    """Heuristically identify the login controls of an HTML form.

    The instance wraps an lxml element (normally a ``<form>``). ``pickup()``
    inspects its ``<input>``/``<button>`` descendants and records the elements
    it takes for the username, email, password and submit controls in
    ``form_dict``. Once a submit control, a password control and a username
    or email control are all known, ``result`` maps each field name to a CSS
    selector path and ``to_js()`` can render a script that fills and submits
    the form.
    """

    # Patterns searched in the lower-cased ``name`` attribute of an input.
    passwd_regex = r'(passw(or)?d?|pwd)'
    user_regex = r'((login|u)id|name|user|account)'
    email_regex = r'e?mail'
    # Patterns searched in every attribute value of a control (placeholders,
    # titles, ...). Chinese terms: mobile number / mailbox / account.
    user_ph_regex = r'(手机号|邮箱|账号)'
    # Chinese term: password.
    passwd_ph_regex = r'密码'
    # NOTE(review): the last alternative looks like a typo for the common
    # "log in" spelling variant -- confirm with the author before changing it.
    submit_ph_regex = r'(submit|登录|陆录)'

    def __init__(self, root):
        """Parse or adopt *root* (see ``set_root``) and reset detection state."""
        self.root = self.set_root(root)
        # field name -> element chosen for that field
        self.form_dict = {}
        # input types that ``find_inputs`` skips by default
        self.default_input_filters = ['checkbox', 'radio', 'image', 'reset', 'select', 'hidden']
        # True once the first detection pass has run
        self._pickup_flag = False
        # True once all required fields have been resolved
        self._success = False
        # field name -> CSS selector path, filled by ``_success_check``
        self.result = {}

    @staticmethod
    def set_root(data):
        """Return an lxml element for *data*.

        ``str`` is parsed with ``lxml.html.fromstring``; ``bytes`` is decoded
        with the default codec first; an ``lxml.etree.ElementBase`` instance
        is returned unchanged.

        Raises:
            TypeError: if *data* is none of the above.
        """
        if isinstance(data, str):
            return lxml.html.fromstring(data)
        if isinstance(data, bytes):
            return lxml.html.fromstring(data.decode())
        if isinstance(data, lxml.etree.ElementBase):
            return data
        raise TypeError("data must be str or lxml.etree.ElementBase")

    @property
    def attributes(self):
        """Attribute mapping of the root element."""
        return self.root.attrib

    @property
    def form_path(self):
        """CSS selector path of the root element (see ``get_item_path``)."""
        return self.get_item_path(self.root)

    def get_item_path(self, item):
        """Return a CSS selector for *item*.

        The result is ``tag#id`` when the element has an ``id``, otherwise
        ``tag.cls1.cls2`` when it has class names, otherwise the bare tag.
        """
        tag = item.tag
        attrib = item.attrib
        attrib_id = attrib.get('id', None)
        if attrib_id:
            return '{0}#{1}'.format(tag, attrib_id)
        # A whitespace-only class attribute yields no class names; fall back
        # to the bare tag instead of emitting a dangling ".".
        class_path = self.js_class_path(attrib.get('class', None) or '')
        if class_path:
            return '{0}.{1}'.format(tag, class_path)
        return tag

    def find_inputs(self, filters=None):
        """Return the ``<input>`` descendants whose type is not in *filters*.

        *filters* defaults to ``self.default_input_filters``.
        """
        if filters is None:
            filters = self.default_input_filters
        kept = []
        try:
            for node in self.root.xpath(".//input"):
                if node.type not in filters:
                    kept.append(node)
        except AttributeError:
            # An element without a ``type`` accessor stops the scan
            # (presumably a tree not built by lxml.html -- TODO confirm);
            # whatever was collected so far is still returned.
            pass
        return kept

    @staticmethod
    def js_class_path(strclass):
        """Turn a ``class`` attribute value into ``a.b.c`` selector form.

        Splits on any run of whitespace so repeated spaces, tabs or newlines
        never produce empty segments such as ``a..b``.
        """
        return '.'.join(strclass.split())

    def find_buttons(self):
        """Return all ``<button>`` descendants of the root."""
        return self.root.xpath('.//button')

    def find_labels(self):
        """Return all ``<label>`` descendants of the root."""
        return self.root.xpath('.//label')

    def pickup(self):
        """Run the full detection: first pass, fallback re-checks, final check."""
        self._pickup_flag = True
        self._first_check()
        self._success_check()
        self._recheck()
        self._success_check()

    def _success_check(self):
        """Fill ``result`` and set ``_success`` once every required field is known.

        Required: ``submit``, ``password`` and one of ``username``/``email``.
        If the user field and the password field resolve to the same selector,
        the password selector gets a ``[type=password]`` suffix when the
        password element really has that type; otherwise the check gives up
        without marking success.
        """
        if self._success:
            return
        found = self.form_dict
        has_user = 'username' in found or 'email' in found
        if not ('submit' in found and has_user and 'password' in found):
            return
        for field, element in found.items():
            self.result[field] = self.js_query_input_or_button_path(element)
        # An email-only form has no "username" entry; compare whichever user
        # field exists rather than assuming "username" is present.
        user_field = 'username' if 'username' in found else 'email'
        if self.result[user_field] == self.result['password']:
            if found['password'].type.lower() != 'password':
                return
            self.result['password'] = "{}[type=password]".format(self.result['password'])
        self._success = True

    def _recheck(self):
        """Run the fallback heuristics for fields the first pass missed."""
        self.input_recheck_username()
        self.input_recheck_submit()

    def _first_check(self):
        """Classify every candidate input with ``input_check``."""
        for candidate in self.find_inputs():
            self.input_check(candidate)

    @after_first_check
    def input_recheck_username(self):
        """Fallback: with exactly two inputs, take them as username and password."""
        inputs = self.find_inputs()
        if len(inputs) == 2:
            self.form_dict['username'] = inputs[0]
            self.form_dict['password'] = inputs[-1]
            self._success_check()

    @after_first_check
    def input_recheck_email(self):
        """Placeholder: no fallback heuristic for the email field yet."""
        pass

    @after_first_check
    def input_recheck_submit(self):
        """Fallback: pick a button whose attributes or text look like "submit".

        ``<input type="button">`` elements are tried first, then ``<button>``
        elements; the first match is stored as ``form_dict['submit']``.
        """
        def looks_like_submit(control):
            # Lower-case attribute values and text alike so matching is
            # case-insensitive for both sources.
            texts = [value.lower() for value in control.attrib.values()]
            texts.append(control.text_content().lower())
            for text in texts:
                if re.search(self.submit_ph_regex, text):
                    self.form_dict['submit'] = control
                    return True
            return False

        for control in self.find_inputs():
            if control.type == 'button' and looks_like_submit(control):
                return
        for control in self.find_buttons():
            if looks_like_submit(control):
                return

    def input_check(self, item):
        """Record *item* under every still-unassigned field whose check accepts it."""
        # Built from pairs so the check order is guaranteed on every
        # interpreter; a dict literal only preserves order from Python 3.7.
        checks = OrderedDict([
            ("username", self.input_check_username),
            ("password", self.input_check_password),
            ("email", self.input_check_email),
            ("submit", self.input_check_submit),
        ])
        for field, check in checks.items():
            if self.form_dict.get(field, None) is not None:
                continue
            if check(item):
                self.form_dict[field] = item

    def input_check_name_field(self, regex, name):
        """Return True when *regex* is found in the lower-cased *name*.

        Returns False when *name* has no ``lower`` method (e.g. it is None).
        """
        try:
            lowered = name.lower()
        except AttributeError:
            # the element has no usable name
            return False
        return re.search(regex, lowered) is not None

    def input_check_password(self, item):
        """Return True when *item* looks like a password input.

        Accepts ``type="password"``, a name matching ``passwd_regex``, or any
        attribute value matching ``passwd_ph_regex``.
        """
        if item.type.lower() == 'password':
            return True
        if self.input_check_name_field(self.passwd_regex, item.name):
            return True
        for value in item.attrib.values():
            if re.search(self.passwd_ph_regex, value.lower()):
                return True
        return False

    def input_check_username(self, item):
        """Return True when *item* looks like a username input.

        Accepts a name matching ``user_regex`` or any attribute value
        matching ``user_ph_regex``.
        """
        if self.input_check_name_field(self.user_regex, item.name):
            return True
        for value in item.attrib.values():
            if re.search(self.user_ph_regex, value.lower()):
                return True
        return False

    def input_check_email(self, item):
        """Return True for ``type="email"`` or a name matching ``email_regex``."""
        if item.type.lower() == 'email':
            return True
        return bool(self.input_check_name_field(self.email_regex, item.name))

    def input_check_hidden(self, item):
        """Placeholder: hidden inputs are not classified."""
        pass

    def input_check_submit(self, item):
        """Return True when *item* has ``type="submit"``."""
        return item.type.lower() == 'submit'

    def tostring(self):
        """Serialize the root element with ``lxml.html.tostring``."""
        return lxml.html.tostring(self.root)

    def js_query_form(self):
        """Return a ``document.querySelector`` expression for the root element."""
        return 'document.querySelector("{}")'.format(self.form_path)

    def js_query_input_path(self, item):
        """Return ``"<form path> <input selector>"`` for *item*.

        When the element has neither id nor class, the selector is built from
        its ``name`` and ``type`` attributes. Both are kept, so two inputs of
        the same type stay distinguishable. Values are single-quoted so names
        such as ``user[email]`` form a valid selector and the path can still
        be embedded in the double-quoted ``querySelector("...")`` calls.
        """
        input_path = self.get_item_path(item)
        if input_path == item.tag:
            conditions = ''
            if item.name:
                conditions += "[name='{}']".format(item.name)
            input_type = item.attrib.get('type', '')
            if input_type:
                conditions += "[type='{}']".format(input_type)
            if conditions:
                input_path = 'input' + conditions
        return '{} {}'.format(self.form_path, input_path)

    def js_query_input(self, item):
        """Return a ``document.querySelector`` expression for an input."""
        return 'document.querySelector("{0}")'.format(self.js_query_input_path(item))

    def js_query_button_path(self, item):
        """Return ``"<form path> <button selector>"`` for *item*."""
        return '{0} {1}'.format(self.form_path, self.get_item_path(item))

    def js_query_button(self, item):
        """Return a ``document.querySelector`` expression for a button."""
        return 'document.querySelector("{0}")'.format(self.js_query_button_path(item))

    def js_query_input_or_button_path(self, item):
        """Dispatch to the input or button path builder by tag.

        Returns None for any other tag.
        """
        if item.tag == 'input':
            return self.js_query_input_path(item)
        if item.tag == 'button':
            return self.js_query_button_path(item)
        return None

    def js_query_input_or_button(self, item):
        """Return a ``document.querySelector`` expression for an input or button."""
        return 'document.querySelector("{0}")'.format(self.js_query_input_or_button_path(item))

    def to_js(self, uv, pv):
        """Render the script that fills in the credentials and clicks submit.

        Args:
            uv: text placed on the right-hand side of ``u.value = ...``.
            pv: text placed on the right-hand side of ``p.value = ...``.

        Both values are inserted verbatim, so callers must pass valid
        JavaScript expressions (for example an already quoted string literal).

        Returns:
            The script text when detection succeeded, otherwise the current
            ``form_dict`` so the caller can see what was found.
        """
        if not self._success:
            return self.form_dict
        user_item = self.form_dict.get('username', None)
        if user_item is None:
            user_item = self.form_dict.get('email')
        user = self.js_query_input(user_item)
        password = self.js_query_input(self.form_dict.get('password'))
        submit = self.js_query_input_or_button(self.form_dict.get('submit'))
        template = (
            'var f = {form};var u = {user};var p = {password};var b = {submit};\n'
            'u.value = {user_value};\n'
            'p.value = {passwd_value};\n'
            'b.click();\n'
        )
        return template.format(form=self.js_query_form(), user=user, password=password,
                               submit=submit, user_value=uv, passwd_value=pv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment