Skip to content

Instantly share code, notes, and snippets.

@mydreambei-ai
Last active June 4, 2018 07:30
Show Gist options
  • Save mydreambei-ai/21197d5e8192624513369f2f4236a00f to your computer and use it in GitHub Desktop.
Save mydreambei-ai/21197d5e8192624513369f2f4236a00f to your computer and use it in GitHub Desktop.
page form identify
# -*- coding: utf-8 -*-
from __future__ import print_function
import lxml.html
from collections import OrderedDict
import re
def after_first_check(func):
    """Decorate a ``FormIdentity`` re-check method so it only runs when needed.

    The target field is taken from the last ``_``-separated part of the
    decorated function's name (``input_recheck_username`` -> ``username``).

    On every call the wrapper:

    1. runs ``self._first_check()`` once if ``self._pickup_flag`` is still
       false, then sets the flag;
    2. skips the wrapped method when ``self.form_dict`` already holds a
       non-None entry for the field;
    3. skips the wrapped method when ``self._success`` is already true;
    4. otherwise calls the wrapped method and returns its result.

    A skipped call returns ``None``.
    """
    # Local import keeps the block self-contained without touching the
    # module-level import section.
    import functools

    # The field only depends on the decorated function, so derive it once.
    field = func.__name__.split('_')[-1]

    @functools.wraps(func)  # preserve __name__/__doc__ of the wrapped method
    def newfunc(*args, **kwargs):
        self = args[0]
        if not self._pickup_flag:
            self._first_check()
            self._pickup_flag = True
        if self.form_dict.get(field, None) is not None:
            # Field already identified: nothing to re-check.
            return None
        if self._success:
            # The form was already fully resolved.
            return None
        return func(*args, **kwargs)

    return newfunc
class FormIdentity(object):
    """Heuristically identify the login controls of an HTML form.

    Given a form (markup or an lxml element), the class tries to work out
    which controls are the username/email field, the password field and the
    submit control, and can render CSS selectors / a JavaScript snippet that
    fills the form in and clicks submit.

    State:
        root: the lxml element the search is performed on.
        form_dict: maps ``'username'``, ``'password'``, ``'email'`` and
            ``'submit'`` to the element identified for that role.
        result: maps the same keys to CSS selector paths once
            ``_success_check`` has found enough fields.
        _pickup_flag: true once ``_first_check`` has been run.
        _success: true once a usable set of fields has been resolved.
    """

    # Patterns searched in the lower-cased ``name`` of an input.
    passwd_regex = r'(passw(or)?d?|pwd)'
    user_regex = r'((login|u)id|name|user|account)'
    email_regex = r'e?mail'
    # Patterns searched in every (lower-cased) attribute value of an element.
    user_ph_regex = r'(手机号|邮箱|账号)'
    passwd_ph_regex = r'密码'
    # '登陆' is a common alternative spelling of '登录'; the original '陆录'
    # (presumably a typo) is kept so existing matches are unaffected.
    submit_ph_regex = r'(submit|登录|登陆|陆录)'

    def __init__(self, root):
        """Create an identifier for ``root`` (see ``set_root`` for accepted types)."""
        self.root = self.set_root(root)
        self.form_dict = {}
        # Input ``type`` values ignored by ``find_inputs`` unless overridden.
        self.default_input_filters = ['checkbox', 'radio', 'image', 'reset', 'select', 'hidden']
        self._pickup_flag = False
        self._success = False
        self.result = {}

    @staticmethod
    def set_root(data):
        """Return an lxml element for ``data``.

        ``str`` is parsed with ``lxml.html.fromstring``; ``bytes`` is decoded
        with the default codec first; an ``lxml.etree.ElementBase`` instance
        is returned unchanged.

        Raises:
            TypeError: for any other type.
        """
        if isinstance(data, str):
            return lxml.html.fromstring(data)
        if isinstance(data, bytes):
            return lxml.html.fromstring(data.decode())
        if isinstance(data, lxml.etree.ElementBase):
            return data
        raise TypeError("data must be str, bytes or lxml.etree.ElementBase")

    @property
    def attributes(self):
        """Attribute mapping of the root element."""
        return self.root.attrib

    @property
    def form_path(self):
        """CSS selector path of the root element (see ``get_item_path``)."""
        return self.get_item_path(self.root)

    def get_item_path(self, item):
        """Build a CSS selector fragment for ``item``.

        Returns ``tag#id`` when the element has a non-empty ``id``,
        otherwise ``tag.class1.class2`` when it has a non-empty ``class``,
        otherwise just the tag name.
        """
        attrib = item.attrib
        attrib_id = attrib.get('id', None)
        if attrib_id:
            return '{0}#{1}'.format(item.tag, attrib_id)
        attrib_class = attrib.get('class', None)
        if attrib_class:
            return '{0}.{1}'.format(item.tag, self.js_class_path(attrib_class))
        return item.tag

    def find_inputs(self, filters=None):
        """Return the ``<input>`` descendants whose ``type`` is not filtered out.

        Args:
            filters: collection of ``type`` values to exclude; defaults to
                ``self.default_input_filters``.

        The lookup is best effort: if an ``AttributeError`` is raised while
        scanning (for example the root has no ``xpath`` or an element has no
        ``type`` attribute), the elements collected so far are returned.
        """
        if filters is None:
            filters = self.default_input_filters
        found = []
        try:
            for node in self.root.xpath(".//input"):
                if node.type not in filters:
                    found.append(node)
        except AttributeError:
            # Deliberate best effort, see docstring.
            pass
        return found

    @staticmethod
    def js_class_path(strclass):
        """Turn a ``class`` attribute value into a ``a.b.c`` selector fragment.

        Splitting on any whitespace run (rather than on single spaces)
        avoids empty segments such as ``a..b`` when the attribute contains
        repeated, leading or trailing whitespace.
        """
        return '.'.join(strclass.split())

    def find_buttons(self):
        """Return all ``<button>`` descendants of the root."""
        return self.root.xpath('.//button')

    def find_labels(self):
        """Return all ``<label>`` descendants of the root."""
        return self.root.xpath('.//label')

    def pickup(self):
        """Run the full identification: first pass, then fallback re-checks."""
        self._pickup_flag = True
        self._first_check()
        self._success_check()
        self._recheck()
        self._success_check()

    def _success_check(self):
        """Populate ``self.result`` and set ``self._success`` when possible.

        Success requires a submit control, a password field and a username
        or email field in ``self.form_dict``. Every identified field gets a
        selector path in ``self.result``. If the user field and the password
        field end up with the same path, the password path is disambiguated
        with ``[type=password]``; when the password element is not of type
        ``password`` the ambiguity cannot be resolved and the check fails.
        """
        if self._success:
            return
        fields = self.form_dict
        has_user = 'username' in fields or 'email' in fields
        if not ('submit' in fields and has_user and 'password' in fields):
            return
        for key in fields:
            self.result[key] = self.js_query_input_or_button_path(fields[key])
        # The guard above accepts an email-only form, so fall back to the
        # email selector instead of assuming a 'username' entry exists.
        user_key = 'username' if 'username' in fields else 'email'
        if self.result[user_key] == self.result["password"]:
            password_ele = fields["password"]
            if password_ele.type.lower() != 'password':
                return
            self.result["password"] = "{}[type=password]".format(self.result["password"])
        self._success = True

    def _recheck(self):
        """Run the fallback heuristics for fields the first pass missed."""
        self.input_recheck_username()
        self.input_recheck_submit()

    def _first_check(self):
        """Classify every candidate input with ``input_check``."""
        for node in self.find_inputs():
            self.input_check(node)

    @after_first_check
    def input_recheck_username(self):
        """Fallback: with exactly two inputs, treat them as username and password."""
        inputs = self.find_inputs()
        if len(inputs) == 2:
            self.form_dict['username'] = inputs[0]
            self.form_dict['password'] = inputs[-1]
            self._success_check()

    @after_first_check
    def input_recheck_email(self):
        """Placeholder: no fallback heuristic for the email field yet."""
        pass

    @after_first_check
    def input_recheck_submit(self):
        """Fallback: find a submit control by its attributes or text.

        ``<input type="button">`` elements are tried first, then
        ``<button>`` elements. The first element with an attribute value or
        text content matching ``submit_ph_regex`` is stored as ``'submit'``.
        """
        def looks_like_submit(node):
            if any(re.search(self.submit_ph_regex, value.lower())
                   for value in node.attrib.values()):
                return True
            # Lower-case the text as well so "Submit"/"SUBMIT" match the
            # lower-case pattern, consistent with the attribute check.
            return bool(re.search(self.submit_ph_regex, node.text_content().lower()))

        candidates = [node for node in self.find_inputs() if node.type == 'button']
        candidates.extend(self.find_buttons())
        for node in candidates:
            if looks_like_submit(node):
                self.form_dict['submit'] = node
                return

    def input_check(self, item):
        """Assign ``item`` to every still-unassigned role whose check accepts it."""
        # Built from a list of pairs so the check order is guaranteed on
        # every interpreter (a dict literal only preserves order on newer
        # Python versions).
        funcs = OrderedDict([
            ("username", self.input_check_username),
            ("password", self.input_check_password),
            ("email", self.input_check_email),
            ("submit", self.input_check_submit),
        ])
        for key, check in funcs.items():
            if self.form_dict.get(key, None) is not None:
                continue
            if check(item):
                self.form_dict[key] = item

    def input_check_name_field(self, regex, name):
        """Return True if ``regex`` is found in the lower-cased ``name``.

        A ``name`` without ``lower()`` (e.g. ``None`` when the element has
        no name) is treated as a non-match.
        """
        try:
            return bool(re.search(regex, name.lower()))
        except AttributeError:
            return False

    def input_check_password(self, item):
        """Return True if ``item`` looks like a password field.

        Accepts ``type="password"``, a name matching ``passwd_regex`` or any
        attribute value matching ``passwd_ph_regex``.
        """
        if item.type.lower() == 'password':
            return True
        if self.input_check_name_field(self.passwd_regex, item.name):
            return True
        return any(re.search(self.passwd_ph_regex, value.lower())
                   for value in item.attrib.values())

    def input_check_username(self, item):
        """Return True if ``item`` looks like a username field.

        Accepts a name matching ``user_regex`` or any attribute value
        matching ``user_ph_regex``.
        """
        if self.input_check_name_field(self.user_regex, item.name):
            return True
        return any(re.search(self.user_ph_regex, value.lower())
                   for value in item.attrib.values())

    def input_check_email(self, item):
        """Return True for ``type="email"`` or a name matching ``email_regex``."""
        if item.type.lower() == 'email':
            return True
        return self.input_check_name_field(self.email_regex, item.name)

    def input_check_hidden(self, item):
        """Placeholder: hidden inputs are not inspected."""
        pass

    def input_check_submit(self, item):
        """Return True if ``item`` has ``type="submit"``."""
        return item.type.lower() == 'submit'

    def tostring(self):
        """Serialize the root element with ``lxml.html.tostring``."""
        return lxml.html.tostring(self.root)

    def js_query_form(self):
        """Return a ``document.querySelector(...)`` expression for the form."""
        return 'document.querySelector("{}")'.format(self.form_path)

    def js_query_input_path(self, item):
        """Return the selector ``"<form path> <input path>"`` for an input.

        When the input has neither id nor class, an attribute selector is
        used instead: ``input[type=...]`` if a ``type`` attribute is set,
        otherwise ``input[name=...]`` if the input has a name.

        NOTE(review): attribute values are inserted unquoted, so values
        containing CSS-special characters would yield an invalid selector.
        """
        input_path = self.get_item_path(item)
        if input_path == item.tag:
            if item.name:
                input_path = 'input[name={}]'.format(item.name)
            # Checked second on purpose: an explicit type overrides the name.
            type_attr = item.attrib.get('type', '')
            if type_attr:
                input_path = 'input[type={}]'.format(type_attr)
        return '{} {}'.format(self.form_path, input_path)

    def js_query_input(self, item):
        """Return a ``document.querySelector(...)`` expression for an input."""
        return 'document.querySelector("{0}")'.format(self.js_query_input_path(item))

    def js_query_button_path(self, item):
        """Return the selector ``"<form path> <button path>"`` for a button."""
        return '{0} {1}'.format(self.form_path, self.get_item_path(item))

    def js_query_button(self, item):
        """Return a ``document.querySelector(...)`` expression for a button."""
        return 'document.querySelector("{0}")'.format(self.js_query_button_path(item))

    def js_query_input_or_button_path(self, item):
        """Dispatch to the input or button selector builder by tag.

        Returns ``None`` for any tag other than ``input`` or ``button``.
        """
        if item.tag == 'input':
            return self.js_query_input_path(item)
        if item.tag == 'button':
            return self.js_query_button_path(item)
        return None

    def js_query_input_or_button(self, item):
        """Return a ``document.querySelector(...)`` expression for an input or button."""
        return 'document.querySelector("{0}")'.format(self.js_query_input_or_button_path(item))

    def to_js(self, uv, pv):
        """Render JavaScript that fills in the form and clicks submit.

        Args:
            uv: text placed verbatim on the right of ``u.value =``.
            pv: text placed verbatim on the right of ``p.value =``.

        ``uv`` and ``pv`` are not quoted or escaped here, so callers must
        pass valid JavaScript expressions (e.g. already-quoted literals).

        Returns:
            The JavaScript snippet when identification succeeded; otherwise
            ``self.form_dict`` (the partially identified fields).
        """
        if not self._success:
            return self.form_dict
        # Prefer the username field; fall back to the email field.
        user_item = self.form_dict.get('username', None)
        if user_item is None:
            user_item = self.form_dict.get('email')
        template = (
            'var f = {form};var u = {user};var p = {password};var b = {submit};\n'
            'u.value = {user_value};\n'
            'p.value = {passwd_value};\n'
            'b.click();\n'
        )
        return template.format(
            form=self.js_query_form(),
            user=self.js_query_input(user_item),
            password=self.js_query_input(self.form_dict.get('password')),
            submit=self.js_query_input_or_button(self.form_dict.get('submit')),
            user_value=uv,
            passwd_value=pv,
        )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment