Created
October 15, 2019 14:26
-
-
Save zevtyardt/d54c821ac67d7ca32cf85c353b7b2dab to your computer and use it in GitHub Desktop.
Extract all HTML forms into dict format
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from argparse import Namespace | |
import sys | |
import re | |
import shlex | |
class _Pattern:
    """Pre-compiled regular expressions used to pick HTML forms apart.

    Every pattern is case-insensitive and lets ``.`` match newlines
    (``(?si)``), except ``remove_tag`` which is anchored at the start of the
    string. Tag names are followed by ``\\b`` so that look-alike tags such as
    ``<format>``, ``<inputs>`` or ``<optional>`` are not mistaken for form
    markup.
    """

    # A whole form: from the opening ``<form ...>`` to the first ``</form>``.
    all_forms = re.compile(r"(?si)<\s*form\b.*?<\s*/\s*form\s*>")

    # Opening ``<form ...>`` tag; group 1 is the raw attribute text.
    header = re.compile(r"(?si)<\s*form\b(.*?)>")

    # One ``name=value`` pair. Groups: (1) attribute name, (2) the value as
    # written (quotes included), (3) the text between the quotes, which is
    # empty for unquoted values. The name may not contain whitespace, so a
    # preceding valueless attribute ("disabled value=...") is not glued to it.
    # Unquoted values run up to the next whitespace, quote or angle bracket.
    attrs = re.compile(
        r"(?si)([^\s=\"'<>/]+)\s*=\s*((?:[\"'](.*?)[\"'])|[^\s\"'<>]+)")

    # Leading ``<tagname`` of a tag, stripped before attributes are parsed.
    remove_tag = re.compile(r"^<\s*\w+\s*")

    # A form control. Groups: (1) tag name, (2) attribute text without the
    # surrounding whitespace and without a trailing ``/``.
    all_form_tags = re.compile(
        r"(?si)<\s*((?:input|textarea|select|button))\b\s*(.*?)\s*/?\s*>")

    # Opening ``<option ...>`` tag (whole tag, no groups).
    options = re.compile(r"(?si)<\s*option\b.*?>")
class _Extractor:
    """Regex-based extractor that turns the forms of an HTML page into dicts.

    Typical use: call ``extract_all_forms`` on the page source, then hand each
    returned match to ``extract_header`` and ``extract_name_value_tags``.
    """

    # Class-level defaults so the helpers also work before
    # ``extract_all_forms`` / ``extract_name_value_tags`` stored any text.
    source = ""
    _form = ""

    # Valueless ("boolean") attributes that are recorded when present.
    _boolean_attrs = frozenset((
        "autofocus", "checked", "disabled", "hidden", "multiple",
        "novalidate", "readonly", "required", "selected",
    ))
    # A bare word left in a tag once every ``name=value`` pair is removed.
    _bare_word = re.compile(r"[^\s<>/=\"']+")

    def _extract_full_header(self, form):
        """Return every attribute of the opening ``<form>`` tag.

        ``form`` is the form's HTML or a regex match over it. ``action``,
        ``enctype`` and ``method`` fall back to "/",
        "application/x-www-form-urlencoded" and "get" when absent.
        """
        default = dict(action="/", enctype="application/x-www-form-urlencoded", method="get")
        if hasattr(form, "group"):
            form = form.group()
        head = _Pattern.header.search(form)
        if head is not None:
            default.update(self.extract_attrs(head.group(1)))
        return default

    def _find_options_by_data(self, tag, data):
        """Locate the element ``<tag data>`` and read its ``<option>`` tags.

        ``data`` is the element's raw attribute text. The form currently being
        processed is searched first, then the whole page, so that identical
        elements in different forms are not mixed up.

        Returns ``(values, selected)``; ``([], "")`` when nothing is found.
        """
        # ``tag`` and ``data`` come straight from the HTML and may contain
        # regex metacharacters (e.g. name="items[]"), hence the escaping.
        element = re.compile(
            r"(?si)<\s*{tag}\b\s*{data}\s*>(.*?)<\s*/\s*{tag}\s*>".format(
                tag=re.escape(tag), data=re.escape(data)))
        for text in (self._form, self.source):
            found = element.search(text)
            if found is not None:
                return self._extract_option_value(
                    _Pattern.options.finditer(found.group(1)))
        return [], ""

    def _extract_option_value(self, options):
        """Collect the values of ``<option>`` tags.

        ``options`` is an iterable of option tags (strings or matches).
        Options without a ``value`` attribute are skipped. A disabled first
        option is treated as a placeholder and reported as "(*)".

        Returns ``(values, selected)`` where ``selected`` is the value of the
        last option carrying ``selected`` ("" when there is none).
        """
        values, selected = [], ""
        for num, opt in enumerate(options):
            d = self.extract_attrs(opt)
            value = d.get("value")
            if value is None:
                continue
            # Presence, not truthiness: "selected" and "disabled" are usually
            # written without a value.
            if "disabled" in d and num == 0:
                value = "(*)"
            if "selected" in d and value != "(*)":
                selected = value
            values.append(value)
        return values, selected

    @classmethod
    def extract_attrs(cls, tag):
        """Parse the attributes of a tag into a dict.

        ``tag`` is a whole tag, just its attribute text, or a regex match.
        Attribute names are lower-cased. Known valueless attributes (see
        ``_boolean_attrs``) map to "". ``class`` and ``id`` become lists of
        whitespace-separated tokens and ``style`` becomes a
        ``{property: value}`` dict.
        """
        if hasattr(tag, "group"):
            tag = tag.group()
        tag = _Pattern.remove_tag.sub("", tag)

        attrs, bare = {}, []
        for raw_name, token, inner in _Pattern.attrs.findall(tag):
            words = raw_name.split()
            if not words:
                continue
            # Anything in front of the real name is a valueless attribute
            # that the pattern matched together with it.
            bare.extend(words[:-1])
            attrs[words[-1].lower()] = inner if token[:1] in ("'", '"') else token
        # Valueless attributes that follow the last ``name=value`` pair.
        bare.extend(cls._bare_word.findall(_Pattern.attrs.sub(" ", tag)))
        for word in bare:
            word = word.lower()
            if word in cls._boolean_attrs:
                attrs.setdefault(word, "")

        for key in ("class", "id"):
            if key in attrs:
                attrs[key] = attrs[key].split()
        if "style" in attrs:
            declarations = {}
            for declaration in attrs["style"].split(";"):
                # Split on the first ":" only, so values such as
                # "url(http://...)" survive; empty pieces (trailing ";")
                # are ignored.
                prop, sep, value = declaration.partition(":")
                if sep and prop.strip():
                    declarations[prop.strip()] = value.strip()
            attrs["style"] = declarations
        return attrs

    def extract_all_forms(self, source):
        """Remember ``source`` and return an iterator of matches, one per form."""
        self.source = source
        return _Pattern.all_forms.finditer(source)

    def extract_header(self, form):
        """Return the form's ``method``, ``action``, ``name`` and ``enctype``.

        ``name`` is ``None`` when the form has no such attribute.
        """
        full = self._extract_full_header(form)
        return {key: full.get(key) for key in ("method", "action", "name", "enctype")}

    def extract_name_value_tags(self, form):
        """Return the form's controls as ``{"name", "value", "type"}`` dicts.

        ``form`` is the form's HTML or a regex match over it. Controls are
        listed in document order. Radio buttons and checkboxes sharing a name
        are merged into one entry, placed where the first of them appears,
        whose ``value`` is the list of their values. A ``select`` entry has
        the list of option values as ``value`` and, when an option is
        selected, a ``selected`` key.
        """
        if hasattr(form, "group"):
            form = form.group()
        self._form = form  # lets <select> lookups prefer this form

        elements, groups = [], {}
        for tag in _Pattern.all_form_tags.finditer(form):
            d = self.tag_elements(tag)
            name, kind = d.get("name"), d.get("type")
            if kind.lower() in ("radio", "checkbox"):
                group = groups.get(name)
                if group is None:
                    group = groups[name] = {"name": name, "value": [], "type": kind}
                    elements.append(group)
                group["value"].append(d.get("value"))
            else:
                entry = {"name": name, "value": d.get("value"), "type": kind}
                if kind.lower() == "select" and d.get("selected"):
                    entry["selected"] = d["selected"]
                elements.append(entry)
        return elements

    def tag_elements(self, tags):
        """Return the attributes of one form control, with defaults applied.

        ``tags`` is a ``(tag_name, attribute_text)`` pair or a match whose
        groups are that pair. ``type``, ``value`` and ``name`` default to
        "text", "" and "". For a ``select`` the option values are looked up
        and stored as ``value``, together with ``type="select"`` and
        ``selected``.
        """
        default = dict(type="text", value="", name="")
        if hasattr(tags, "groups"):
            tags = tags.groups()
        tag, data = tags
        default.update(self.extract_attrs(data))
        if tag.lower() == "select":
            value, selected = self._find_options_by_data(tag, data)
            default.update({"value": value, "type": "select", "selected": selected})
        return default
# example usage
def wrapper(source):
    """Print a summary of every form found in the HTML text ``source``.

    For each form the header dict is printed first, then one line per
    extracted element, then an empty line as separator.
    """
    extractor = _Extractor()
    for match in extractor.extract_all_forms(source):
        summary = extractor.extract_header(match)
        fields = extractor.extract_name_value_tags(match)
        print(summary)
        for field in fields:
            print(field)
        print()
# Run the example only when executed as a script, so that importing this
# module does not trigger a network request.
if __name__ == "__main__":
    import requests

    if len(sys.argv) < 2:
        sys.exit("usage: python {} URL".format(sys.argv[0]))
    # Without a timeout requests may wait forever on an unresponsive host.
    # To parse a local file instead: source = open("form.html").read()
    source = requests.get(sys.argv[1], timeout=30).text
    wrapper(source)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment