Skip to content

Instantly share code, notes, and snippets.

@zevtyardt
Created October 15, 2019 14:26
Show Gist options
  • Save zevtyardt/d54c821ac67d7ca32cf85c353b7b2dab to your computer and use it in GitHub Desktop.
Save zevtyardt/d54c821ac67d7ca32cf85c353b7b2dab to your computer and use it in GitHub Desktop.
Extract All HTML Forms into Dict Format
from argparse import Namespace
import sys
import re
import shlex
class _Pattern:
    """Pre-compiled regular expressions used to pick HTML forms apart."""

    # A whole <form ...> ... </form> element (non-greedy, case-insensitive).
    all_forms = re.compile(r"(?si)<\s*form\b.*?<\s*/\s*form\s*>")
    # The opening <form ...> tag; group 1 is the raw attribute text.
    header = re.compile(r"(?si)<\s*form\b(.*?)>")
    # One attribute: a name optionally followed by ="double", ='single' or a
    # bare value.  Exactly one of the groups dq / sq / bare is set when a value
    # is present; none of them is set for a bare attribute such as `disabled`.
    attrs = re.compile(
        r"""(?P<name>[^\s"'<>/=]+)"""
        r"""(?:\s*=\s*(?:"(?P<dq>[^"]*)"|'(?P<sq>[^']*)'|(?P<bare>[^\s"'=<>`]+)))?"""
    )
    # Leading "<tagname " of a tag, removed before attributes are parsed.
    remove_tag = re.compile(r"^<\s*\w+\s*")
    # Opening tag of a form control; groups are (tag name, raw attribute text).
    all_form_tags = re.compile(
        r"(?si)<\s*(input|textarea|select|button)\b\s*(.*?)\s*/?\s*>")
    # Opening <option ...> tag.
    options = re.compile(r"(?si)<\s*option\b.*?>")


class _Extractor:
    """Regex-based extractor turning HTML forms into plain dicts and lists."""

    def _extract_full_header(self, form):
        """Return every attribute of the opening <form> tag, with defaults.

        ``form`` is the form's HTML or a regex match holding it.  ``action``,
        ``enctype`` and ``method`` fall back to "/",
        "application/x-www-form-urlencoded" and "get" when absent.
        """
        header = dict(action="/", enctype="application/x-www-form-urlencoded",
                      method="get")
        if hasattr(form, "group"):
            form = form.group()
        found = _Pattern.header.search(form)
        if found is not None:
            header.update(self.extract_attrs(found.group(1)))
        return header

    def _find_options_by_data(self, tag, data, source=None, pos=0):
        """Return ``(values, selected)`` for the element ``<tag data>``.

        The element is located by its literal tag name and attribute text in
        ``source`` starting at ``pos``.  When ``source`` is omitted the page
        remembered by ``extract_all_forms`` is searched from its beginning (an
        empty page is assumed if none was stored).  ``([], "")`` is returned
        when the element cannot be found.
        """
        if source is None:
            source, pos = getattr(self, "source", ""), 0
        # tag/data come straight from the page, so they must be escaped:
        # attribute text such as name="a[]" is not a valid/faithful regex.
        element = re.compile(
            r"(?si)<\s*{tag}\s*{data}\s*>(.*?)<\s*/\s*{tag}\s*>".format(
                tag=re.escape(tag), data=re.escape(data)))
        found = element.search(source, pos)
        if found is None:
            return [], ""
        return self._extract_option_value(
            _Pattern.options.finditer(found.group(1)))

    def _extract_option_value(self, options):
        """Collect option values and the selected one from <option> tags.

        ``options`` is an iterable of <option> tags (strings or matches).
        Options without a ``value`` attribute are skipped.  A disabled first
        option is reported as the placeholder "(*)" and is never considered
        selected.  Returns ``(values, selected)``; ``selected`` is "" when no
        option is selected.
        """
        values, selected = [], ""
        for index, option in enumerate(options):
            attrs = self.extract_attrs(option)
            value = attrs.get("value")
            if value is None:
                continue
            # Presence is what matters for boolean attributes, not their value.
            if "disabled" in attrs and index == 0:
                value = "(*)"
            if "selected" in attrs and value != "(*)":
                selected = value
            values.append(value)
        return values, selected

    @classmethod
    def extract_attrs(cls, tag):
        """Parse the attributes of ``tag`` into a dict.

        ``tag`` may be a full opening tag, bare attribute text, or a regex
        match holding either.  Attribute names are lower-cased; a bare
        attribute maps to "".  ``class`` and ``id`` become lists of
        whitespace-separated tokens and ``style`` becomes a
        ``{property: value}`` dict.  When a name repeats, the last one wins.
        """
        if hasattr(tag, "group"):
            tag = tag.group()
        tag = _Pattern.remove_tag.sub("", tag)

        attrs = {}
        for found in _Pattern.attrs.finditer(tag):
            candidates = found.group("dq", "sq", "bare")
            value = next((c for c in candidates if c is not None), "")
            attrs[found.group("name").lower()] = value

        for key in ("class", "id"):
            if key in attrs:
                attrs[key] = attrs[key].split()

        style = attrs.get("style")
        if style is not None:
            declarations = {}
            for declaration in style.split(";"):
                # Split on the first colon only so values such as
                # url(http://...) survive; empty pieces (trailing ";") and
                # pieces without a colon are ignored instead of crashing.
                prop, colon, val = declaration.partition(":")
                prop = prop.strip()
                if colon and prop:
                    declarations[prop] = val.strip()
            attrs["style"] = declarations
        return attrs

    def extract_all_forms(self, source):
        """Remember ``source`` and return an iterator of matches, one per form."""
        self.source = source
        return _Pattern.all_forms.finditer(source)

    def extract_header(self, form):
        """Return the form's ``method``, ``action``, ``name`` and ``enctype``.

        ``name`` is None when the form has no name attribute; the other three
        use the defaults applied by ``_extract_full_header``.
        """
        f_header = self._extract_full_header(form)
        return dict(method=f_header.get("method"), action=f_header.get("action"),
                    name=f_header.get("name"), enctype=f_header.get("enctype"))

    def extract_name_value_tags(self, form):
        """Return the form's controls as a list of dicts in document order.

        Each dict has ``name``, ``value`` and ``type``.  Radio buttons and
        checkboxes sharing a name are merged into a single entry whose
        ``value`` is the list of their values, placed where the first of them
        appears.  A select entry also carries ``selected`` when one of its
        options is selected.
        """
        if hasattr(form, "group"):
            form = form.group()
        elements, groups = [], {}
        for tag in _Pattern.all_form_tags.finditer(form):
            d = self.tag_elements(tag)
            name, kind = d.get("name"), d.get("type")
            if kind.lower() in ("radio", "checkbox"):
                group = groups.get(name)
                if group is None:
                    # Appending the shared entry on first sight keeps it at
                    # the position of the group's first control.
                    group = {"name": name, "value": [], "type": kind}
                    groups[name] = group
                    elements.append(group)
                group["value"].append(d.get("value"))
                continue
            n_d = {"name": name, "value": d.get("value"), "type": kind}
            selected = d.get("selected")
            if selected and kind.lower() == "select":
                n_d.setdefault("selected", selected)
            elements.append(n_d)
        return elements

    def tag_elements(self, tags):
        """Describe a single form control as a dict.

        ``tags`` is a match of ``_Pattern.all_form_tags`` or a
        ``(tag_name, attribute_text)`` pair.  The result starts from
        ``type="text"``, ``value=""``, ``name=""`` and is updated with the
        parsed attributes.  For a select, ``value`` is the list of option
        values, ``type`` is "select" and ``selected`` is the selected value.
        """
        source, pos = None, 0
        if hasattr(tags, "groups"):
            # Look for the select's options in the text the tag was found in,
            # starting at the tag itself, so an identical select elsewhere on
            # the page cannot be picked up by mistake.
            source, pos = tags.string, tags.start()
            tags = tags.groups()
        tag, data = tags
        element = dict(type="text", value="", name="")
        element.update(self.extract_attrs(data))
        if tag.lower() == "select":
            value, selected = self._find_options_by_data(tag, data, source, pos)
            element.update({"value": value, "type": "select", "selected": selected})
        return element
# example usage
def wrapper(source):
    """Print every form found in ``source``.

    For each form the header dict is printed first, then one line per form
    element, then an empty line separating it from the next form.
    """
    extractor = _Extractor()
    for match in extractor.extract_all_forms(source):
        # Both extractions run before anything is printed for this form.
        summary = extractor.extract_header(match)
        fields = extractor.extract_name_value_tags(match)
        for row in [summary] + list(fields):
            print(row)
        print()
import requests
if __name__ == "__main__":
    # Script entry point: fetch the URL given as the first command-line
    # argument and print every form found in the response.  Guarded so that
    # importing this module does not trigger a network request.
    if len(sys.argv) < 2:
        sys.exit("usage: {} URL".format(sys.argv[0]))
    # A timeout keeps the script from waiting forever on an unresponsive host.
    source = requests.get(sys.argv[1], timeout=30).text  # open("form.html").read()
    wrapper(source)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment