Created
October 18, 2018 23:07
-
-
Save jvanasco/be97e65dd83d345b77af822d043d08d0 to your computer and use it in GitHub Desktop.
Well-formed parser: a utility for parsing the attributes of HTML tags as fast as possible.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def extract_tag_inner(tag):
    """
    Extract the attribute portion of a tag, dropping the angle brackets,
    the tag name and any trailing slash.

    :arg string tag: a html tag of the formats:
        <TAG_NAME TAG_ATTRIBUTES>
        <TAG_NAME TAG_ATTRIBUTES/>
        <TAG_NAME TAG_ATTRIBUTES />
    :returns string: the inner part of the tag. This is an empty string when
        the tag has no attributes (e.g. "<img/>" or "<br>").
    :raises ValueError: if ``tag`` is not wrapped in ``<`` and ``>``, or if
        nothing (not even a tag name) is left between the brackets.
    """
    # remove the tag brackets; startswith/endswith are safe on an empty string
    if not (tag.startswith('<') and tag.endswith('>')):
        raise ValueError("invalid tag")
    inner = tag[1:-1].strip()

    # remove a self-closing slash. only the LAST character is dropped here;
    # the first character belongs to the tag name and must be kept so the
    # name can be split off below
    if inner.endswith('/'):
        inner = inner[:-1].strip()

    # remove the tag name, which is separated from the attributes by whitespace
    parts = inner.split(None, 1)
    if not parts:
        # nothing between the brackets, e.g. "<>" or "</>"
        raise ValueError("invalid tag")
    if len(parts) == 1:
        # a tag without attributes, e.g. "<img/>": there is no inner part
        return ''
    return parts[1]
def tag_inner_parser(txt):
    """
    Parse the attribute text of a tag into a dict, one character at a time.

    This parser expects/requires WELL FORMED html attributes: every value
    must be wrapped in single or double quotes and attributes are separated
    by spaces.

    The text should be cleaned beforehand to not contain trailing whitespace
    or a trailing slash. See ``extract_tag_inner`` for a function to extract
    the inner from a full tag.

    :arg string txt: Text to be parsed. This should be cleaned.
    :returns dict: a dict of values for the attributes.
        If an attribute does not have a value (i.e. it is defined by its
        mere presence) the value will be `True`; otherwise, values will
        always be a string (possibly empty, e.g. for ``alt=""``).
        A backslash-escaped quote inside a value does not end the value.
        In single-quoted values ``\\'`` is unescaped to ``'``; in
        double-quoted values the backslash is kept as-is.
    :raises ValueError: if the character after ``=`` is not a quote.
    """
    kw = {}  # parsed attributes
    in_key = False  # currently consuming a KEY
    in_value = False  # currently consuming a VALUE
    key = None  # active key being parsed
    value = None  # active value being parsed; None until the opening quote
    quote = None  # quote character that opened the active value
    quotes = ('"', "'")

    for c in txt:
        if in_key:
            if c == ' ':
                # SPACE ends a key that has no value; encode this as True.
                # `key` is left as-is, the next key start will overwrite it
                kw[key] = True
                in_key = False
            elif c == '=':
                # `=` ends the KEY and switches to searching for a VALUE
                in_key = False
                in_value = True
                value = None
                quote = None
            else:
                # assume this char is valid and keep building the KEY
                key += c
        elif in_value:
            if value is None:
                # first char after `=`: values must be quoted
                if c not in quotes:
                    raise ValueError("invalid quote")
                # remember the quote; it is what terminates the value
                quote = c
                value = ''
            elif c != quote:
                # normal character, keep building the VALUE
                value += c
            elif value.endswith('\\'):
                # escaped quote, treat it as a normal character.
                # endswith() is used instead of value[-1] so that an empty
                # value (`alt=""`) does not raise IndexError
                value += c
            else:
                # unescaped closing quote: the value is complete
                if quote == "'":
                    value = value.replace("\\'", "'")
                kw[key] = value
                # RESET
                key = None
                value = None
                in_value = False
                quote = None
        else:
            # between attributes: skip spaces, anything else starts a KEY
            if c == ' ':
                continue
            in_key = True
            key = c

    if key:
        # we ended without completing a value, so the last attribute
        # is treated as one that does not have a value
        kw[key] = True
    return kw
def tag_inner_parser_unknown(txt, cleaned=True):
    """
    Convenience wrapper around ``tag_inner_parser``.

    Use this when a tag's internals were not generated via
    ``extract_tag_inner`` and may still need cleaning before parsing.

    :param string txt: the attribute text of a tag
    :param boolean cleaned: Default ``True``, in which case ``txt`` is parsed
        as-is. If invoked with ``False``, a single trailing ``/`` is removed
        and the surrounding whitespace is stripped before parsing; empty
        input yields an empty dict.
    :returns dict: a dict of values via ``tag_inner_parser``
    """
    if cleaned:
        return tag_inner_parser(txt)
    if not txt:
        # nothing to clean and nothing to parse
        return {}
    if txt[-1] == '/':
        # drop the self-closing slash
        txt = txt[:-1]
    return tag_inner_parser(txt.strip())
def tag_parser(tag):
    """
    Convenience wrapper: parse the attributes of a complete tag.

    The tag is first reduced to its inner text with ``extract_tag_inner``
    and the result is handed to ``tag_inner_parser``.

    :param string tag: a html tag
    :returns dict: a dict of values via ``tag_inner_parser``
    """
    return tag_inner_parser(extract_tag_inner(tag))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment