Skip to content

Instantly share code, notes, and snippets.

@jvanasco
Created October 18, 2018 23:07
Show Gist options
  • Save jvanasco/be97e65dd83d345b77af822d043d08d0 to your computer and use it in GitHub Desktop.
Save jvanasco/be97e65dd83d345b77af822d043d08d0 to your computer and use it in GitHub Desktop.
Well-formed parser - a utility for parsing HTML tags as fast as possible.
def extract_tag_inner(tag):
    """
    Extract the attribute text of a tag, dropping the brackets, the tag name
    and any trailing slash.

    :arg string tag: a html tag of the formats:
        <TAG_NAME TAG_ATTRIBUTES>
        <TAG_NAME TAG_ATTRIBUTES/>
        <TAG_NAME TAG_ATTRIBUTES />
    :returns string: the attribute text of the tag; an empty string when the
        tag carries no attributes (e.g. "<img/>")
    :raises ValueError: if ``tag`` is not wrapped in ``<`` ... ``>`` or holds
        nothing between the brackets
    """
    # startswith/endswith are safe on an empty string, unlike tag[0]/tag[-1]
    if not (tag.startswith('<') and tag.endswith('>')):
        raise ValueError("invalid tag")
    # remove the tag brackets
    inner = tag[1:-1].strip()
    if inner.endswith('/'):
        # drop ONLY the trailing slash of a self-closing tag; the leading
        # character belongs to the tag name and must stay in place so the
        # name/attribute split below happens at the right space
        inner = inner[:-1].strip()
    if not inner:
        raise ValueError("invalid tag")
    # remove the tag name: everything up to the first space.
    # partition yields '' for the tail when the tag has no attributes.
    return inner.partition(' ')[2].strip()
def tag_inner_parser(txt):
    """
    Parse the attribute text of a tag into a dict.

    This parser expects/requires WELL FORMED html attributes: attributes are
    separated by spaces, and every value is wrapped in single or double
    quotes directly after the ``=``.

    The text should be cleaned beforehand to not contain trailing whitespace
    or slash. See ``extract_tag_inner`` for a function to extract the inner
    from a full tag.

    Inside a value, a quote character preceded by a backslash is treated as
    part of the value. For single-quoted values the ``\\'`` sequences are
    then unescaped to ``'``; for double-quoted values the backslash is left
    in place.

    :arg string txt: Text to be parsed. This should be cleaned
    :returns dict: a dict of values for the attributes.
        If an attribute does not have a value (i.e. it is defined by its mere
        presence) the value will be `True`; otherwise, values will always be
        a string (possibly empty, e.g. for ``alt=""``).
    :raises ValueError: if the character following ``=`` is not a quote
    """
    attrs = {}  # parsed attributes
    in_key = False  # currently reading a KEY
    in_value = False  # currently reading a VALUE
    key = None  # active key being parsed
    value = None  # active value being parsed; None until the opening quote
    quote = None  # quote character that opened the active value
    quote_chars = ('"', "'")  # kept as a local tuple; the author benchmarked this as faster
    for char in txt:
        if in_key:
            if char == ' ':
                # SPACE ends an attribute that has no value: encode as True.
                # `key` is overwritten when the next key starts.
                attrs[key] = True
                in_key = False
            elif char == '=':
                # `=` ends the KEY; switch to reading its VALUE
                in_key = False
                in_value = True
                value = None
                quote = None
            else:
                # assume this char is valid and keep building the KEY
                key += char
        elif in_value:
            if value is None:
                # first char after `=`: values must be single or double quoted
                if char not in quote_chars:
                    raise ValueError("invalid quote")
                # remember the quote so the matching closing one can be found
                quote = char
                value = ''
            elif char != quote or value.endswith('\\'):
                # normal character, or an escaped quote: keep building the VALUE.
                # endswith() is safe on an empty value, so `attr=""` closes
                # cleanly instead of indexing into an empty string.
                value += char
            else:
                # unescaped closing quote: the value is complete
                if quote == "'":
                    # unescape quoted chars
                    value = value.replace("\\'", "'")
                attrs[key] = value
                # RESET
                key = None
                value = None
                in_value = False
                quote = None
        elif char != ' ':
            # between attributes: the first non-space char starts a new KEY
            in_key = True
            key = char
    if key:
        # the text ended while a key was still active, i.e. we ended on an
        # attribute that does not have a (completed) value
        attrs[key] = True
    return attrs
def tag_inner_parser_unknown(txt, cleaned=True):
    """
    convenience function
    if you have a tag's internals that were not generated via ``extract_tag_inner``, this will try to clean the internals before processing

    :param string txt: the attribute text of a tag
    :param boolean cleaned: Default ``True``. If invoked with ``False``, will perform the following cleaning:
        surrounding whitespace is stripped, then a single trailing ``/`` (and
        any whitespace before it) is removed. Text that is empty after
        cleaning yields an empty dict.
    :returns dict: a dict of values via ``tag_inner_parser``
    """
    if not cleaned:
        # strip BEFORE looking for the slash, so "a='1' / " is cleaned as well
        txt = txt.strip() if txt else ''
        if txt.endswith('/'):
            txt = txt[:-1].rstrip()
        if not txt:
            # nothing left to parse
            return {}
    return tag_inner_parser(txt)
def tag_parser(tag):
    """
    Convenience wrapper: parse the attributes of a complete html tag.

    The tag is first reduced to its attribute text by ``extract_tag_inner``
    and that text is then handed to ``tag_inner_parser``.

    :param string tag: a html tag
    :returns dict: a dict of values via ``tag_inner_parser``
    """
    return tag_inner_parser(extract_tag_inner(tag))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment