Skip to content

Instantly share code, notes, and snippets.

@paul-english
Created April 19, 2019 20:48
Show Gist options
  • Save paul-english/85e703d072ddd8fd6300945fb6b1f59a to your computer and use it in GitHub Desktop.
Save paul-english/85e703d072ddd8fd6300945fb6b1f59a to your computer and use it in GitHub Desktop.
We have some database columns that are a dumping ground for Elixir maps, Elixir keyword lists, Elixir structs, HTML, XML, and JSON. This parses them all.
import ast

from lark import Lark
# Lark parser for the mixed "bag" column. The start rule accepts an Elixir
# term, a JSON value, an XML document, or a single XML-style node.
# No parser options are passed, so Lark's defaults apply.
# NOTE(review): string_colon_pair, html, html_node, html_closed_node and
# html_body are defined in the grammar but are not reachable from the start
# rule, so they never appear in a parse tree.
# The resulting parser is bound to `l`, which parse_elixir_bag calls.
l = Lark('''
?start: elixir_item | json_value | xml | node
///////////////////////
// JSON
?json_value: json_object
| json_array
| ESCAPED_STRING
| SIGNED_NUMBER -> number
| "true" -> true
| "false" -> false
| "null" -> null
json_array : "[" [json_value ("," json_value)*] "]"
json_object : "{" [json_pair ("," json_pair)*] "}"
json_pair : ESCAPED_STRING ":" json_value
///////////////////////
// Elixir
?elixir_item: keyword_list | map | ESCAPED_STRING | module | atom | tuple | nil | struct | SIGNED_NUMBER | unclosed_p
keyword_list: "[" [either_item ("," either_item)*] "]"
?either_item: elixir_item | colon_pair
map: "%" "{" [pair ("," pair)*] "}"
?pair: colon_pair | arrow_pair
colon_pair: CNAME ":" elixir_item
string_colon_pair: ESCAPED_STRING ":" elixir_item
arrow_pair: ESCAPED_STRING "=>" elixir_item
struct: "%" module "{" [pair ("," pair)*] "}"
nil: "nil"
atom: ":" CNAME
tuple: "{" [elixir_item ("," elixir_item)*] "}"
module: CNAME ("." CNAME)*
///////////////////////
// XML
?xml: "<?xml" attr* "?>" node*
?node: closed_node | self_closed_node
closed_node: "<" CNAME ">" xml_body "</" CNAME ">"
self_closed_node: "<" CNAME "/>"
xml_body: node* | any
any: /[^<]+/
attr: CNAME "=" ESCAPED_STRING
///////////////////////
// HTML
// keeping limited xml & html processing means we don't have to
// special case every html/xml before hitting the parser
html: html_node
unclosed_p: "<p>" // couldn't ever get this working, just doing a starts with check to handle xml & html with a good parser
html_node: html_closed_node | self_closed_node | unclosed_p
html_closed_node: "<" CNAME ">" html_body* "</" CNAME ">"
html_body: html_node | any
///////////////////////
%import common.CNAME
%import common.ESCAPED_STRING
%import common.SIGNED_NUMBER
%import common.WS
%ignore WS
''')
from lark.lexer import Token
# Converters keyed by Lark terminal type; each takes the token's text and
# returns the Python value it stands for.
token_handlers = {
    # SECURITY: this entry used to be the builtin ``eval``, which executes
    # whatever expression it is handed. The text comes from database columns,
    # so it is decoded with ``ast.literal_eval`` instead: it unescapes the
    # same quoted string literals but rejects anything that is not a literal.
    'ESCAPED_STRING': ast.literal_eval,
    'CNAME': str,
    'SIGNED_NUMBER': float,
}
def token_to_py(token):
    """Convert a single lexer token into a plain Python value.

    The converter is looked up in ``token_handlers`` by ``token.type`` and
    applied to ``token.value``.

    Raises:
        Exception: if no converter is registered for ``token.type``.
    """
    convert = token_handlers.get(token.type)
    if convert is None:
        raise Exception("unhandled token: %s (%s)" % (token.type, token))
    return convert(token.value)
def object_handler(tree):
    """Build a dict from a tree whose children are key/value pair subtrees.

    Registered for ``json_object`` and ``map`` trees, whose children are
    ``json_pair`` / ``colon_pair`` / ``arrow_pair`` subtrees with exactly two
    children each (key, value).

    Returns:
        dict mapping each converted key to its converted value; later
        duplicates of a key overwrite earlier ones.

    Raises:
        AssertionError: if a child is not a pair subtree or does not have
            exactly two children.
    """
    obj = {}
    for c in tree.children:
        if c is None:
            # Lark inserts a None placeholder for an unmatched ``[...]``
            # group when ``maybe_placeholders`` is enabled (the default in
            # Lark 1.x), so an empty ``{}`` / ``%{}`` arrives as [None].
            continue
        assert 'pair' in c.data, "No pair in object... %s" % (tree)
        assert len(c.children) == 2, "Problematic obj (len %s): %s" % (len(c.children), c.data)
        k = tree_to_py(c.children[0])
        v = tree_to_py(c.children[1])
        obj[k] = v
    return obj
def kw_list_object_handler(tree):
    """Convert a ``keyword_list`` tree to a dict or a list.

    If every child is a pair subtree the list is treated as an Elixir keyword
    list and converted with ``object_handler``; otherwise it is converted as
    a plain list with ``array_handler``.

    An empty list yields ``{}``, which is what the all-pairs check produces
    when there are no children at all.
    """
    # Lark inserts a None placeholder for an unmatched ``[...]`` group when
    # ``maybe_placeholders`` is enabled (the default in Lark 1.x), so ``[]``
    # can arrive as children == [None]. Treat that like "no children".
    children = [c for c in tree.children if c is not None]
    if not children:
        return {}

    try:
        all_pairs = all('pair' in c.data for c in children)
    except (AttributeError, TypeError):
        # A bare token child (string, number) has no ``data`` attribute, so
        # this cannot be a keyword list.
        all_pairs = False

    if all_pairs:
        return object_handler(tree)
    # it's probably just a regular list
    return array_handler(tree)
def number_handler(num):
    """Convert a ``number`` tree (one SIGNED_NUMBER token) to a float.

    Raises:
        AssertionError: if the tree does not hold exactly one child or that
            child is not a SIGNED_NUMBER token.
    """
    kids = num.children
    assert len(kids) == 1
    literal = kids[0]
    assert literal.type == "SIGNED_NUMBER"
    return float(literal.value)
def array_handler(tree):
    """Convert every child of ``tree`` and return the results as a list.

    Registered for ``json_array``, ``tuple`` and ``xml_body`` trees.
    """
    # Lark inserts a None placeholder for an unmatched ``[...]`` group when
    # ``maybe_placeholders`` is enabled (the default in Lark 1.x); skip it so
    # an empty ``[]`` / ``{}`` converts to an empty list instead of crashing.
    return [tree_to_py(c) for c in tree.children if c is not None]
def atom_handler(tree):
    """Convert an ``atom`` tree by converting its single child.

    Raises:
        AssertionError: if the tree does not hold exactly one child.
    """
    kids = tree.children
    assert len(kids) == 1
    return tree_to_py(kids[0])
def module_handler(tree):
    """Join the converted children of a ``module`` tree with dots.

    Returns:
        The dotted name as one string (empty string if there are no children).
    """
    segments = []
    for part in tree.children:
        segments.append(tree_to_py(part))
    return ".".join(segments)
def struct_handler(tree):
    """Convert a ``struct`` tree into a dict.

    The first child is the ``module`` tree naming the struct; it is stored
    under the ``'struct_name'`` key. The remaining children are pair subtrees
    converted to key/value entries. A field whose key is ``'struct_name'``
    overwrites the module name.

    Raises:
        AssertionError: if a field child is not a pair subtree or does not
            have exactly two children.
    """
    obj = {
        'struct_name': module_handler(tree.children[0])
    }
    for c in tree.children[1:]:
        if c is None:
            # Lark inserts a None placeholder for an unmatched ``[...]``
            # group when ``maybe_placeholders`` is enabled (the default in
            # Lark 1.x), so a struct with no fields carries a trailing None.
            continue
        assert 'pair' in c.data, "No pair in object... %s" % (tree)
        assert len(c.children) == 2, "Problematic obj (len %s): %s" % (len(c.children), c.data)
        k = tree_to_py(c.children[0])
        v = tree_to_py(c.children[1])
        obj[k] = v
    return obj
def closed_node_handler(tree):
    """Convert a ``closed_node`` tree into a one-entry dict.

    The first child (the opening tag name) becomes the key. Every child
    between the first and the last is converted and collected into the list
    stored as the value. The last child (the closing tag name) is dropped
    and is not compared with the opening one.
    """
    tag = tree_to_py(tree.children[0])
    body = []
    for child in tree.children[1:-1]:
        body.append(tree_to_py(child))
    return {tag: body}
def any_handler(tree):
    """Return the raw text of an ``any`` tree's single token.

    Raises:
        AssertionError: if the tree does not hold exactly one child.
    """
    kids = tree.children
    assert len(kids) == 1
    text_token = kids[0]
    return text_token.value
# Converters keyed by parse-tree rule name (``tree.data``); each takes the
# subtree and returns the Python value it represents.
tree_handlers = {
    'json_object': object_handler,
    'number': number_handler,
    'null': lambda t: None,
    'false': lambda t: False,
    'true': lambda t: True,
    'json_array': array_handler,
    'keyword_list': kw_list_object_handler,
    'tuple': array_handler,
    'atom': atom_handler,
    'map': object_handler,
    'module': module_handler,
    # The grammar defines a ``nil`` rule; without an entry here a nil tree
    # raised "unhandled tree". Map it to None, the same as JSON ``null``.
    'nil': lambda t: None,
    'struct': struct_handler,
    'closed_node': closed_node_handler,
    'xml_body': array_handler,
    'any': any_handler,
}
def tree_to_py(tree):
    """Convert a parse-tree node (token or subtree) into a Python value.

    Tokens are delegated to ``token_to_py``. Subtrees are dispatched on
    ``tree.data`` through ``tree_handlers``.

    Raises:
        Exception: if no handler is registered for ``tree.data``.
    """
    # isinstance rather than an exact type comparison, so a Token subclass is
    # still treated as a token instead of falling through to the tree branch.
    if isinstance(tree, Token):
        return token_to_py(tree)
    if tree.data not in tree_handlers:
        raise Exception("unhandled tree: %s (%s)" % (tree.data, tree.children))
    return tree_handlers[tree.data](tree)
def parse_elixir_bag(v):
    """Parse one raw column value into Python data.

    Values starting with ``<?xml`` go straight to ``xmltodict.parse``. Values
    starting (case-insensitively) with ``<html`` or ``<!doctype`` are first
    run through ``BeautifulSoup`` and then through ``xmltodict.parse``.
    Everything else is parsed with the Lark grammar and converted with
    ``tree_to_py``.

    NOTE(review): ``xmltodict`` and ``BeautifulSoup`` are not imported in the
    visible part of this file -- presumably they are provided elsewhere;
    confirm before relying on the XML/HTML branches.

    Raises:
        Exception: whatever ``tree_to_py`` raises; the offending value is
            printed before the exception is re-raised.
    """
    if v.startswith('<?xml'):
        return xmltodict.parse(v)

    if v.lower().startswith(('<html', '<!doctype')):
        soup = BeautifulSoup(v)  # fixes html errors
        return xmltodict.parse(str(soup))

    tree = l.parse(v)
    try:
        result = tree_to_py(tree)
    except Exception as exc:
        print('---E', v)
        raise exc
    return result
# Example usage: run the parser over every value of a column.
# NOTE(review): `dataframe` is not defined in this file -- presumably a pandas
# DataFrame with a `raw_column` column of strings; confirm against the caller.
dataframe.raw_column.apply(parse_elixir_bag)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment