Created
April 19, 2019 20:48
-
-
Save paul-english/85e703d072ddd8fd6300945fb6b1f59a to your computer and use it in GitHub Desktop.
We have some database columns that are a dumping ground for Elixir maps, Elixir keyword lists, Elixir structs, HTML, XML, and JSON. This parses them all.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from lark import Lark | |
# Grammar for the "bag" columns. The start rule accepts four families of
# input: an Elixir term (keyword list, map, struct, tuple, atom, module name,
# nil, string, number), a JSON value, an `<?xml ...?>` document, or a bare
# XML node.
#
# Notes for maintainers:
# * `string_colon_pair` and the `html*` rules are defined but are not
#   reachable from `?start`; HTML input is diverted before the parser is
#   used (see `parse_elixir_bag`).
# * `maybe_placeholders=False` is passed explicitly. Recent Lark releases
#   default this option to True, which makes an unmatched `[...]` group
#   contribute a `None` child to the tree. The handlers below read
#   `.data` / `.children` on every child, so an empty `[]`, `{}` or `%{}`
#   would crash on that `None`. With the option off, an unmatched `[...]`
#   simply contributes no children.
l = Lark('''
?start: elixir_item | json_value | xml | node
///////////////////////
// JSON
?json_value: json_object
| json_array
| ESCAPED_STRING
| SIGNED_NUMBER -> number
| "true" -> true
| "false" -> false
| "null" -> null
json_array : "[" [json_value ("," json_value)*] "]"
json_object : "{" [json_pair ("," json_pair)*] "}"
json_pair : ESCAPED_STRING ":" json_value
///////////////////////
// Elixir
?elixir_item: keyword_list | map | ESCAPED_STRING | module | atom | tuple | nil | struct | SIGNED_NUMBER | unclosed_p
keyword_list: "[" [either_item ("," either_item)*] "]"
?either_item: elixir_item | colon_pair
map: "%" "{" [pair ("," pair)*] "}"
?pair: colon_pair | arrow_pair
colon_pair: CNAME ":" elixir_item
string_colon_pair: ESCAPED_STRING ":" elixir_item
arrow_pair: ESCAPED_STRING "=>" elixir_item
struct: "%" module "{" [pair ("," pair)*] "}"
nil: "nil"
atom: ":" CNAME
tuple: "{" [elixir_item ("," elixir_item)*] "}"
module: CNAME ("." CNAME)*
///////////////////////
// XML
?xml: "<?xml" attr* "?>" node*
?node: closed_node | self_closed_node
closed_node: "<" CNAME ">" xml_body "</" CNAME ">"
self_closed_node: "<" CNAME "/>"
xml_body: node* | any
any: /[^<]+/
attr: CNAME "=" ESCAPED_STRING
///////////////////////
// HTML
// keeping limited xml & html processing means we don't have to
// special case every html/xml before hitting the parser
html: html_node
unclosed_p: "<p>" // couldn't ever get this working, just doing a starts with check to handle xml & html with a good parser
html_node: html_closed_node | self_closed_node | unclosed_p
html_closed_node: "<" CNAME ">" html_body* "</" CNAME ">"
html_body: html_node | any
///////////////////////
%import common.CNAME
%import common.ESCAPED_STRING
%import common.SIGNED_NUMBER
%import common.WS
%ignore WS
''', maybe_placeholders=False)
| from lark.lexer import Token | |
def _unquote_string(text):
    """Decode the text of an ESCAPED_STRING token into a Python ``str``.

    The token text is a double-quoted literal with backslash escapes.
    ``ast.literal_eval`` decodes it exactly as ``eval`` would, but refuses
    anything that is not a plain literal. That matters because the text
    comes straight out of database columns and must never be executed.

    Raises:
        ValueError / SyntaxError: if ``text`` is not a literal.
    """
    import ast  # function-scope import keeps this block self-contained

    return ast.literal_eval(text)


# Maps a terminal (token) type name to the callable that turns the token's
# text into a Python value.
token_handlers = {
    'ESCAPED_STRING': _unquote_string,
    'CNAME': str,
    'SIGNED_NUMBER': float,
}


def token_to_py(token):
    """Convert a single lexer token into its Python value.

    Args:
        token: object exposing ``type`` (terminal name) and ``value``
            (matched text).

    Returns:
        The result of the handler registered for ``token.type`` in
        ``token_handlers``.

    Raises:
        Exception: if no handler is registered for ``token.type``.
    """
    handler = token_handlers.get(token.type)
    if handler is None:
        raise Exception("unhandled token: %s (%s)" % (token.type, token))
    return handler(token.value)
def object_handler(tree):
    """Turn a tree whose children are all key/value pair nodes into a dict.

    Every child must be a node whose ``data`` contains ``'pair'`` and which
    has exactly two children: the key first, the value second. Both are
    converted with ``tree_to_py``. A later duplicate key overwrites an
    earlier one.

    Raises:
        AssertionError: if a child is not a pair node or does not have
            exactly two children.
    """
    result = {}
    for pair_node in tree.children:
        assert 'pair' in pair_node.data, "No pair in object... %s" % (tree)
        parts = pair_node.children
        assert len(parts) == 2, "Problematic obj (len %s): %s" % (len(parts), pair_node.data)
        # Key is converted before the value, as in the grammar order.
        key = tree_to_py(parts[0])
        value = tree_to_py(parts[1])
        result[key] = value
    return result
def kw_list_object_handler(tree):
    """Convert a ``keyword_list`` tree into a dict or a list.

    If every child is a pair node (its ``data`` contains ``'pair'``) the
    tree is handed to ``object_handler`` and becomes a dict. Otherwise it
    is treated as a plain list and handed to ``array_handler``. A tree with
    no children counts as "all pairs" and therefore goes to
    ``object_handler``.
    """
    try:
        only_pairs = all('pair' in child.data for child in tree.children)
    except Exception:
        # Any failure while inspecting a child (for example a child with
        # no ``data`` attribute) means this is not a pure pair list.
        only_pairs = False

    if not only_pairs:
        # it's probably just a regular list
        return array_handler(tree)
    return object_handler(tree)
def number_handler(num):
    """Return the float value of a ``number`` tree.

    The tree must have exactly one child, a token of type
    ``SIGNED_NUMBER``. The token's text is converted with ``float``.

    Raises:
        AssertionError: if the tree does not have that shape.
    """
    children = num.children
    assert len(children) == 1
    token = children[0]
    assert token.type == "SIGNED_NUMBER"
    return float(token.value)
def array_handler(tree):
    """Convert each child of ``tree`` with ``tree_to_py`` and return the list.

    Child order is preserved. A tree with no children gives ``[]``.
    """
    return [tree_to_py(child) for child in tree.children]
def atom_handler(tree):
    """Return the Python value of an ``atom`` tree's single child.

    Raises:
        AssertionError: if the tree does not have exactly one child.
    """
    children = tree.children
    assert len(children) == 1
    name_node = children[0]
    return tree_to_py(name_node)
def module_handler(tree):
    """Join the converted children of a ``module`` tree with dots.

    Each child is converted with ``tree_to_py`` and the results are joined
    with ``"."``, so children ``A`` and ``B`` give ``"A.B"``.
    """
    segments = map(tree_to_py, tree.children)
    return ".".join(segments)
def struct_handler(tree):
    """Convert a ``struct`` tree into a dict tagged with its module name.

    The first child is the module node; its dotted name is stored under the
    ``'struct_name'`` key. Every remaining child must be a pair node with
    exactly two children (key, value), which are converted with
    ``tree_to_py`` and added to the dict. A field whose key is
    ``'struct_name'`` overwrites the tag.

    Raises:
        AssertionError: if a field is not a pair node or does not have
            exactly two children.
    """
    module_node = tree.children[0]
    result = {'struct_name': module_handler(module_node)}

    field_nodes = tree.children[1:]
    for field in field_nodes:
        assert 'pair' in field.data, "No pair in object... %s" % (tree)
        parts = field.children
        assert len(parts) == 2, "Problematic obj (len %s): %s" % (len(parts), field.data)
        key = tree_to_py(parts[0])
        value = tree_to_py(parts[1])
        result[key] = value
    return result
def closed_node_handler(tree):
    """Convert a ``closed_node`` tree into a one-entry dict.

    The first child is converted and used as the key. The children between
    the first and the last are converted and collected, in order, into the
    list stored under that key. The last child is ignored; per the grammar
    rule it is presumably the closing tag name.
    """
    nodes = tree.children
    tag = tree_to_py(nodes[0])
    body = list(map(tree_to_py, nodes[1:-1]))
    return {tag: body}
def any_handler(tree):
    """Return the raw text of an ``any`` tree's single child token.

    Raises:
        AssertionError: if the tree does not have exactly one child.
    """
    children = tree.children
    assert len(children) == 1
    text_token = children[0]
    return text_token.value
# Maps a parse-tree node name (``tree.data``) to the function that converts
# that node into a Python value. ``tree_to_py`` dispatches through this table.
#
# 'nil' is handled like JSON 'null': the grammar defines a ``nil`` rule, and
# without an entry here any such node raised "unhandled tree".
#
# NOTE(review): the grammar also defines ``self_closed_node``,
# ``unclosed_p`` and ``colon_pair`` nodes that have no entry here; a tree
# containing one of them outside the handlers that consume it directly will
# still raise. Confirm the intended output before adding them.
tree_handlers = {
    'json_object': object_handler,
    'number': number_handler,
    'null': lambda t: None,
    'nil': lambda t: None,
    'false': lambda t: False,
    'true': lambda t: True,
    'json_array': array_handler,
    'keyword_list': kw_list_object_handler,
    'tuple': array_handler,
    'atom': atom_handler,
    'map': object_handler,
    'module': module_handler,
    'struct': struct_handler,
    'closed_node': closed_node_handler,
    'xml_body': array_handler,
    'any': any_handler,
}
def tree_to_py(tree):
    """Convert a parse result (token or tree node) into a Python value.

    Tokens are converted by ``token_to_py``. Tree nodes are dispatched on
    ``tree.data`` through the ``tree_handlers`` table.

    Args:
        tree: a ``Token`` or a tree node exposing ``data`` and ``children``.

    Returns:
        Whatever the matching handler returns.

    Raises:
        Exception: if ``tree.data`` has no entry in ``tree_handlers``.
    """
    # isinstance (rather than an exact type comparison) also accepts Token
    # subclasses, which would otherwise fall through to ``tree.data``.
    if isinstance(tree, Token):
        return token_to_py(tree)

    handler = tree_handlers.get(tree.data)
    if handler is None:
        raise Exception("unhandled tree: %s (%s)" % (tree.data, tree.children))
    return handler(tree)
def parse_elixir_bag(v):
    """Parse one raw "bag" string into Python data.

    Dispatch, in order:
      * text starting with ``<?xml`` is parsed by ``xmltodict``;
      * text starting (case-insensitively) with ``<html`` or ``<!doctype``
        is first run through BeautifulSoup and its string form is then
        parsed by ``xmltodict``;
      * anything else is parsed with the Lark grammar ``l`` and converted
        with ``tree_to_py``.

    Args:
        v: the raw column value, as a ``str``.

    Returns:
        The parsed structure: the ``xmltodict`` result for XML/HTML,
        otherwise the value built by ``tree_to_py``.

    Raises:
        Any error raised by the underlying parsers. A failure in
        ``tree_to_py`` prints the offending input first and is then
        re-raised unchanged.
    """
    # The file uses these two packages without importing them; importing
    # here keeps the XML/HTML branches from failing with NameError and
    # leaves the grammar-only path free of the extra dependencies.
    if v.startswith('<?xml'):
        import xmltodict

        return xmltodict.parse(v)

    if v.lower().startswith(('<html', '<!doctype')):
        import xmltodict
        from bs4 import BeautifulSoup

        # NOTE(review): no parser is named, so BeautifulSoup picks one
        # itself; left as-is to avoid changing the produced markup.
        p = BeautifulSoup(v)  # fixes html errors
        return xmltodict.parse(str(p))

    tree = l.parse(v)
    try:
        return tree_to_py(tree)
    except Exception:
        # Show the input that could not be converted, then propagate the
        # original exception with its traceback intact.
        print('---E', v)
        raise
# Example usage:
# NOTE(review): `dataframe` is not defined anywhere in this file. It is
# presumably a pandas DataFrame supplied by the caller, with the raw strings
# in a `raw_column` column - confirm before running this as a module.
dataframe.raw_column.apply(parse_elixir_bag)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment