Skip to content

Instantly share code, notes, and snippets.

@goodmami
Created August 30, 2018 21:58
Show Gist options
  • Save goodmami/686385b4b39a3bac00fbbe78a5cda6c8 to your computer and use it in GitHub Desktop.
Save goodmami/686385b4b39a3bac00fbbe78a5cda6c8 to your computer and use it in GitHub Desktop.
Comparing Lark and Parsimonious on JSON parsing
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# usage: python3 lark-parsimonious.py [TESTNUM]
#
# Where TESTNUM is one of:
#
# 1. Parsimonious with the faster grammar (tree-only)
# 2. Parsimonious with the faster grammar (transform data)
# 3. Parsimonious with the slower grammar (tree-only)
# 4. Parsimonious with the slower grammar (transform data)
# 5. Lark with LALR (tree-only)
# 6. Lark with LALR (tree-less transformation)
# 7. Lark with LALR (tree and transformation)
# 8. json module from the Python standard library
#
# If TESTNUM is not given, all tests are run.
#
# Also, it expects a file "generated.json" to be in the current directory.
# Such a file can be created here:
# https://www.json-generator.com/
#
# Requirements:
# * lark-parser
# * parsimonious
#
# Author: Michael Wayne Goodman
# Note that the original (slower) Parsimonious grammar is from:
# https://gist.github.com/reclosedev/5222560
# And the Lark grammar is from:
# https://github.com/lark-parser/lark/blob/master/docs/json_tutorial.md
# See these URLs for any license restrictions of the respective sources.
import argparse
import ast
import timeit
from parsimonious.grammar import Grammar
from parsimonious.nodes import NodeVisitor
from lark import Lark, Transformer, v_args
# Command-line interface: one optional positional TESTNUM selects a single
# benchmark; when it is omitted the default of 0 runs every benchmark.
argparser = argparse.ArgumentParser(
    description='Compare Lark and Parsimonious on JSON parsing.'
)
argparser.add_argument(
    'testnum',
    type=int,
    nargs='?',
    default=0,
    # Benchmarks are numbered 1-8 and 0 means "all".  Rejecting any other
    # number avoids a run that silently executes no benchmark at all.
    choices=range(9),
    metavar='TESTNUM',
    help='benchmark to run (1-8); 0 or omitted runs all of them',
)
args = argparser.parse_args()
# PEG grammar for the "faster" Parsimonious benchmarks (tests 1 and 2).
# Whitespace is folded into the punctuation terminals (e.g. ~"{\s*",
# ~"\s*,\s*") rather than being matched by separate whitespace rules.
# NOTE(review): in Value, Float is tried before Integer, and the Float regex
# also matches plain digit strings, so the Integer alternative looks
# unreachable -- confirm whether integers are meant to come back as floats.
ParsimoniousJson1 = Grammar(r'''
Start = ~"\s*" ( Object / Array ) ~"\s*"
Object = ~"{\s*" Members? ~"\s*}"
Members = MappingComma* Mapping
MappingComma = Mapping ~"\s*,\s*"
Mapping = DQString ~"\s*:\s*" Value
Array = ~"\[\s*" Items? ~"\s*\]"
Items = ValueComma* Value
ValueComma = Value ~"\s*,\s*"
Value = Object / Array / DQString
/ TrueVal / FalseVal / NullVal / Float / Integer
TrueVal = "true"
FalseVal = "false"
NullVal = "null"
DQString = ~"\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\""
Float = ~"[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?"
Integer = ~"[-+]?\d+"
''')
class ParsimoniousJson1Visitor(NodeVisitor):
    """Convert a ``ParsimoniousJson1`` parse tree into plain Python data.

    Objects become ``dict``, arrays become ``list``, strings become ``str``,
    numbers become ``float``/``int`` and the three literals become
    ``True``/``False``/``None``.
    """

    def generic_visit(self, node, visited_children):
        """Fallback for nodes that have no dedicated ``visit_*`` method.

        Returns the list of visited children when it is non-empty, otherwise
        the node itself.  The other visitors rely on this: a matched
        optional/repeated/grouped expression arrives as a ``list``, an
        unmatched one as a bare node.
        """
        return visited_children or node

    # helper functions for generic patterns
    def combine_many_or_one(self, node, children):
        """Flatten the pattern ``values = value_and_comma* value``.

        ``children`` is ``(leading, last)``: ``leading`` is a list when the
        repetition matched at least once and a bare node otherwise.
        Returns a flat list of all items in source order.
        """
        members, member = children
        if isinstance(members, list):
            return members + [member]
        return [member]

    def lift_first_child(self, node, visited_children):
        """Return the first visited child, e.g. for::

            rule = item optional another_optional?

        this returns ``item``.
        """
        return visited_children[0]

    # visitors
    visit_Value = visit_MappingComma = visit_ValueComma = lift_first_child
    # Items has the same "XComma* X" shape as Members.  Without a visitor of
    # its own it would fall through to generic_visit and arrays would come
    # back nested ([[v1, v2], v3]) instead of flat.
    visit_Members = visit_Items = combine_many_or_one

    def visit_Start(self, node, children):
        """Return the document's top-level object or array."""
        document = children[1]
        # The inline group ``( Object / Array )`` has no visitor of its own,
        # so generic_visit delivers it as a one-element list; unwrap it so
        # the caller receives the value rather than ``[value]``.
        if isinstance(document, list):
            document = document[0]
        return document

    def visit_Object(self, node, children):
        """Build a ``dict`` from the (key, value) pairs of an object."""
        _, members, _ = children
        # ``Members?`` is a one-element list when present, a bare node when
        # the object is empty.
        pairs = members[0] if isinstance(members, list) else []
        return dict(pairs)

    def visit_Array(self, node, children):
        """Return the ``list`` of values of an array."""
        _, values, _ = children
        # ``Items?`` is a one-element list when present, a bare node when
        # the array is empty.
        return values[0] if isinstance(values, list) else []

    def visit_Mapping(self, node, children):
        """Return a ``(key, value)`` tuple; the separator is discarded."""
        key, _, value = children
        return key, value

    def visit_DQString(self, node, visited_children):
        """Decode a double-quoted string into a ``str``."""
        # The quoted text is evaluated as a Python string literal.
        # NOTE(review): Python and JSON escape sequences are not identical --
        # confirm this is acceptable for the benchmark input.
        return ast.literal_eval("u" + node.text)

    def visit_Float(self, node, visited_children):
        """Convert the matched text to ``float``."""
        return float(node.text)

    def visit_Integer(self, node, visited_children):
        """Convert the matched text to ``int``."""
        return int(node.text)

    def visit_TrueVal(self, node, visited_children):
        """Map ``true`` to ``True``."""
        return True

    def visit_FalseVal(self, node, visited_children):
        """Map ``false`` to ``False``."""
        return False

    def visit_NullVal(self, node, visited_children):
        """Map ``null`` to ``None``."""
        return None
# taken from https://gist.github.com/reclosedev/5222560
# Fixed number to allow 1e2 floats
# Changed NodeVisitor to work with Python3
# PEG grammar for the "original" Parsimonious benchmarks (tests 3 and 4).
# Whitespace is matched by a separate optional ``ws`` rule around tokens.
# NOTE(review): ``object`` and ``array`` require ``members`` / ``values``,
# which each need at least one element, so "{}" and "[]" look unparseable
# with this grammar -- confirm against the benchmark input.
ParsimoniousJson2 = Grammar(r'''
json_file = ws? json ws?
json = object / array
object = "{" members "}"
members = member_and_comma* member
member_and_comma = member comma
member = ws? string ws? ":" value
array = "[" values "]"
values = value_and_comma* value
value_and_comma = value comma
value = ws? (true / false / object / array / number / string / null) ws?
true = "true"
false = "false"
null = "null"
number = ~r"-?(0|([1-9][0-9]*))(\.[0-9]+)?([Ee][+-]?[0-9]+)?"
string = ~"\"[^\"\\\\]*(?:\\\\.[^\"\\\\]*)*\""is
ws = ~r"\s+"
comma = ws? "," ws?''')
class ParsimoniousJson2Visitor(NodeVisitor):
    """Walk a ``ParsimoniousJson2`` parse tree and build Python objects."""

    def generic_visit(self, node, visited_children):
        """Pass visited children through; a childless node stands for itself."""
        if visited_children:
            return visited_children
        return node

    # reusable handlers for recurring rule shapes
    def combine_many_or_one(self, node, children):
        """Flatten a rule shaped like ``values = value_and_comma* value``.

        The leading part is a list only when the repetition matched; the
        result is always a flat list ending with the final item.
        """
        leading, final = children
        prefix = leading if isinstance(leading, list) else []
        return prefix + [final]

    def lift_first_child(self, node, visited_children):
        """Keep only the first visited child, e.g. ``item`` for::

            rule = item optional another_optional?
        """
        return visited_children[0]

    # rules that reuse one of the generic handlers
    visit_json = lift_first_child
    visit_member_and_comma = visit_value_and_comma = lift_first_child
    visit_values = visit_members = combine_many_or_one

    # structural rules
    def visit_json_file(self, node, children):
        """Drop the optional whitespace around the document."""
        _leading_ws, document, _trailing_ws = children
        return document

    def visit_object(self, node, children):
        """Turn the (name, value) pairs between the braces into a dict."""
        _open, pairs, _close = children
        return dict(pairs)

    def visit_array(self, node, children):
        """Return the values found between the brackets."""
        _open, items, _close = children
        return items

    def visit_member(self, node, children):
        """Return ``(name, value)``; whitespace and the colon are dropped."""
        _ws_before, name, _ws_after, _colon, value = children
        return name, value

    def visit_value(self, node, children):
        """Unwrap the alternative chosen inside the parenthesised group."""
        _ws_before, chosen, _ws_after = children
        return chosen[0]

    # scalar rules
    def visit_string(self, node, visited_children):
        """Evaluate the quoted text as a (unicode) Python string literal."""
        literal = "u" + node.text
        return ast.literal_eval(literal)

    def visit_number(self, node, visited_children):
        """Evaluate the matched text as a Python numeric literal."""
        return ast.literal_eval(node.text)

    def visit_true(self, node, visited_children):
        """``true`` -> ``True``."""
        return True

    def visit_false(self, node, visited_children):
        """``false`` -> ``False``."""
        return False

    def visit_null(self, node, visited_children):
        """``null`` -> ``None``."""
        return None
# from https://github.com/lark-parser/lark/blob/master/docs/json_tutorial.md
# Grammar text shared by every Lark benchmark (tests 5-7).
# The ``-> number`` / ``-> true`` / ``-> false`` / ``-> null`` aliases and the
# rule names ``array``, ``object``, ``pair`` and ``string`` match the
# attribute names defined on TreeToJson.  String and number terminals are
# imported from ``common`` and whitespace is skipped via ``%ignore WS``.
lark_json_grammar = r"""
?start: value
?value: object
| array
| string
| SIGNED_NUMBER -> number
| "true" -> true
| "false" -> false
| "null" -> null
array : "[" [value ("," value)*] "]"
object : "{" [pair ("," pair)*] "}"
pair : string ":" value
string : ESCAPED_STRING
%import common.ESCAPED_STRING
%import common.SIGNED_NUMBER
%import common.WS
%ignore WS
"""
class TreeToJson(Transformer):
    """Lark transformer mapping the JSON grammar's rules to Python values."""

    # Container rules: the builtin constructors act as the callbacks.
    array = list
    pair = tuple
    object = dict

    # Every number, integral or not, is converted with float().
    number = v_args(inline=True)(float)

    @v_args(inline=True)
    def string(self, s):
        """Strip the surrounding quotes and unescape ``\\"`` only."""
        unquoted = s[1:-1]
        return unquoted.replace('\\"', '"')

    def null(self, _):
        """``null`` -> ``None``."""
        return None

    def true(self, _):
        """``true`` -> ``True``."""
        return True

    def false(self, _):
        """``false`` -> ``False``."""
        return False
# Parser that returns a parse tree (used by tests 5 and 7).
LarkJson = Lark(
    lark_json_grammar,
    parser='lalr',
    lexer='standard',
)
# Parser with the transformer attached at construction time (test 6).
LarkJsonTreeless = Lark(
    lark_json_grammar,
    parser='lalr',
    lexer='standard',
    transformer=TreeToJson(),
)
# Benchmark input, read once and imported by every timeit setup string.
# The file is decoded explicitly as UTF-8 (the encoding JSON text uses)
# instead of the platform default, and the handle is closed as soon as
# the contents have been read.
with open('generated.json', encoding='utf-8') as _input_file:
    s = _input_file.read()
# One row per benchmark: (test number, printed label, timed statement,
# timeit setup code).  Rows are listed in the order they are run.
_BENCHMARKS = (
    (
        1,
        'Parsimonious 1 (faster grammar; tree only)\n ',
        'ParsimoniousJson1.match(s)',
        'from __main__ import ParsimoniousJson1, s',
    ),
    (
        2,
        'parsimonious 1 (faster grammar; transformed data)\n ',
        'v.visit(ParsimoniousJson1.match(s))',
        'from __main__ import ParsimoniousJson1, s, ParsimoniousJson1Visitor; v = ParsimoniousJson1Visitor()',
    ),
    (
        3,
        'parsimonious 2 (original grammar; tree only)\n ',
        'ParsimoniousJson2.match(s)',
        'from __main__ import ParsimoniousJson2, s',
    ),
    (
        4,
        'parsimonious 2 (original grammar; transformed data)\n ',
        'v.visit(ParsimoniousJson2.match(s))',
        'from __main__ import ParsimoniousJson2, s, ParsimoniousJson2Visitor; v = ParsimoniousJson2Visitor()',
    ),
    (
        5,
        'lark (lalr; tree only)\n ',
        'LarkJson.parse(s)',
        'from __main__ import LarkJson, s',
    ),
    (
        6,
        'lark (lalr; tree-less transformation)\n ',
        'LarkJsonTreeless.parse(s)',
        'from __main__ import LarkJsonTreeless, s',
    ),
    (
        7,
        'lark (lalr; tree and transformation)\n ',
        't.transform(LarkJson.parse(s))',
        'from __main__ import LarkJson, TreeToJson, s; t = TreeToJson()',
    ),
    (
        8,
        'json (Python standard library)\n ',
        'json.loads(s)',
        'from __main__ import s; import json',
    ),
)

for _number, _label, _statement, _setup in _BENCHMARKS:
    # A test number of 0 selects every benchmark.
    if args.testnum not in (0, _number):
        continue
    # Each benchmark is timed over a single execution.
    _elapsed = timeit.timeit(_statement, setup=_setup, number=1)
    print(_label, _elapsed)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment