Last active
August 29, 2015 14:20
-
-
Save magical/2b7bc306369b81f033ed to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import time | |
# Matches one complete line, together with its terminating newline, when the
# line contains an odd number of unescaped double quotes -- i.e. the line
# ends while a string literal is still open.  Backslash escapes are honoured
# everywhere on the line, not only inside strings.
#
# The same "ordinary text" sub-pattern appears three times:
#     [^\n\\"]         any character except newline, backslash, or quote
#     (?:\\{2})*\\"    an odd run of backslashes followed by a quote
#                      (an escaped quote)
#     \\[^\n"]         a backslash followed by anything but newline or quote
#
# Structure:
#     (?: (?: text* (?:\\{2})* " ){2} )*   zero or more balanced quote pairs
#     (text*) " (text*)                    the one unpaired quote, with the
#                                          text before it (group 2) and
#                                          after it (group 3)
#     $\n                                  the raw newline to be replaced
#
# Group 1 is the whole line without its newline, so substituting r'\1\\n'
# swaps the raw newline for the two characters backslash + "n".
quotefix_re = re.compile(
    r'''
    ^(
        (?:
            (?:
                (?:[^\n\\"]|(?:\\{2})*\\"|\\[^\n"])*
                (?:\\{2})*
                "
            ){2}
        )*
        ((?:[^\n\\"]|(?:\\{2})*\\"|\\[^\n"])*)
        "
        ((?:[^\n\\"]|(?:\\{2})*\\"|\\[^\n"])*)
    )$\n
    ''',
    flags=re.MULTILINE | re.VERBOSE
)
def sb_json_regex(raw):
    """Replace raw newlines found by ``quotefix_re`` with backslash + "n".

    Each match of ``quotefix_re`` is a line that ends inside an open
    double-quoted string.  The newline ending that line is replaced by the
    two characters backslash and "n", which joins the line with the next
    one; the joined line is then examined again, because it may still end
    inside a string.

    Args:
        raw: Text to process.

    Returns:
        The text with every newline matched by ``quotefix_re`` replaced.
    """
    # Lines before the current match have already failed to match, and a
    # match never looks past its own line's newline, so they cannot start
    # matching later.  Resuming the search at the start of the line that was
    # just joined therefore gives the same result as rescanning from offset
    # 0, without re-running the regex over the finished prefix every time.
    # (With re.MULTILINE, '^' still matches at ``pos`` because ``pos`` is
    # either 0 or the index right after a newline.)
    pos = 0
    while True:
        match = quotefix_re.search(raw, pos)
        if match is None:
            return raw
        # Group 1 is the line without its newline; match.end() is just past
        # the newline.  Splice the escape sequence in between.
        raw = raw[:match.end(1)] + '\\n' + raw[match.end():]
        pos = match.start()
def sb_json_boring(s):
    """Replace raw newlines inside double-quoted strings with backslash + "n".

    Scans ``s`` once, character by character, tracking whether the current
    position is inside a double-quoted string and whether the previous
    character was a backslash inside such a string.

    Args:
        s: Text to process.

    Returns:
        A copy of ``s`` in which each newline that occurs inside a
        double-quoted string (and is not directly preceded by an escaping
        backslash) is replaced by the two characters backslash and "n".
        Everything else is copied unchanged.
    """
    out = []
    in_string = False
    escaped = False  # only ever set while in_string is True
    for c in s:
        if escaped:
            # The character after a backslash is copied verbatim, whatever
            # it is (including a quote or a newline).
            out.append(c)
            escaped = False
        elif not in_string:
            out.append(c)
            if c == '"':
                in_string = True
        elif c == '\n':
            out.append('\\n')
        else:
            out.append(c)
            if c == '"':
                in_string = False
            elif c == '\\':
                escaped = True
    return ''.join(out)
def timeit(f, name, s):
    """Time one call of ``f(s)`` and print a one-line report.

    Args:
        f: Callable invoked exactly once as ``f(s)``; its result is discarded.
        name: Label printed at the start of the report line.
        s: Argument handed to ``f``.  The report shows the ``repr`` of ``s``
            (cut to its first 20 characters plus ``"..."`` when longer) and
            the full ``len(s)``.
    """
    # perf_counter() is a monotonic high-resolution clock intended for
    # measuring short durations; time.time() is wall-clock time and may be
    # coarse or jump.
    start = time.perf_counter()
    f(s)
    elapsed = time.perf_counter() - start

    preview = s if len(s) <= 20 else s[:20] + "..."
    print("%s: s=%r len(s)=%d: %fs" % (name, preview, len(s), elapsed))
def do(s):
    """Benchmark both implementations on ``s`` and check that they agree.

    Prints one timing line for ``sb_json_regex`` and one for
    ``sb_json_boring`` (via ``timeit``), then runs each once more and
    compares the results.

    Args:
        s: Input text passed to both implementations.

    Raises:
        AssertionError: If the two implementations return different strings.
    """
    timeit(sb_json_regex, "regex", s)
    timeit(sb_json_boring, "boring", s)

    regex_result = sb_json_regex(s)
    boring_result = sb_json_boring(s)
    # Raised explicitly instead of via `assert` so the comparison is not
    # silently skipped when Python runs with -O.
    if regex_result != boring_result:
        raise AssertionError(
            "sb_json_regex and sb_json_boring disagree: %r != %r"
            % (regex_result, boring_result)
        )
# Benchmark inputs, from trivial to large: an empty object, a one-pair
# object, a key made only of backslash escapes (no raw newline), and an
# object whose keys and values contain raw newlines, repeated 100 times.
do("{}")
do("""{"hello": "world"}""")
do(r"""{"\"\\ \n": ""}""")
do("{\n" + """ "this\nis an example": "of a very\nlong json file\nwith embedded newlines",\n""" * 100 + """ "the": "end" } """)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
regex: s='{}' len(s)=2: 0.000076s
boring: s='{}' len(s)=2: 0.000005s
regex: s='{"hello": "world"}' len(s)=18: 0.000034s
boring: s='{"hello": "world"}' len(s)=18: 0.000013s
regex: s='{"\\"\\\\ \\n": ""}' len(s)=15: 0.000026s
boring: s='{"\\"\\\\ \\n": ""}' len(s)=15: 0.000011s
regex: s='{\n "this\nis an exa...' len(s)=7618: 0.855799s
boring: s='{\n "this\nis an exa...' len(s)=7618: 0.003821s
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment