Last active
April 17, 2022 04:48
-
-
Save blueset/78cc54d6da052c74ff105ff80bde025d to your computer and use it in GitHub Desktop.
A rough JSON parser in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from typing import Any, Tuple, Union | |
"""A very rough JSON parser. | |
Implementing the standard outlined in https://www.json.org/json-en.html | |
Number parsing is handled by Python. | |
Usage: | |
>>> data, _ = parse_json('{"key": ["value", -1e20, true, false, null]}') | |
>>> assert data == {"key": ["value", -1e20, True, False, None]} | |
""" | |
# Leading run of the four JSON whitespace characters: space, tab, LF, CR.
whitespace_re = re.compile("^[ \t\n\r]+")

# Number token: optional minus sign, integer digits, then an optional
# fraction and an optional exponent.
float_re = re.compile(r"-?\d+(?:\.\d+)?(?:[Ee][+-]?\d+)?")

# Two-character escape sequences (backslash + one character) mapped to the
# character each one decodes to.
single_char_escape = {
    "\\\\": "\\",
    "\\/": "/",
    '\\"': '"',
    "\\b": "\b",
    "\\f": "\f",
    "\\n": "\n",
    "\\r": "\r",
    "\\t": "\t",
}

# Longest run of characters that are neither a backslash nor a double quote.
# NOTE: the second alternative (LF, CR, TAB in sequence) can never be reached,
# because the character class before it already matches each of those.
plain_str_content_re = re.compile(r"([^\\\"]|\n\r\t)+")
def leading_whitespaces(data: str) -> int:
    """Return the number of JSON whitespace characters that start *data*.

    Only space, tab, line feed and carriage return are counted; the result
    is 0 when *data* is empty or begins with any other character.
    """
    without_prefix = data.lstrip(" \t\n\r")
    return len(data) - len(without_prefix)
def _read_hex4(data: str, start: int) -> int:
    """Decode the four hexadecimal digits found at ``data[start:start + 4]``.

    Raises:
        ValueError: If fewer than four characters remain or any of them is
            not a hexadecimal digit.
    """
    digits = data[start:start + 4]
    # int(..., 16) alone would also accept signs, underscores and short
    # inputs, so the digits are validated explicitly first.
    if len(digits) != 4 or any(ch not in "0123456789abcdefABCDEF" for ch in digits):
        raise ValueError(
            f"Invalid \\u escape at position {start - 2}: {digits!r}"
        )
    return int(digits, 16)


def parse_string(data: str) -> Tuple[str, int]:
    """Parse the JSON string literal that starts *data*.

    Args:
        data: Text whose first character is the opening double quote.

    Returns:
        A ``(value, consumed)`` tuple: the decoded string and the number of
        characters consumed, including both quotes.

    Raises:
        ValueError: If *data* does not start with a double quote, the
            literal is never closed, or it contains an unknown or malformed
            escape sequence.
    """
    if not data.startswith('"'):
        raise ValueError("String literal must start with a double quote")

    pieces = []  # joined once at the end instead of repeated concatenation
    ptr = 1
    end = len(data)
    while ptr < end:
        char = data[ptr]
        if char == '"':
            # Closing quote: it is consumed as well.
            return "".join(pieces), ptr + 1

        if char != "\\":
            # Plain run; matching at an offset avoids copying the tail of
            # the input on every iteration.
            run = plain_str_content_re.match(data, ptr)
            pieces.append(run[0])
            ptr = run.end()
            continue

        escape = data[ptr:ptr + 2]
        if escape == "\\u":
            code = _read_hex4(data, ptr + 2)
            ptr += 6
            # A high surrogate directly followed by a low surrogate encodes
            # a single code point outside the Basic Multilingual Plane.
            if 0xD800 <= code <= 0xDBFF and data[ptr:ptr + 2] == "\\u":
                low = _read_hex4(data, ptr + 2)
                if 0xDC00 <= low <= 0xDFFF:
                    code = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00)
                    ptr += 6
            pieces.append(chr(code))
        elif escape in single_char_escape:
            pieces.append(single_char_escape[escape])
            ptr += 2
        else:
            # Without this branch an unknown escape would never advance ptr
            # and the loop would spin forever.
            raise ValueError(
                f"Invalid escape sequence at position {ptr}: {escape!r}"
            )

    raise ValueError("Unterminated string literal")
def pares_number(s: str) -> Tuple[Union[float, int], int]:
    """Parse the JSON number that starts *s*.

    The (misspelt) name is kept so existing callers keep working.

    Args:
        s: Text expected to begin with a number token.

    Returns:
        A ``(value, consumed)`` tuple. The value is an ``int`` when the
        token has neither a fraction nor an exponent, otherwise a ``float``.

    Raises:
        ValueError: If *s* does not begin with a number (for example a lone
            ``-`` or a leading ``+``).
    """
    match = float_re.match(s)
    if match is None:
        raise ValueError(f"Invalid number: {s[:20]!r}")
    token = match[0]
    if any(marker in token for marker in ".eE"):
        return float(token), len(token)
    return int(token), len(token)
def parse_object(data: str) -> Tuple[dict, int]:
    """Parse the JSON object that starts *data*.

    Args:
        data: Text whose first character is the opening ``{``.

    Returns:
        A ``(value, consumed)`` tuple: the parsed dict and the number of
        characters consumed, including both braces.

    Raises:
        ValueError: If *data* does not start with ``{``, the object is not
            closed, a key is not a string, a ``:`` or value is missing, or
            members are not separated by single commas (a trailing comma
            before ``}`` is rejected).
    """
    if not data.startswith("{"):
        raise ValueError("Object must start with '{'")

    result = {}
    ptr = 1 + leading_whitespaces(data[1:])
    # Empty object: only valid directly after the opening brace, so that a
    # trailing comma is not mistaken for it later on.
    if data[ptr:ptr + 1] == "}":
        return result, ptr + 1

    while True:
        ptr += leading_whitespaces(data[ptr:])
        # One-character slices yield "" at end of input instead of raising
        # IndexError.
        if data[ptr:ptr + 1] != '"':
            raise ValueError(f"Expected string key at position {ptr}")
        key, proc_len = parse_string(data[ptr:])
        ptr += proc_len

        ptr += leading_whitespaces(data[ptr:])
        if data[ptr:ptr + 1] != ":":
            raise ValueError(f"Expected ':' at position {ptr}")
        ptr += 1

        ptr += leading_whitespaces(data[ptr:])
        if ptr >= len(data):
            raise ValueError("Unexpected end of input: expected a value")
        value, proc_len = parse_json(data[ptr:])
        ptr += proc_len
        result[key] = value

        ptr += leading_whitespaces(data[ptr:])
        delimiter = data[ptr:ptr + 1]
        if delimiter == "}":
            return result, ptr + 1
        if delimiter != ",":
            found = repr(delimiter) if delimiter else "end of input"
            raise ValueError(
                f"Expected ',' or '}}' at position {ptr}, found {found}"
            )
        ptr += 1
def parse_array(data: str) -> Tuple[list, int]:
    """Parse the JSON array that starts *data*.

    Args:
        data: Text whose first character is the opening ``[``.

    Returns:
        A ``(value, consumed)`` tuple: the parsed list and the number of
        characters consumed, including both brackets.

    Raises:
        ValueError: If *data* does not start with ``[``, the array is not
            closed, an element is missing, or elements are not separated by
            single commas (a trailing comma before ``]`` is rejected).
    """
    if not data.startswith("["):
        raise ValueError("Array must start with '['")

    result = []
    ptr = 1 + leading_whitespaces(data[1:])
    # Empty array: only valid directly after the opening bracket, so that a
    # trailing comma is not mistaken for it later on.
    if data[ptr:ptr + 1] == "]":
        return result, ptr + 1

    while True:
        ptr += leading_whitespaces(data[ptr:])
        if ptr >= len(data):
            raise ValueError("Unexpected end of input: expected a value")
        if data[ptr] == "]":
            raise ValueError(f"Expected a value at position {ptr}, found ']'")
        value, proc_len = parse_json(data[ptr:])
        ptr += proc_len
        result.append(value)

        ptr += leading_whitespaces(data[ptr:])
        # A one-character slice yields "" at end of input instead of
        # raising IndexError.
        delimiter = data[ptr:ptr + 1]
        if delimiter == "]":
            return result, ptr + 1
        if delimiter != ",":
            found = repr(delimiter) if delimiter else "end of input"
            raise ValueError(
                f"Expected ',' or ']' at position {ptr}, found {found}"
            )
        ptr += 1
def parse_json(data: str) -> Tuple[Any, int]:
    """Parse the JSON value at the start of *data*.

    Leading whitespace is skipped and counted as consumed. Text after the
    value is left untouched; the caller can inspect it using the returned
    length.

    Args:
        data: JSON text.

    Returns:
        A ``(value, consumed)`` tuple: the parsed value and the number of
        characters consumed. Empty or whitespace-only input yields
        ``(None, len(data))``.

    Raises:
        ValueError: If the first non-whitespace character cannot start a
            JSON value. This includes a leading ``+``, which JSON numbers
            do not allow.
    """
    ptr = leading_whitespaces(data)
    if ptr == len(data):
        # Nothing but whitespace: reported as None rather than an error.
        return None, ptr

    head = data[ptr]
    if head == "{":
        parser = parse_object
    elif head == "[":
        parser = parse_array
    elif head == '"':
        parser = parse_string
    elif head in "-0123456789":
        # "+" is deliberately absent: the number pattern cannot match it.
        parser = pares_number
    else:
        parser = None

    if parser is not None:
        val, proc_chr = parser(data[ptr:])
        return val, ptr + proc_chr

    # Bare literals; startswith with an offset avoids copying the input.
    for literal, value in (("true", True), ("false", False), ("null", None)):
        if data.startswith(literal, ptr):
            return value, ptr + len(literal)

    raise ValueError(f"Unexpected character: {data[ptr:]}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment