JSON Parser in Rust

use std::io;
use std::fmt;
use std::result;
use std::iter::Iterator;

// TODO: take deparse option from command-line arguments
fn main() {
    // Read one line of JSON from stdin, tokenize and parse it, then pretty-print it.
    let mut input = String::new();
    io::stdin().read_line(&mut input)
        .expect("Failed to read json");
    let result =
        tokenize(&input)
        .and_then(|tokens| parse(&tokens[..]))
        .map(|json| deparse(&json, DEFAULT_DEPARSE_OPTION, 0));
    match result {
        Ok(output) => println!("{}", output),
        Err(msg) => println!("error: {}", msg),
    };
}

#[derive(Clone, Debug)]
enum Token {
    Number(f64),
    String(std::string::String),
    True,
    False,
    Null,
    Colon,
    Comma,
    LeftBrace,
    RightBrace,
    LeftBracket,
    RightBracket,
}

// TODO: https://stackoverflow.com/a/50334049/6321251
impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "{}", match self {
            Token::Number(n) => ["number(", n.to_string().as_str(), ")"].concat(),
            Token::String(s) => ["string(", s, ")"].concat(),
            Token::True => "true".to_string(),
            Token::False => "false".to_string(),
            Token::Null => "null".to_string(),
            Token::Colon => "colon".to_string(),
            Token::Comma => "comma".to_string(),
            Token::LeftBrace => "left brace".to_string(),
            Token::RightBrace => "right brace".to_string(),
            Token::LeftBracket => "left bracket".to_string(),
            Token::RightBracket => "right bracket".to_string(),
        })
    }
}

impl Eq for Token {}
impl PartialEq for Token {
    fn eq(&self, other: &Token) -> bool {
        // FIXME: for Token::Number
        self.to_string() == other.to_string()
    }
}

// I think a lexer is not necessary for JSON...
// but this is practice :)
type TokenizeResult<T> = result::Result<T, String>;

fn tokenize(input: &str) -> TokenizeResult<Vec<Token>> {
    let bytes: Vec<char> = input.chars().collect();
    let mut pos = 0;
    let mut tokens = Vec::new();
    let end = bytes.len();
    while pos < end {
        match bytes.get(pos..end) {
            // skip JSON whitespace: space, CR, LF, tab
            Some(&[c, ..]) if c == '\u{0020}' || c == '\u{000D}' || c == '\u{000A}' || c == '\u{0009}' =>
                pos += 1,
            Some(&[':', ..]) => {
                tokens.push(Token::Colon);
                pos += 1
            },
            Some(&[',', ..]) => {
                tokens.push(Token::Comma);
                pos += 1
            },
            Some(&['{', ..]) => {
                tokens.push(Token::LeftBrace);
                pos += 1
            },
            Some(&['}', ..]) => {
                tokens.push(Token::RightBrace);
                pos += 1
            },
            Some(&['[', ..]) => {
                tokens.push(Token::LeftBracket);
                pos += 1
            },
            Some(&[']', ..]) => {
                tokens.push(Token::RightBracket);
                pos += 1
            },
            Some(&[d, ..]) if is_digit(d) || d == '-' =>
                match tokenize_number(&bytes, &mut pos) {
                    Ok(tok) => tokens.push(tok),
                    Err(msg) => return Err(msg),
                },
            Some(&['"', ..]) =>
                match tokenize_string(&bytes, &mut pos) {
                    Ok(tok) => tokens.push(tok),
                    Err(msg) => return Err(msg),
                },
            Some(&['t', 'r', 'u', 'e', ..]) => {
                tokens.push(Token::True);
                pos += 4
            },
            Some(&['f', 'a', 'l', 's', 'e', ..]) => {
                tokens.push(Token::False);
                pos += 5
            },
            Some(&['n', 'u', 'l', 'l', ..]) => {
                tokens.push(Token::Null);
                pos += 4
            },
            // TODO: show also the text at error
            _ =>
                return Err(format!("tokenizer: unknown token at pos {}", pos + 1)),
        }
    }
    Ok(tokens)
}

fn is_digit(c: char) -> bool {
    '0' <= c && c <= '9'
}

fn tokenize_number(bytes: &Vec<char>, pos: &mut usize) -> result::Result<Token, String> {
    let begin = *pos;
    // integer part (optional leading minus)
    if let Some(&['-']) = bytes.get(*pos..*pos+1) {
        *pos += 1
    }
    let digit_begin = *pos;
    while let Some(&[d]) = bytes.get(*pos..*pos+1) {
        if !is_digit(d) {
            break
        }
        *pos += 1
    }
    if *pos - digit_begin > 1 {
        let digits: String = bytes.get(begin..*pos).unwrap_or(&[]).into_iter().collect();
        if let Some(&['0']) = bytes.get(digit_begin..digit_begin+1) {
            return Err(format!("invalid number syntax: a number of 2 or more digits must not start with zero: {}", digits))
        }
    }
    // fraction
    if let Some(&['.']) = bytes.get(*pos..*pos+1) {
        *pos += 1;
        while let Some(&[d]) = bytes.get(*pos..*pos+1) {
            if !is_digit(d) {
                break
            }
            *pos += 1
        }
    }
    // exponent
    if let Some(&['E']) | Some(&['e']) = bytes.get(*pos..*pos+1) {
        *pos += 1;
        if let Some(&['+']) | Some(&['-']) = bytes.get(*pos..*pos+1) {
            *pos += 1;
        }
        while let Some(&[d]) = bytes.get(*pos..*pos+1) {
            if !is_digit(d) {
                break
            }
            *pos += 1
        }
    }
    let digits: String = bytes.get(begin..*pos).unwrap_or(&[]).into_iter().collect();
    if let Ok(n) = digits.parse() {
        Ok(Token::Number(n))
    } else {
        // e.g. a lone "-" or a dangling exponent such as "1e"
        Err(format!("tokenizer: invalid number syntax: {}", digits))
    }
}

fn tokenize_string(bytes: &Vec<char>, pos: &mut usize) -> result::Result<Token, String> {
    let begin = *pos;
    *pos += 1;
    let end = bytes.len();
    let mut closed = false;
    while *pos < end {
        match bytes.get(*pos..end) {
            Some(&['"', ..]) => {
                *pos += 1;
                closed = true;
                break
            },
            Some(&['\\', '"', ..]) => *pos += 2,
            Some(&['\\', '\\', ..]) => *pos += 2,
            Some(&['\\', '/', ..]) => *pos += 2,
            Some(&['\\', 'b', ..]) => *pos += 2,
            Some(&['\\', 'f', ..]) => *pos += 2,
            Some(&['\\', 'n', ..]) => *pos += 2,
            Some(&['\\', 'r', ..]) => *pos += 2,
            Some(&['\\', 't', ..]) => *pos += 2,
            Some(&['\\', 'u', h1, h2, h3, h4, ..]) if is_hex(h1) && is_hex(h2) && is_hex(h3) && is_hex(h4) => *pos += 6,
            Some(&[c, ..]) if '\u{0020}' <= c && c <= '\u{10ffff}' && c != '"' && c != '\\' => *pos += 1,
            _ => return Err(format!("tokenizer: invalid character inside string at pos {}", *pos + 1)),
        };
    }
    if !closed {
        return Err(format!("tokenizer: unexpected EOF inside string starting at pos {}", begin + 1));
    }
    // Collect from the char buffer (not by byte offset into the input) so that
    // multi-byte characters do not shift the slice boundaries. The token keeps
    // the surrounding quotes and raw escape sequences.
    let s: String = bytes[begin..*pos].iter().collect();
    Ok(Token::String(s))
}

fn is_hex(c: char) -> bool {
    ('0' <= c && c <= '9') || ('A' <= c && c <= 'F') || ('a' <= c && c <= 'f')
}
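
// A minimal test sketch (not exhaustive, and not part of the parser itself) of
// what the tokenizer accepts and rejects; run with `cargo test`.
#[cfg(test)]
mod tokenizer_tests {
    use super::*;

    #[test]
    fn tokenizes_punctuation_strings_and_literals() {
        let tokens = tokenize("{ \"a\": [1, true, null] }").unwrap();
        assert_eq!(tokens.len(), 11);
        assert_eq!(tokens[0], Token::LeftBrace);
        // string tokens keep their surrounding quotes
        assert_eq!(tokens[1], Token::String("\"a\"".to_string()));
    }

    #[test]
    fn rejects_leading_zeros_and_unknown_escapes() {
        assert!(tokenize("01").is_err());
        assert!(tokenize("\"\\x\"").is_err());
    }
}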

#[derive(Clone)]
enum JSON {
    Object(Vec<(JSON, JSON)>),
    Array(Vec<JSON>),
    Number(f64),
    String(std::string::String),
    True,
    False,
    Null,
}

impl fmt::Display for JSON {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "{}", deparse(self, DEFAULT_DEPARSE_OPTION, 0))
    }
}

type ParseResult<T> = result::Result<T, String>;

fn parse(tokens: &[Token]) -> ParseResult<JSON> {
    let (result, rest) = do_parse(tokens);
    if let Err(_) = result {
        return result
    }
    if !rest.is_empty() {
        // TODO
        // return Err(format!("parser: unexpected tokens after {} at {}"))
        return Err(format!("parser: unexpected tokens: {:?}", rest))
    }
    result
}

fn do_parse(tokens: &[Token]) -> (ParseResult<JSON>, &[Token]) {
    if tokens.is_empty() {
        return (Err("no json input".to_string()), tokens)
    }
    match &tokens[0] {
        Token::LeftBrace => parse_object(&tokens[1..]),
        Token::LeftBracket => parse_array(&tokens[1..]),
        Token::Number(n) => (Ok(JSON::Number(*n)), &tokens[1..]),
        Token::String(s) => (Ok(JSON::String(s.to_string())), &tokens[1..]),
        Token::True => (Ok(JSON::True), &tokens[1..]),
        Token::False => (Ok(JSON::False), &tokens[1..]),
        Token::Null => (Ok(JSON::Null), &tokens[1..]),
        token => (Err(format!("parser: unexpected token {}", token)), tokens),
    }
}

fn parse_object(tokens: &[Token]) -> (ParseResult<JSON>, &[Token]) {
    let mut pairs: Vec<(JSON, JSON)> = Vec::new();
    match parse_object_rec(tokens, &mut pairs) {
        Ok(rest) => (Ok(JSON::Object(pairs)), rest),
        Err(msg) => (Err(msg), tokens),
    }
}

fn parse_object_rec<'a, 'b>(tokens: &'a [Token], pairs: &'b mut Vec<(JSON, JSON)>) -> ParseResult<&'a [Token]> {
    match tokens {
        [] =>
            return Err("unexpected EOF while parsing object".to_string()),
        [Token::RightBrace, ..] =>
            return Ok(&tokens[1..]),
        [ref key @ Token::String(_), Token::Colon, ..] => {
            let key_result = parse(&[key.clone()]);
            let (value_result, rest) = do_parse(&tokens[2..]);
            match [key_result, value_result] {
                [Ok(k), Ok(v)] => pairs.push((k, v)),
                [Err(msg), _] | [_, Err(msg)] => return Err(msg),
            };
            match rest {
                [Token::RightBrace, ..] =>
                    Ok(&rest[1..]),
                [Token::Comma, tok, ..] =>
                    if *tok == Token::RightBrace {
                        Err("invalid object syntax: got right brace after comma".to_string())
                    } else {
                        parse_object_rec(&rest[1..], pairs)
                    },
                [] =>
                    Err("invalid object syntax: expected right brace or comma but got EOF".to_string()),
                _ =>
                    Err(format!("invalid object syntax: expected right brace or comma but got {}", rest[0])), // TODO: more readable message
            }
        },
        _ =>
            Err("invalid object syntax: a key string and a colon are expected".to_string()) // TODO: more readable message
    }
}

fn parse_array(tokens: &[Token]) -> (ParseResult<JSON>, &[Token]) {
    let mut elems: Vec<JSON> = Vec::new();
    match parse_array_rec(tokens, &mut elems) {
        Ok(rest) => (Ok(JSON::Array(elems)), rest),
        Err(msg) => (Err(msg), tokens),
    }
}

fn parse_array_rec<'a, 'b>(tokens: &'a [Token], elems: &'b mut Vec<JSON>) -> ParseResult<&'a [Token]> {
    match tokens {
        [] =>
            Err("unexpected EOF while parsing array".to_string()),
        [Token::RightBracket, ..] =>
            Ok(&tokens[1..]),
        _ => {
            let (result, rest) = do_parse(tokens);
            match result {
                Ok(v) => elems.push(v),
                Err(msg) => return Err(msg),
            };
            match rest {
                [Token::RightBracket, ..] =>
                    Ok(&rest[1..]),
                [Token::Comma, tok, ..] =>
                    if *tok == Token::RightBracket {
                        Err("invalid array syntax: got right bracket after comma".to_string())
                    } else {
                        parse_array_rec(&rest[1..], elems)
                    },
                [] =>
                    Err("invalid array syntax: expected right bracket or comma but got EOF".to_string()), // TODO: more readable message
                _ =>
                    Err(format!("invalid array syntax: expected right bracket or comma but got {}", rest[0])), // TODO: more readable message
            }
        },
    }
}
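
// A minimal test sketch of the parser: values nest, and trailing commas or
// leftover tokens after the top-level value are rejected.
#[cfg(test)]
mod parser_tests {
    use super::*;

    #[test]
    fn parses_nested_objects_and_arrays() {
        let json = tokenize("{\"k\": [1, false]}").and_then(|t| parse(&t)).unwrap();
        match json {
            JSON::Object(pairs) => {
                assert_eq!(pairs.len(), 1);
                match &pairs[0].1 {
                    JSON::Array(elems) => assert_eq!(elems.len(), 2),
                    _ => panic!("expected an array value"),
                }
            },
            _ => panic!("expected an object"),
        }
    }

    #[test]
    fn rejects_trailing_commas_and_leftover_tokens() {
        assert!(tokenize("[1,]").and_then(|t| parse(&t)).is_err());
        assert!(tokenize("{} {}").and_then(|t| parse(&t)).is_err());
    }
}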

const DEFAULT_DEPARSE_OPTION: &DeparseOption = &DeparseOption {
    left_brace: "{",
    right_brace: "}",
    left_bracket: "[",
    right_bracket: "]",
    indent: " ",
    colon: ": ",
    comma: ",",
    newline: "\n",
};

struct DeparseOption<'a> {
    left_brace: &'a str,
    right_brace: &'a str,
    left_bracket: &'a str,
    right_bracket: &'a str,
    indent: &'a str,
    colon: &'a str,
    comma: &'a str,
    newline: &'a str,
}

fn deparse(json: &JSON, option: &DeparseOption, nest: usize) -> String {
    match json {
        JSON::Object(pairs) =>
            [option.left_brace.to_string(),
             option.newline.to_string(),
             pairs.iter()
                 .map(|(k, v)|
                     [
                         option.indent.to_string().repeat(nest + 1),
                         deparse(k, option, nest + 1),
                         option.colon.to_string(),
                         deparse(v, option, nest + 1)
                     ].concat()
                 )
                 .collect::<Vec<String>>()
                 .join([option.comma, option.newline].concat().as_str()),
             option.newline.to_string(),
             option.indent.to_string().repeat(nest),
             option.right_brace.to_string()].concat(),
        JSON::Array(elems) =>
            [option.left_bracket.to_string(),
             option.newline.to_string(),
             elems.iter()
                 .map(|v|
                     [
                         option.indent.to_string().repeat(nest + 1),
                         deparse(v, option, nest + 1)
                     ].concat()
                 )
                 .collect::<Vec<String>>()
                 .join([option.comma, option.newline].concat().as_str()),
             option.newline.to_string(),
             option.indent.to_string().repeat(nest),
             option.right_bracket.to_string()].concat(),
        JSON::Number(n) =>
            n.to_string(),
        // String tokens keep their surrounding quotes and raw escapes,
        // so they are emitted verbatim.
        JSON::String(s) =>
            s.to_string(),
        JSON::True =>
            "true".to_string(),
        JSON::False =>
            "false".to_string(),
        JSON::Null =>
            "null".to_string(),
    }
}
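
// A minimal deparse sketch: print a parsed value with the default
// pretty-printing options, and with a hypothetical compact option set
// (no indentation or newlines) to show what DeparseOption controls.
#[cfg(test)]
mod deparse_tests {
    use super::*;

    #[test]
    fn pretty_prints_with_the_default_option() {
        let json = tokenize("{\"a\": 1}").and_then(|t| parse(&t)).unwrap();
        let expected = format!("{{\n{}\"a\": 1\n}}", DEFAULT_DEPARSE_OPTION.indent);
        assert_eq!(deparse(&json, DEFAULT_DEPARSE_OPTION, 0), expected);
    }

    #[test]
    fn a_compact_option_drops_whitespace() {
        let compact = DeparseOption {
            left_brace: "{",
            right_brace: "}",
            left_bracket: "[",
            right_bracket: "]",
            indent: "",
            colon: ":",
            comma: ",",
            newline: "",
        };
        let json = tokenize("[1, [2]]").and_then(|t| parse(&t)).unwrap();
        assert_eq!(deparse(&json, &compact, 0), "[1,[2]]");
    }
}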