Created
January 16, 2018 23:45
-
-
Save bootandy/aa12e658a3c7d2e0e5c80e5f05833cd4 to your computer and use it in GitHub Desktop.
rust: better tokenizer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| fn update_state<'a>(s :&'a str, tokens :&mut Vec<String>, match_point : &Captures<'a>) -> &'a str { | |
| let new_s = &s[match_point.get(1).unwrap().end()..]; | |
| tokens.push(match_point.get(1).unwrap().as_str().to_string()); | |
| new_s | |
| } | |
| pub fn tokenizer(s_in: &str) -> Reader { | |
| let mut s = &s_in[0..]; | |
| let brackets = regex!(r###"^[\s,]*([\(\)\{\}\[\]])[\s,]*"###); | |
| let digits = regex!(r"^[\s,]*(-?\d+)"); | |
| let operands = regex!(r"^[\s,]*(\*{1,2}|[\+\-\\])"); //{} is greedy to detect ** instead of: * | |
| let alphas = regex!(r###"^[\s,]*([\w\d:"-]+)"###); | |
| let mut tokens = vec![]; | |
| let all_regexs = vec![&brackets, &digits, &operands, &alphas] ; | |
| let empty = regex!(r"^[\s,]+$"); | |
| while s.len() > 0 && !empty.is_match(s) { | |
| for regex in &all_regexs { | |
| let rb = regex.captures(s); | |
| if !rb.is_none() { | |
| s = update_state(s, &mut tokens, &rb.unwrap()); | |
| break | |
| } | |
| } | |
| } | |
| Reader{tokens: tokens, position: 0} | |
| } |
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
good spot.