use std::fs::File;
use std::io::{BufReader, Read};
use std::iter::Peekable;
use std::str::Chars;
#[derive(Debug, PartialEq, Copy, Clone)]
pub struct Location {
    offset: u64,
    line: u64,
    col: u64,
}
#[derive(PartialEq, Debug)]
pub enum Token {
    LeftParen,
    RightParen,
    Plus,
    Minus,
    Star,
    Slash,
    StringLiteral(String),
    IntegerLiteral(i64),
    EOF,
}
#[derive(Debug, PartialEq)]
pub struct TokenValue {
    start: Location,
    end: Location,
    value: Token,
    text: String,
}

#[derive(Debug)]
pub struct TokenizeErrorDetails {
    msg: String,
    filename: String,
    location: Location,
}

#[derive(Debug)]
pub enum TokenizeError {
    IO(std::io::Error),
    ErrorTokenizing(TokenizeErrorDetails),
}
fn error_tokenizing(msg: &str, filename: &str, location: Location) -> TokenizeError {
    TokenizeError::ErrorTokenizing(TokenizeErrorDetails {
        msg: msg.to_owned(),
        filename: filename.to_owned(),
        location,
    })
}
struct Scanner<'a> {
    tokens: Vec<TokenValue>,
    contents: Peekable<Chars<'a>>,
    filename: String,
    offset: u64,
    line: u64,
    col: u64,
    // Set when the previous character was '\n'; the next advance() bumps `line`.
    newline: bool,
    // Start of the current token
    start: Location,
    // Current token contents
    current_token: String,
    has_error: bool,
    errors: Vec<TokenizeErrorDetails>,
}
impl<'a> Scanner<'a> {
    pub fn new(contents: &'a str, filename: &str) -> Scanner<'a> {
        Scanner {
            tokens: Vec::new(),
            contents: contents.chars().peekable(),
            col: 0,
            line: 0,
            offset: 0,
            filename: filename.to_owned(),
            newline: false,
            start: Location {
                col: 0,
                line: 0,
                offset: 0,
            },
            current_token: String::new(),
            has_error: false,
            errors: Vec::new(),
        }
    }
    pub fn loc(&self) -> Location {
        Location {
            offset: self.offset,
            line: self.line,
            col: self.col,
        }
    }
    // Consume the next character, updating offset/line/col bookkeeping and
    // appending it to the current token text. Position counters only move
    // when a character was actually consumed, so loc() stays accurate at EOF.
    pub fn advance(&mut self) -> Option<char> {
        let next = self.contents.next();
        if let Some(c) = next {
            self.offset += 1;
            if self.newline {
                self.line += 1;
                self.col = 0;
                self.newline = false;
            }
            self.col += 1;
            if c == '\n' {
                self.newline = true;
            }
            self.current_token.push(c);
        }
        next
    }
    pub fn peek(&mut self) -> Option<char> {
        // Copy out of the peeked reference
        self.contents.peek().copied()
    }
    pub fn push_token(&mut self, t: Token) {
        self.tokens.push(TokenValue {
            start: self.start,
            end: self.loc(),
            value: t,
            text: self.current_token.to_owned(),
        });
        self.start = self.loc();
        self.current_token.clear();
    }

    pub fn reset_token(&mut self) {
        self.start = self.loc();
        self.current_token.clear();
    }
    // Consume digits with maximal munch, then parse the accumulated text.
    // The first digit has already been consumed by the caller.
    pub fn scan_number(&mut self) -> Option<Token> {
        while let Some(c) = self.peek() {
            if !c.is_ascii_digit() {
                break;
            }
            self.advance();
        }
        match self.current_token.parse() {
            Ok(n) => Some(Token::IntegerLiteral(n)),
            Err(e) => {
                self.add_error(&format!("{}", e));
                None
            }
        }
    }
    pub fn add_error(&mut self, msg: &str) {
        self.has_error = true;
        self.errors.push(TokenizeErrorDetails {
            msg: msg.to_owned(),
            filename: self.filename.to_owned(),
            location: self.loc(),
        });
    }
}
pub fn scan<R: Read>(
    source: &mut BufReader<R>,
    file: &str,
) -> Result<Vec<TokenValue>, TokenizeError> {
    let mut buffer = String::new();
    if let Err(e) = source.read_to_string(&mut buffer) {
        return Err(TokenizeError::IO(e));
    }
    let mut scanner = Scanner::new(&buffer, file);
    loop {
        let c = match scanner.advance() {
            None => break,
            Some(c) => c,
        };
        use Token::*;
        let token = match c {
            '+' => Some(Plus),
            '-' => Some(Minus),
            '*' => Some(Star),
            '/' => Some(Slash),
            '(' => Some(LeftParen),
            ')' => Some(RightParen),
            // Skip whitespace; advance() already tracks newlines for line/col.
            ' ' | '\t' | '\r' | '\n' => None,
            '0'..='9' => scanner.scan_number(),
            _ => {
                eprintln!("unexpected character '{}' at {:?}", c, scanner.loc());
                scanner.has_error = true;
                None
            }
        };
        match token {
            Some(t) => scanner.push_token(t),
            None => scanner.reset_token(),
        }
    }
    if scanner.has_error {
        return Err(error_tokenizing(
            "unable to tokenize, check logs",
            file,
            scanner.loc(),
        ));
    }
    scanner.push_token(Token::EOF);
    Ok(scanner.tokens)
}
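
// A minimal usage sketch (not part of the original gist): it shows how one
// might feed a real file to `scan` via `File::open` and `BufReader::new`.
// The `scan_file` helper and its error mapping are illustrative assumptions.
#[allow(dead_code)]
fn scan_file(path: &str) -> Result<Vec<TokenValue>, TokenizeError> {
    let file = File::open(path).map_err(TokenizeError::IO)?;
    let mut reader = BufReader::new(file);
    scan(&mut reader, path)
}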
#[test]
fn test_scanner() {
    let mut input = BufReader::new("+ 376 4 5 (+ 2 3) / +****".as_bytes());
    let scanned = scan(&mut input, "filename.txt");
    assert!(scanned.is_ok(), "expected scan to succeed: {:?}", scanned);
    for token in scanned.unwrap() {
        println!("{:?}", token);
    }
}
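
// A small additional check (not in the original gist), sketching the
// maximal-munch behavior of scan_number: consecutive digits become a
// single IntegerLiteral token rather than one token per digit.
#[test]
fn test_scan_number_maximal_munch() {
    let mut input = BufReader::new("376".as_bytes());
    let tokens = scan(&mut input, "test.txt").unwrap();
    assert_eq!(tokens[0].value, Token::IntegerLiteral(376));
    assert_eq!(tokens[0].text, "376");
    // The final token is always EOF.
    assert_eq!(tokens[1].value, Token::EOF);
}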