Created
October 19, 2015 04:11
-
-
Save Aatch/8210c43dddde8775ff73 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
extern crate smallvec; | |
use std::collections::VecDeque; | |
use std::io::Read; | |
use smallvec::SmallVec; | |
pub type LexResult<T> = std::result::Result<T, Error>; | |
/** | |
* Provides functionality for implementing a lexer. | |
*/ | |
#[derive(Clone)] | |
pub struct Lexer<R: Read> { | |
input: R, | |
lookahead: SmallVec<[u8; 8]>, | |
is_eof: bool, | |
line: u32, | |
column: u32, | |
} | |
impl<R: Read> Lexer<R> { | |
pub fn new(input: R) -> Lexer<R> { | |
Lexer { | |
input: input, | |
lookahead: SmallVec::new(), | |
is_eof: false, | |
line: 1, | |
column: 0 | |
} | |
} | |
pub fn into_input(self) -> R { | |
self.input | |
} | |
/// Read a single byte from the input | |
pub fn read_byte(&mut self) -> LexResult<u8> { | |
if self.lookahead.len() > 0 { | |
let b = self.lookahead[0]; | |
self.bump_bytes(1); | |
return Ok(b); | |
} else if self.is_eof { | |
return Err(Error::Eof); | |
} else { | |
let mut buf = [0u8]; | |
match self.input.read(&mut buf) { | |
Ok(0) => { | |
self.is_eof = true; | |
return Err(Error::Eof); | |
} | |
Err(e) => return Err(Error::Io(e)), | |
_ => () | |
} | |
if buf[0] == b'\n' { | |
self.line += 1; | |
self.column = 0; | |
} else { | |
self.column += 1; | |
} | |
return Ok(buf[0]); | |
} | |
} | |
/// Look at the next `n` bytes from the input. If there are not enough bytes, the returned | |
/// slice may be smaller than `n`. | |
pub fn peek_bytes(&mut self, n: usize) -> LexResult<&[u8]> { | |
if n <= self.lookahead.len() { | |
return Ok(&self.lookahead[0..n]); | |
} else { | |
let mut cur_len = self.lookahead.len(); | |
let to_read = n - cur_len; | |
// Fill the extra space we need with 0 | |
for _ in 0..to_read { | |
self.lookahead.push(0); | |
} | |
// Read the appropriate number of bytes from the input | |
while cur_len < self.lookahead.len() { | |
// Make the buffer | |
let buf = &mut self.lookahead[cur_len..n]; | |
match self.input.read(buf) { | |
// Read zero bytes, thats an eof, but don't treat it as an error, just stop reading | |
Ok(0) => { | |
self.is_eof = true; | |
break; | |
} | |
// Read some bytes, bump the current length of the lookahead buffer | |
Ok(n) => { | |
cur_len += n; | |
} | |
Err(e) => return Err(Error::Io(e)) | |
} | |
}; | |
// Return a slice into the lookahead buffer | |
Ok(&self.lookahead[..cur_len]) | |
} | |
} | |
/// Try to read a character from the input. | |
/// Returns None if a valid character cannot be read and input is not consumed. | |
pub fn read_char(&mut self) -> LexResult<Option<char>> { | |
let c = try!(self.peek_char()); | |
if let Some(c) = c { | |
self.bump_bytes(c.len_utf8()); | |
} | |
Ok(c) | |
} | |
/// Try to look a character from the input. | |
/// Returns None if a valid character cannot be read. | |
pub fn peek_char(&mut self) -> LexResult<Option<char>> { | |
let c = { | |
let bytes = try!(self.peek_bytes(6)); | |
let s = std::str::from_utf8(bytes); | |
match s { | |
Ok(s) if s.len() > 0 => { | |
s.chars().nth(0).unwrap() | |
} | |
_ => { | |
return Ok(None); | |
} | |
} | |
}; | |
Ok(Some(c)) | |
} | |
/// Matches the input against the given bytes, if they compare equal, the input is consumed and | |
/// true is returned. Otherwise, false is returned. | |
pub fn eat_bytes(&mut self, bytes: &[u8]) -> LexResult<bool> { | |
let eq = { | |
let look = try!(self.peek_bytes(bytes.len())); | |
look == bytes | |
}; | |
if eq { | |
self.bump_bytes(bytes.len()); | |
Ok(true) | |
} else { | |
Ok(false) | |
} | |
} | |
/// If a character read from the input matches the given character, consume the input and | |
/// return true. Otherwise, false is returned. | |
pub fn eat_char(&mut self, c: char) -> LexResult<bool> { | |
let look = try!(self.peek_char()); | |
if look == Some(c) { | |
self.bump_bytes(c.len_utf8()); | |
Ok(true) | |
} else { | |
Ok(false) | |
} | |
} | |
#[inline(always)] | |
pub fn eat_str(&mut self, s: &str) -> LexResult<bool> { | |
self.eat_bytes(s.as_bytes()) | |
} | |
/// Skip input until the first non-whitespace character. Whitespace is defined as a character | |
/// with White_Space unicode property. | |
pub fn skip_whitespace(&mut self) -> LexResult<()> { | |
loop { | |
if let Some(c) = try!(self.peek_char()) { | |
if c.is_whitespace() { | |
self.bump_bytes(c.len_utf8()); | |
} else { | |
break; | |
} | |
} | |
} | |
Ok(()) | |
} | |
/// Discard up to `n` bytes from the lookahead buffer | |
pub fn bump_bytes(&mut self, n: usize) { | |
let keep; | |
// swap the bytes we need to keep to the front of the buffer | |
if n < self.lookahead.len() { | |
keep = self.lookahead.len() - n; | |
for i in 0..keep { | |
self.lookahead.swap(i, i+n); | |
} | |
} else { | |
keep = 0; | |
} | |
for &b in &self.lookahead[keep..] { | |
if b == b'\n' { | |
self.line += 1; | |
self.column = 0; | |
} else { | |
self.column += 1; | |
} | |
} | |
for _ in 0..n { | |
self.lookahead.pop(); | |
} | |
} | |
/// Returns the position as (line, column) | |
pub fn position(&self) -> (u32, u32) { | |
(self.line, self.column) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment