Aatch · October 19, 2015 04:11
diff --git a/lexer.rs b/lexer.rs
 extern crate smallvec;

 use std::collections::VecDeque;
 use std::io::Read;

 use smallvec::SmallVec;

 pub type LexResult<T> = std::result::Result<T, Error>;

 /**
 * Provides functionality for implementing a lexer.
 *
 * Wraps a byte reader `R`, keeps a small buffer of peeked bytes so input can be inspected
 * before it is consumed, and tracks the current line and column as bytes are consumed.
 */
 #[derive(Clone)]
 pub struct Lexer<R: Read> {
    /// Underlying byte source.
    input: R,
    /// Bytes that have been peeked from `input` but not yet consumed.
    lookahead: SmallVec<[u8; 8]>,
    /// Set once a read from `input` has returned zero bytes.
    is_eof: bool,
    /// Current line number; starts at 1 and is incremented for every consumed `\n` byte.
    line: u32,
    /// Bytes consumed since the last `\n`; starts at 0 and is reset to 0 on every newline.
    column: u32,
 }

 impl<R: Read> Lexer<R> {
    pub fn new(input: R) -> Lexer<R> {
        Lexer {
            input: input,
            lookahead: SmallVec::new(),
            is_eof: false,
            line: 1,
            column: 0
        }
    }

    pub fn into_input(self) -> R {
        self.input
    }

    /// Read a single byte from the input
    pub fn read_byte(&mut self) -> LexResult<u8> {
        if self.lookahead.len() > 0 {
            let b = self.lookahead[0];
            self.bump_bytes(1);
            return Ok(b);
        } else if self.is_eof {
            return Err(Error::Eof);
        } else {
            let mut buf = [0u8];
            match self.input.read(&mut buf) {
                Ok(0) => {
                    self.is_eof = true;
                    return Err(Error::Eof);
                }
                Err(e) => return Err(Error::Io(e)),
                _ => ()
            }

            if buf[0] == b'\n' {
                self.line += 1;
                self.column = 0;
            } else {
                self.column += 1;
            }

            return Ok(buf[0]);
        }
    }

    /// Look at the next `n` bytes from the input. If there are not enough bytes, the returned
    /// slice may be smaller than `n`.
    pub fn peek_bytes(&mut self, n: usize) -> LexResult<&[u8]> {
        if n <= self.lookahead.len() {
            return Ok(&self.lookahead[0..n]);
        } else {
            let mut cur_len = self.lookahead.len();
            let to_read = n - cur_len;

            // Fill the extra space we need with 0
            for _ in 0..to_read {
                self.lookahead.push(0);
            }

            // Read the appropriate number of bytes from the input
            while cur_len < self.lookahead.len() {
                // Make the buffer
                let buf = &mut self.lookahead[cur_len..n];
                match self.input.read(buf) {
                    // Read zero bytes, thats an eof, but don't treat it as an error, just stop reading
                    Ok(0) => {
                        self.is_eof = true;
                        break;
                    }
                    // Read some bytes, bump the current length of the lookahead buffer
                    Ok(n) => {
                        cur_len += n;
                    }
                    Err(e) => return Err(Error::Io(e))
                }
            };

            // Return a slice into the lookahead buffer
            Ok(&self.lookahead[..cur_len])
        }
    }

    /// Try to read a character from the input.
    /// Returns None if a valid character cannot be read and input is not consumed.
    pub fn read_char(&mut self) -> LexResult<Option<char>> {
        let c = try!(self.peek_char());
        if let Some(c) = c {
            self.bump_bytes(c.len_utf8());
        }
        Ok(c)
    }

    /// Try to look a character from the input.
    /// Returns None if a valid character cannot be read.
    pub fn peek_char(&mut self) -> LexResult<Option<char>> {
        let c = {
            let bytes = try!(self.peek_bytes(6));
            let s = std::str::from_utf8(bytes);
            match s {
                Ok(s) if s.len() > 0 => {
                    s.chars().nth(0).unwrap()
                }
                _ => {
                    return Ok(None);
                }
            }
        };
        Ok(Some(c))
    }

    /// Matches the input against the given bytes, if they compare equal, the input is consumed and
    /// true is returned. Otherwise, false is returned.
    pub fn eat_bytes(&mut self, bytes: &[u8]) -> LexResult<bool> {
        let eq = {
            let look = try!(self.peek_bytes(bytes.len()));
            look == bytes
        };
        if eq {
            self.bump_bytes(bytes.len());
            Ok(true)
        } else {
            Ok(false)
        }
    }

    /// If a character read from the input matches the given character, consume the input and
    /// return true. Otherwise, false is returned.
    pub fn eat_char(&mut self, c: char) -> LexResult<bool> {
        let look = try!(self.peek_char());
        if look == Some(c) {
            self.bump_bytes(c.len_utf8());
            Ok(true)
        } else {
            Ok(false)
        }
    }

    #[inline(always)]
    pub fn eat_str(&mut self, s: &str) -> LexResult<bool> {
        self.eat_bytes(s.as_bytes())
    }

    /// Skip input until the first non-whitespace character. Whitespace is defined as a character
    /// with White_Space unicode property.
    pub fn skip_whitespace(&mut self) -> LexResult<()> {
        loop {
            if let Some(c) = try!(self.peek_char()) {
                if c.is_whitespace() {
                    self.bump_bytes(c.len_utf8());
                } else {
                    break;
                }
            }
        }

        Ok(())
    }

    /// Discard up to `n` bytes from the lookahead buffer
    pub fn bump_bytes(&mut self, n: usize) {
        let keep;
        // swap the bytes we need to keep to the front of the buffer
        if n < self.lookahead.len() {
            keep = self.lookahead.len() - n;
            for i in 0..keep {
                self.lookahead.swap(i, i+n);
            }
        } else {
            keep = 0;
        }


        for &b in &self.lookahead[keep..] {
            if b == b'\n' {
                self.line += 1;
                self.column = 0;
            } else {
                self.column += 1;
            }
        }

        for _ in 0..n {
            self.lookahead.pop();
        }
    }

    /// Returns the position as (line, column)
    pub fn position(&self) -> (u32, u32) {
        (self.line, self.column)
    }
 }
	extern crate smallvec;

	use std::collections::VecDeque;
	use std::io::Read;

	use smallvec::SmallVec;

	pub type LexResult<T> = std::result::Result<T, Error>;

	/**
	* Provides functionality for implementing a lexer.
	*
	* Wraps a byte reader `R`, keeps a small buffer of peeked bytes so input can be inspected
	* before it is consumed, and tracks the current line and column as bytes are consumed.
	*/
	#[derive(Clone)]
	pub struct Lexer<R: Read> {
	/// Underlying byte source.
	input: R,
	/// Bytes that have been peeked from `input` but not yet consumed.
	lookahead: SmallVec<[u8; 8]>,
	/// Set once a read from `input` has returned zero bytes.
	is_eof: bool,
	/// Current line number; starts at 1 and is incremented for every consumed `\n` byte.
	line: u32,
	/// Bytes consumed since the last `\n`; starts at 0 and is reset to 0 on every newline.
	column: u32,
	}

	impl<R: Read> Lexer<R> {
	pub fn new(input: R) -> Lexer<R> {
	Lexer {
	input: input,
	lookahead: SmallVec::new(),
	is_eof: false,
	line: 1,
	column: 0
	}
	}

	pub fn into_input(self) -> R {
	self.input
	}

	/// Read a single byte from the input
	pub fn read_byte(&mut self) -> LexResult<u8> {
	if self.lookahead.len() > 0 {
	let b = self.lookahead[0];
	self.bump_bytes(1);
	return Ok(b);
	} else if self.is_eof {
	return Err(Error::Eof);
	} else {
	let mut buf = [0u8];
	match self.input.read(&mut buf) {
	Ok(0) => {
	self.is_eof = true;
	return Err(Error::Eof);
	}
	Err(e) => return Err(Error::Io(e)),
	_ => ()
	}

	if buf[0] == b'\n' {
	self.line += 1;
	self.column = 0;
	} else {
	self.column += 1;
	}

	return Ok(buf[0]);
	}
	}

	/// Look at the next `n` bytes from the input. If there are not enough bytes, the returned
	/// slice may be smaller than `n`.
	pub fn peek_bytes(&mut self, n: usize) -> LexResult<&[u8]> {
	if n <= self.lookahead.len() {
	return Ok(&self.lookahead[0..n]);
	} else {
	let mut cur_len = self.lookahead.len();
	let to_read = n - cur_len;

	// Fill the extra space we need with 0
	for _ in 0..to_read {
	self.lookahead.push(0);
	}

	// Read the appropriate number of bytes from the input
	while cur_len < self.lookahead.len() {
	// Make the buffer
	let buf = &mut self.lookahead[cur_len..n];
	match self.input.read(buf) {
	// Read zero bytes, thats an eof, but don't treat it as an error, just stop reading
	Ok(0) => {
	self.is_eof = true;
	break;
	}
	// Read some bytes, bump the current length of the lookahead buffer
	Ok(n) => {
	cur_len += n;
	}
	Err(e) => return Err(Error::Io(e))
	}
	};

	// Return a slice into the lookahead buffer
	Ok(&self.lookahead[..cur_len])
	}
	}

	/// Try to read a character from the input.
	/// Returns None if a valid character cannot be read and input is not consumed.
	pub fn read_char(&mut self) -> LexResult<Option<char>> {
	let c = try!(self.peek_char());
	if let Some(c) = c {
	self.bump_bytes(c.len_utf8());
	}
	Ok(c)
	}

	/// Try to look a character from the input.
	/// Returns None if a valid character cannot be read.
	pub fn peek_char(&mut self) -> LexResult<Option<char>> {
	let c = {
	let bytes = try!(self.peek_bytes(6));
	let s = std::str::from_utf8(bytes);
	match s {
	Ok(s) if s.len() > 0 => {
	s.chars().nth(0).unwrap()
	}
	_ => {
	return Ok(None);
	}
	}
	};
	Ok(Some(c))
	}

	/// Matches the input against the given bytes, if they compare equal, the input is consumed and
	/// true is returned. Otherwise, false is returned.
	pub fn eat_bytes(&mut self, bytes: &[u8]) -> LexResult<bool> {
	let eq = {
	let look = try!(self.peek_bytes(bytes.len()));
	look == bytes
	};
	if eq {
	self.bump_bytes(bytes.len());
	Ok(true)
	} else {
	Ok(false)
	}
	}

	/// If a character read from the input matches the given character, consume the input and
	/// return true. Otherwise, false is returned.
	pub fn eat_char(&mut self, c: char) -> LexResult<bool> {
	let look = try!(self.peek_char());
	if look == Some(c) {
	self.bump_bytes(c.len_utf8());
	Ok(true)
	} else {
	Ok(false)
	}
	}

	#[inline(always)]
	pub fn eat_str(&mut self, s: &str) -> LexResult<bool> {
	self.eat_bytes(s.as_bytes())
	}

	/// Skip input until the first non-whitespace character. Whitespace is defined as a character
	/// with White_Space unicode property.
	pub fn skip_whitespace(&mut self) -> LexResult<()> {
	loop {
	if let Some(c) = try!(self.peek_char()) {
	if c.is_whitespace() {
	self.bump_bytes(c.len_utf8());
	} else {
	break;
	}
	}
	}

	Ok(())
	}

	/// Discard up to `n` bytes from the lookahead buffer
	pub fn bump_bytes(&mut self, n: usize) {
	let keep;
	// swap the bytes we need to keep to the front of the buffer
	if n < self.lookahead.len() {
	keep = self.lookahead.len() - n;
	for i in 0..keep {
	self.lookahead.swap(i, i+n);
	}
	} else {
	keep = 0;
	}


	for &b in &self.lookahead[keep..] {
	if b == b'\n' {
	self.line += 1;
	self.column = 0;
	} else {
	self.column += 1;
	}
	}

	for _ in 0..n {
	self.lookahead.pop();
	}
	}

	/// Returns the position as (line, column)
	pub fn position(&self) -> (u32, u32) {
	(self.line, self.column)
	}
	}