Last active
July 3, 2024 05:51
-
-
Save nikhilr612/6ae60d0edcf97913006b7242a14e3d1c to your computer and use it in GitHub Desktop.
An implementation of a UTF-8 character stream from a reader in Rust.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::io::Read;
use std::io::Result as IoResult;
/// Size, in bytes, of the raw staging buffer that each read from the underlying stream lands in.
const BYTEBUF_SIZE: usize = 2048;

/// Buffered character streaming for `Read` streams.
///
/// Bytes are pulled from the wrapped reader in chunks of at most `BYTEBUF_SIZE`, validated as
/// UTF-8 and handed out one `char` at a time. A multi-byte character that is split across two
/// reads is carried over and completed by the following read.
pub struct CharStream<'a, T: Read> {
    /// Raw bytes from `reader`; the first `bstart` bytes are left over from the previous decode.
    buffer: [u8; BYTEBUF_SIZE],
    /// The offset to start writing into the byte buffer from.
    bstart: usize,
    /// The buffer to store decoded chars.
    chars: String,
    /// The byte offset into `chars` from which the next character can be read.
    offset: usize,
    /// The number of bytes read from `reader`.
    position: usize,
    /// The last character read. Initially set to `None`.
    last: Option<char>,
    /// The underlying byte source.
    reader: &'a mut T,
}

impl<'a, T: Read> From<&'a mut T> for CharStream<'a, T> {
    /// Wrap a mutable reference to a reader. Nothing is read until the first character is requested.
    fn from(a: &'a mut T) -> Self {
        CharStream {
            buffer: [0; BYTEBUF_SIZE],
            bstart: 0,
            chars: String::new(),
            offset: 0,
            position: 0,
            last: None,
            reader: a,
        }
    }
}

impl<T: Read> CharStream<'_, T> {
    /// Read the next character, if possible.
    ///
    /// Returns `Ok(None)` when no characters are left. As with `Read`, once `None` has been
    /// returned, subsequent calls may return characters again if the stream has taken more data.
    ///
    /// # Errors
    ///
    /// Returns any error reported by the underlying reader, or an error of kind
    /// `InvalidData` if the stream contains invalid UTF-8 or ends in the middle of a character.
    pub fn next_char(&mut self) -> IoResult<Option<char>> {
        // Refill only once every decoded character has been handed out.
        if self.offset == self.chars.len() && !self.fill_buffer()? {
            // Can't refill, so we're done.
            return Ok(None);
        }
        let next = self.chars[self.offset..].chars().next();
        if let Some(ch) = next {
            self.offset += ch.len_utf8();
            self.last = next;
        }
        Ok(next)
    }

    /// Refill `chars` from the reader.
    ///
    /// Returns `Ok(true)` if at least one character was decoded and `Ok(false)` if the reader
    /// produced no data and no undecoded bytes are pending.
    fn fill_buffer(&mut self) -> IoResult<bool> {
        // Only called once `chars` is exhausted, so the old contents can be dropped up front.
        self.chars.clear();
        self.offset = 0;
        loop {
            let nread = self.reader.read(&mut self.buffer[self.bstart..])?;
            self.position += nread;
            let blen = self.bstart + nread;
            // No bytes pending and none read: the stream has most likely reached its end.
            if blen == 0 {
                return Ok(false);
            }
            let decoded = match std::str::from_utf8(&self.buffer[..blen]) {
                Ok(_) => blen,
                // A valid prefix exists: hand it out now. Whatever follows it (an incomplete
                // or a malformed sequence) is kept and re-examined on the next refill.
                Err(e) if e.valid_up_to() > 0 => e.valid_up_to(),
                // Nothing decodable yet, but only because a short read stopped in the middle
                // of a character. Keep the partial bytes and read again; a sequence is at
                // most four bytes long, so this settles after a few reads.
                Err(e) if e.error_len().is_none() && nread > 0 => {
                    self.bstart = blen;
                    continue;
                }
                // Either malformed bytes, or the stream ended in the middle of a character.
                Err(_) => {
                    return Err(std::io::Error::new(
                        std::io::ErrorKind::InvalidData,
                        "Invalid UTF-8 data.",
                    ));
                }
            };
            // SAFETY: `from_utf8` above established that the first `decoded` bytes of the
            // buffer are valid UTF-8, and the buffer has not been modified since.
            let text = unsafe { std::str::from_utf8_unchecked(&self.buffer[..decoded]) };
            self.chars.push_str(text);
            // Move only the undecoded tail to the front so the next read appends to it.
            self.buffer.copy_within(decoded..blen, 0);
            self.bstart = blen - decoded;
            return Ok(true);
        }
    }

    /// Get the number of bytes read from the underlying reader thus far.
    ///
    /// Because bytes are read in chunks, this can be ahead of the characters already returned.
    pub fn byte_position(&self) -> usize {
        self.position
    }

    /// Get the most recent character returned by `next_char`.
    /// If `next_char` has never returned a character, returns `None`.
    pub fn last_char(&self) -> Option<char> {
        self.last
    }
}

impl<T: Read> Iterator for CharStream<'_, T> {
    type Item = IoResult<char>;

    /// Yield the next character, or `None` once `next_char` reports that no characters are left.
    ///
    /// An `Err` item does not end the iteration by itself; callers should normally stop at the
    /// first error.
    fn next(&mut self) -> Option<Self::Item> {
        self.next_char().transpose()
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment