Created
January 24, 2022 18:58
-
-
Save sekhat/3406a54d5171902c01bac4efec3f8623 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::iter::Peekable; | |
use std::num::NonZeroUsize; | |
/// A text token returned from the tokenizer.
///
/// Every variant borrows its text from the tokenizer's input, so the type is
/// cheap to copy and can be compared or debug-printed directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TextToken<'a> {
    /// The contained string is a run of text (non-whitespace) characters.
    Text(&'a str),
    /// The contained string is a run of text characters that is not allowed
    /// to be pushed on to the next line.
    DontBreak(&'a str),
    /// The contained string is a run of characters that are all whitespace.
    Whitespace(&'a str),
}
/// Returns `true` when `c` is one of the characters the tokenizer treats as
/// whitespace: space, tab, carriage return or line feed.
fn is_whitespace(c: char) -> bool {
    matches!(c, ' ' | '\t' | '\r' | '\n')
}
/// Splits the strings produced by an iterator into runs of whitespace and
/// runs of text.
struct Tokenizer<'a, I> {
    /// Source of the strings that still have to be tokenized.
    input: I,
    /// Unconsumed remainder of the string currently being tokenized, if any.
    current: Option<&'a str>,
}
impl<'a, I> Tokenizer<'a, I> | |
where | |
I: Iterator<Item = &'a str>, | |
{ | |
/// Create a new `Tokenizer` fron an iterator | |
fn new<V>(input: V) -> Self | |
where | |
V: IntoIterator<IntoIter = I> | |
{ | |
Self { | |
input: input.into_iter(), | |
current: None, | |
} | |
} | |
} | |
// allow our tokenizer to be cloned if it's containing iterator is also Clone | |
impl<'a, I> Clone for Tokenizer<'a, I> | |
where | |
I: Iterator + Clone, | |
{ | |
fn clone(&self) -> Self { | |
Self { | |
input: self.input.clone(), | |
current: self.current.clone(), | |
} | |
} | |
} | |
impl<'a, I> Iterator for Tokenizer<'a, I> | |
where | |
I: Iterator<Item = &'a str>, | |
{ | |
type Item = TextToken<'a>; | |
fn next(&mut self) -> Option<Self::Item> { | |
// if we have a current string use that to read from | |
// otherwise read the next string from our iterator | |
let string = match self.current.take() { | |
Some(string) => string, | |
None => self.input.next()?, | |
}; | |
// make sure we can read through the character and there string | |
// indices | |
let mut chars = string.char_indices().peekable(); | |
// check if our first character is whitespace or not, so we can | |
// determine if we are reading a run of whitespace characters or text | |
// characters | |
let is_matching_whitespace = match chars.peek() { | |
Some((_, c)) => is_whitespace(*c), | |
None => return self.next(), | |
}; | |
// figure out the end index of our current run | |
let end_index = chars | |
.filter(|(_, c)| is_whitespace(*c) != is_matching_whitespace) | |
.map(|(i, _)| i) | |
.next(); | |
// figure out which part of our string is the result | |
let result = match end_index { | |
Some(index) => &string[..index], | |
None => string, | |
}; | |
// if there is any remaining string set that to current so it's used | |
// next loop | |
self.current = match end_index { | |
Some(index) => Some(&string[index..]), | |
None => None, | |
}; | |
// and return our correct result | |
if is_matching_whitespace { | |
Some(TextToken::Whitespace(result)) | |
} else { | |
Some(TextToken::Text(result)) | |
} | |
} | |
} | |
/// A token produced by `TextLayout`.
///
/// The type only borrows its text, so it is cheap to copy, and it can be
/// compared for equality.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum LayoutToken<'a> {
    /// A piece of text to output as-is.
    Text(&'a str),
    /// A single space between words.
    Space,
    /// A line break.
    Newline,
}
/// Represents the items that can be pushed back | |
enum PushBack<'a> { | |
/// A text token to be used instead of the next item from the tokenizer | |
Token(TextToken<'a>), | |
/// The TextLayout should output a new line, then the next text token | |
/// to process should be `TextToken` contained | |
NewlineThen(TextToken<'a>), | |
/// The TextLayout should output the following `LayoutToken` next | |
LayoutToken(LayoutToken<'a>), | |
/// There is nothing in the push back | |
None, | |
} | |
impl<'a> PushBack<'a> { | |
/// Take the item out of the PushBack, returning it to the caller | |
/// and leaving the existing push back as None | |
fn take(&mut self) -> PushBack<'a> { | |
let mut result = PushBack::None; | |
core::mem::swap(&mut result, self); | |
result | |
} | |
} | |
/// Splits `input` in two in front of its `index`-th character (counted in
/// `char`s, not bytes).
///
/// When `input` has `index` characters or fewer, the whole input is returned
/// as the left half and the right half is empty.
fn split_at_char_index(input: &str, index: usize) -> (&str, &str) {
    // `nth` yields the byte offset at which the `index`-th character starts.
    if let Some((byte_offset, _)) = input.char_indices().nth(index) {
        input.split_at(byte_offset)
    } else {
        (input, "")
    }
}
struct TextLayout<'a, I> | |
where | |
I: Iterator<Item = &'a str>, | |
{ | |
/// The character count that words will be wrapped at | |
width: usize, | |
/// The current position into the current line | |
current: usize, | |
/// Hold the push back state | |
push_back: PushBack<'a>, | |
/// The tokenizer to read `TextToken`s from | |
tokenizer: Peekable<Tokenizer<'a, I>>, | |
} | |
impl<'a, I> TextLayout<'a, I> | |
where | |
I: Iterator<Item = &'a str>, | |
{ | |
/// Creates a new `TextLayout`, given the expected iterator and non-zero | |
/// line width to wrap words at. | |
fn new<V>(input: V, width: NonZeroUsize) -> Self | |
where | |
V: IntoIterator<IntoIter = I>, | |
{ | |
Self { | |
width: width.get(), | |
current: 0, | |
push_back: PushBack::None, | |
tokenizer: Tokenizer::new(input.into_iter()).peekable(), | |
} | |
} | |
} | |
impl<'a, I> Iterator for TextLayout<'a, I> | |
where | |
I: Iterator<Item = &'a str> + Clone, | |
{ | |
type Item = LayoutToken<'a>; | |
fn next(&mut self) -> Option<Self::Item> { | |
let token = match self.push_back.take() { | |
// If there's no item on the push back, read a TextToken from | |
// the tokenizer | |
PushBack::None => self.tokenizer.next()?, | |
// If there's a text token on the push back, use that | |
PushBack::Token(token) => token, | |
// If we are expecting to output a newline, do that and prepare | |
// the next text token | |
PushBack::NewlineThen(token) => { | |
// we first update the push back to be a token for next | |
// time `next()` is called | |
self.push_back = PushBack::Token(token); | |
// reset the line index | |
self.current = 0; | |
// and return a Newline | |
return Some(LayoutToken::Newline); | |
} | |
// If we are expecting a layout token just return that | |
PushBack::LayoutToken(token) => return Some(token), | |
}; | |
// are we allowed to push the current token on to the next line? | |
let allow_next_line = match token { | |
TextToken::DontBreak(_) => false, | |
_ => true, | |
}; | |
match token { | |
// Both DontBreak and Text means a string of text | |
// DontBreak just means, you aren't allowed to push the word | |
// on to the next line hence the allow_next_line variable above | |
TextToken::DontBreak(text) | TextToken::Text(text) => { | |
// | |
// Multiple word tokens in a row means it's likely all one word | |
// so we need to add up all the sizes to make sure we make | |
// the correct decision | |
// We clone the tokenizer here, so we can advance that iterator | |
// seperately from our own. | |
// | |
// As long as the underlying iterator that tokenizer depends on | |
// doesn't require allocating to clone, no allocation will occur | |
// | |
// But that at least leaves it in the users hands | |
let mut find_end_iter = self.tokenizer.clone(); | |
// The length of just this part of the text | |
let part_length = text.chars().count(); | |
// calculate the full length of the current word | |
let mut length = part_length; | |
for item in find_end_iter { | |
match item { | |
TextToken::DontBreak(text) | | |
TextToken::Text(text) => { | |
length += text.chars().count() | |
}, | |
_ => break | |
} | |
} | |
// So we have a part, which may be smaller than the word | |
// we need to decide new line base on the whole word, | |
// though allow_next_line changes that behaviour | |
let at_start_of_line = self.current == 0; | |
let end_line_index = self.current + length; | |
let word_will_overflow = end_line_index > self.width; | |
let part_will_overflow = | |
self.current + part_length > self.width; | |
match ( | |
at_start_of_line, | |
word_will_overflow, | |
allow_next_line, | |
part_will_overflow, | |
) { | |
(false, true, false, false) | (true, true, _, false) => { | |
// Force that the next token won't allow being pushed | |
// onto the next line, since we are only part of the | |
// whole word | |
self.push_back = match self.tokenizer.next() { | |
Some(TextToken::Text(text)) | |
| Some(TextToken::DontBreak(text)) => { | |
PushBack::Token(TextToken::DontBreak(text)) | |
} | |
Some(TextToken::Whitespace(text)) => { | |
PushBack::Token(TextToken::Whitespace(text)) | |
} | |
None => PushBack::None, | |
}; | |
self.current += text.len(); | |
Some(LayoutToken::Text(text)) | |
} | |
(false, true, false, true) | (true, true, _, true) => { | |
// allow_next_line is irrelevant here as we are | |
// already at the start of a line | |
// | |
// however, if out part is smaller than the overflow | |
// we just want to return that and make the next | |
// token DontBreak | |
// we know that we will over flow, so it's just | |
// the remaining size of the line we need | |
let valid_char_count = self.width - self.current; | |
let (left, right) = | |
split_at_char_index(text, valid_char_count); | |
if right.len() > 0 { | |
self.push_back = | |
PushBack::NewlineThen(TextToken::Text(right)); | |
} | |
self.current += valid_char_count; | |
Some(LayoutToken::Text(left)) | |
} | |
(_, false, _, _) => { | |
// We are at start of the line, and we know we wont | |
// overflow so just return text | |
self.current += text.len(); | |
Some(LayoutToken::Text(text)) | |
} | |
(false, true, true, _) => { | |
// So we aren't at the start of the line, but the word | |
// is allowed to be pushed to the next line, it will | |
// overflow, so we need to return a newline so we can | |
// just push back the token and output the newline | |
// | |
// Doing this should trigger the at start_of_line handlers | |
self.push_back = PushBack::Token(TextToken::Text(text)); | |
self.current = 0; | |
Some(LayoutToken::Newline) | |
} | |
} | |
} | |
TextToken::Whitespace(_) => { | |
// we can assume getting this far that push_back is now None | |
// consume all our whitespace tokens | |
while let Some(TextToken::Whitespace(_)) = | |
self.tokenizer.peek() | |
{ | |
// fine returning early if we reach none while in | |
// white space | |
self.tokenizer.next()?; | |
} | |
// If we are at the end of the current line however, force | |
// a new line | |
let at_start_of_line = self.current == 0; | |
// only increase current if not at start of line, so that | |
// spaces are trimmed at start of line, but simulated otherwise | |
self.current += if at_start_of_line { 0 } else { 1 }; | |
// if we get none next then we can short circuit return with | |
// None, as it has the same result as trimming whitespace | |
// at the end | |
let next_token = self.next()?; | |
match (at_start_of_line, next_token) { | |
// If we have a new line as our next token we can just | |
// return that. | |
(_, LayoutToken::Newline) => Some(LayoutToken::Newline), | |
// if we are at start of line then we can do the same | |
// with text | |
(true, LayoutToken::Text(text)) => { | |
Some(LayoutToken::Text(text)) | |
} | |
// if we aren't at start of line we should probably output | |
// our space so it's no longer simulated, but we need to | |
// make sure next run round is the text we got, so we | |
// put that on the push back | |
(false, LayoutToken::Text(text)) => { | |
self.push_back = | |
PushBack::LayoutToken(LayoutToken::Text(text)); | |
Some(LayoutToken::Space) | |
} | |
// if we get a second lot of white space, there might be | |
// even more, so consume all the white space tokens from | |
// the tokenizer before returning our space | |
(_, LayoutToken::Space) => unreachable!() | |
} | |
} | |
} | |
} | |
} | |
fn layout_and_print<'a, I>(iterator: I, width: usize) | |
where | |
I: IntoIterator<Item = &'a str>, | |
I::IntoIter: Clone, | |
{ | |
let width = NonZeroUsize::new(width).unwrap(); | |
let text_layout = TextLayout::new(iterator, width); | |
for item in text_layout { | |
match item { | |
LayoutToken::Text(text) => print!("{}", text), | |
LayoutToken::Space => print!(" "), | |
LayoutToken::Newline => println!(""), | |
} | |
} | |
} | |
fn main() { | |
println!("first"); | |
layout_and_print(["x---xy---yz--z"], 5); | |
println!("\n\nsecond"); | |
layout_and_print(["x---xy-", "--yz", "--z"], 5); | |
println!("\n\nthird"); | |
layout_and_print(["12345 ", " 67890", "12345", "1234567890"], 5); | |
println!("\n\ndone"); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment