-
-
Save hroi/d95f32f281eccf1b7e3ecd2988fbad80 to your computer and use it in GitHub Desktop.
Code shared from the Rust Playground
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// Iterator over the sentences of a borrowed piece of text.
///
/// `inner` always holds the part of the input that has not been yielded yet;
/// it shrinks as the iterator advances and is empty once it is exhausted.
#[derive(Debug, Clone)]
pub struct Sentences<'a> {
    /// Remaining, not yet yielded, text.
    pub inner: &'a str,
}
pub fn sentences(input: &str) -> Sentences { | |
Sentences { inner: input } | |
} | |
// Character classes used by the sentence splitter. Each class is a distinct
// single bit, so one class fits in a five-bit slot of the token history.

/// Whitespace (the splitter also maps commas to this class).
const TOK_WHITESPACE: u32 = 0b0001;
/// A literal `.` character.
const TOK_PERIOD: u32 = 0b0010;
/// An uppercase character.
const TOK_UPPERCASE: u32 = 0b0100;
/// Any character that is none of the above.
const TOK_TEXT: u32 = 0b1000;
/// Abbreviations whose trailing period must not be taken as the end of a
/// sentence. Every abbreviation is listed twice: followed by a single space
/// and without one.
// NOTE(review): apart from "Mr."/"Mrs." these look like Danish abbreviations
// - confirm with the author before extending the list.
const STOPWORDS: &[&str] = &[
    "Mr. ",
    "Mr.",
    "Mrs. ",
    "Mrs.",
    "Hr. ",
    "Hr.",
    "Fr. ",
    "Fr.",
    "f.eks. ",
    "f.eks.",
    "f. eks. ",
    "f. eks.",
    "jf. ",
    "jf.",
    "bl.a. ",
    "bl.a.",
    "bl. a. ",
    "bl. a.",
];
const SENTENCE_BREAKERS: &[u32] = &[ | |
(TOK_TEXT << 15) | (TOK_PERIOD << 10) | (TOK_WHITESPACE << 5) | TOK_UPPERCASE, | |
(TOK_TEXT << 10) | (TOK_PERIOD << 5) | TOK_UPPERCASE, | |
(TOK_WHITESPACE << 10) | (TOK_PERIOD << 5) | TOK_UPPERCASE, | |
]; | |
impl<'a> Iterator for Sentences<'a> { | |
type Item = &'a str; | |
fn next(&mut self) -> Option<&'a str> { | |
let mut lookback = 0; | |
for (pos, ch) in self.inner.char_indices() { | |
//dbg!(&self.inner[..pos]); | |
let tok = match ch { | |
'.' => TOK_PERIOD, | |
_ if ch.is_uppercase() => TOK_UPPERCASE, | |
_ if ch.is_whitespace() => TOK_WHITESPACE, | |
',' => TOK_WHITESPACE, | |
_ => TOK_TEXT, | |
}; | |
if tok & lookback != tok { | |
lookback <<= 5; | |
lookback |= tok; | |
} | |
if SENTENCE_BREAKERS.iter().any(|sb| (*sb & lookback) == *sb) | |
&& !STOPWORDS | |
.iter() | |
.any(|stop| self.inner[..pos].ends_with(stop)) | |
{ | |
let (ret, next) = self.inner.split_at(pos); | |
self.inner = next; //.trim(); | |
return Some(ret); | |
} | |
} | |
match self.inner { | |
"" => None, | |
ret => { | |
self.inner = ""; | |
Some(ret) | |
} | |
} | |
} | |
} | |
/// Demo entry point: runs the sentence splitter over an embedded multi-line
/// passage and prints every piece it yields.
fn main() { | |
//let text = "finally. there is.One, two. Three. \n four.\n This is the final sentence. "; | |
// Sample input, kept as a raw string so the line breaks inside the passage
// are part of the text handed to `sentences`.
let text = r#" | |
Before Mr. Clensy left the _Catholot_ he told Sestrina to expect to see him | |
on board again that same night. | |
The _Catholot_ was supposed to sail next morning, so Mr.Clensy naturally | |
presumed that he could, at any rate, row out to her and see Sestrina | |
once more before she sailed. | |
That same night, Biglow, Clensy and Adams packed their few goods and got | |
all ready to clear out of Hayti. They had decided to take a boat from L— | |
and row out to the _Catholot_ after dark, get on board by some excuse | |
and then stow away. | |
That night, without delay, they hired the boat. | |
“If _one_ can stow away three can, eh, lad?” said Biglow, as they pulled | |
at the oars and got round by the bend of the harbour near S—. | |
In a few moments they had turned the point where they got a good view of | |
the harbour. | |
“Done! She’s sailed!” said Biglow in a mighty voice. | |
He nearly upset the boat as he stood up and stared over the waters of | |
the starlit harbour. | |
It was true enough, the _Catholot_ had sailed. Sestrina had gone from | |
Hayti! | |
"#; | |
// `{:?}` prints each piece quoted and escaped, so the untrimmed whitespace
// and embedded newlines of a sentence stay visible in the output.
for sentence in sentences(text) { | |
println!("sentence: {:?}", sentence) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment