// Because there's no kill like over-kill, right?
use std::{
    collections::HashMap,
    ffi::OsStr,
    fs::File,
    io::{BufRead, BufReader},
    path::Path,
};

// These pesky Human languages are messy. This will help a bit.
use unicode_normalization::UnicodeNormalization;
use unicode_segmentation::UnicodeSegmentation;
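
// For example, "é" can be a single precomposed scalar (U+00E9) or "e" plus a
// combining acute accent (U+0065, U+0301); NFC folds both to the same form,
// so the two spellings are counted as the same word.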

// Rust doesn't have a recursive directory walker in stdlib. While writing an
// iterator-based one probably wouldn't be too complicated, I don't think it
// would be trivial. Plus, I'm already pulling in two dependencies, what's one more?
use walkdir::WalkDir;

#[derive(Ord, PartialOrd, Eq, PartialEq)]
struct WordCount {
    word: String,
    count: u32,
}

#[derive(Default)]
struct Buffers {
    line: String,
    normalise: String,
}

fn process_file(
    file: &Path,
    word_counts: &mut HashMap<String, WordCount>,
    buffers: &mut Buffers,
) -> Result<(), std::io::Error> {
    let file = File::open(file)?;
    let mut reader = BufReader::new(file);

    while reader.read_line(&mut buffers.line)? > 0 {
        for word in buffers.line.split_word_bounds() {
            buffers.normalise.clear();
            buffers.normalise.extend(word.nfc().flat_map(|c| c.to_lowercase()));
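
            // split_word_bounds also yields the punctuation and whitespace
            // between words, so skip anything shorter than two graphemes or
            // containing digits, control characters, or whitespace.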
            if buffers.normalise.graphemes(true).count() < 2
                || buffers.normalise.chars().any(|c| c.is_numeric() || c.is_control() || c.is_whitespace())
            {
                continue;
            }

            // This gets a little awkward, but the Entry API requires an owned
            // key, which would mean cloning the buffer for every single word.
            // Doing the lookup by hand means we only allocate when a new entry
            // is actually needed.
            let count = if let Some(v) = word_counts.get_mut(&buffers.normalise) {
                v
            } else {
                word_counts.insert(buffers.normalise.clone(), WordCount {
                    word: word.to_owned(),
                    count: 0,
                });
                word_counts.get_mut(&buffers.normalise).unwrap()
            };
            count.count += 1;
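
            // For contrast, the straightforward Entry version is shorter, but
            // clones the key on every word whether or not it's already present
            // (a sketch of the road not taken):
            //
            //     word_counts.entry(buffers.normalise.clone())
            //         .or_insert_with(|| WordCount { word: word.to_owned(), count: 0 })
            //         .count += 1;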
        }
        buffers.line.clear();
    }
    Ok(())
}

fn main() {
    let mut word_counts: HashMap<String, WordCount> = HashMap::new();

    // Using these buffers allows us to re-use the allocations when reading a line,
    // and when normalizing each word.
    // We just need to remember to clear the buffers each time.
    let mut buffers = Buffers::default();

    for entry in WalkDir::new(".") {
        let entry = match entry {
            Ok(e) => e,
            Err(err) => {
                eprintln!("Error querying files: {}", err);
                continue;
            }
        };

        // Do a case-insensitive extension test. Fortunately, extensions are
        // limited to ASCII here, which keeps it simple.
        if !entry.path().is_file()
            || !matches!(entry.path().extension().and_then(OsStr::to_str), Some(ext) if ext.eq_ignore_ascii_case("txt"))
        {
            continue;
        }

        if let Err(err) = process_file(entry.path(), &mut word_counts, &mut buffers) {
            eprintln!("Error processing file: `{}`", entry.path().display());
            eprintln!("{}", err);
        }
    }

    let mut word_array: Vec<_> = word_counts.into_iter()
        .map(|(_, count)| count)
        .collect();

    // Must remember to sort our array. What kind of pillock would forget that?
    word_array.sort_by_key(|w| w.count);
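
    // Sorting descending up front would also work, and skips the rev() below
    // (a sketch of the alternative, using std's Reverse adapter):
    //
    //     use std::cmp::Reverse;
    //     word_array.sort_unstable_by_key(|w| Reverse(w.count));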

    for word in word_array.iter().rev().take(10) {
        println!("{} {}", word.count, word.word);
    }
}
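
// A minimal Cargo.toml for trying this out (the version numbers are my
// assumptions; pin whatever is current):
//
//     [dependencies]
//     unicode-normalization = "0.1"
//     unicode-segmentation = "1"
//     walkdir = "2"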
yạ̇y yạ̇y ẽ̗