// Because there's no kill like over-kill, right?
use std::{
    collections::HashMap,
    ffi::OsStr,
    fs::File,
    io::{BufRead, BufReader},
    path::Path,
};
// These pesky human languages are messy. This will help a bit.
use unicode_normalization::UnicodeNormalization;
use unicode_segmentation::UnicodeSegmentation;
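// Why normalise at all: "é" can be stored either as the single precomposed code
// point U+00E9, or as "e" plus the combining acute accent U+0301. NFC folds both
// spellings into one representation, so both forms of a word share one count.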
// Rust doesn't have a recursive directory walker in the stdlib. While writing an
// iterator-based one probably wouldn't be too complicated, I don't think it would
// be trivial. Plus, I'm already pulling in two dependencies, so what's one more?
use walkdir::WalkDir;
#[derive(Ord, PartialOrd, Eq, PartialEq)]
struct WordCount {
    word: String,
    count: u32,
}
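// Note: the map below is keyed by the *normalised* spelling, while `word` keeps
// the first raw spelling we encountered, so the final output stays readable.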
#[derive(Default)]
struct Buffers {
    line: String,
    normalise: String,
}
fn process_file(
    file: &Path,
    word_counts: &mut HashMap<String, WordCount>,
    buffers: &mut Buffers,
) -> Result<(), std::io::Error> {
    let file = File::open(file)?;
    let mut reader = BufReader::new(file);

    while reader.read_line(&mut buffers.line)? > 0 {
        for word in buffers.line.split_word_bounds() {
            buffers.normalise.clear();
            buffers.normalise.extend(word.nfc().flat_map(|c| c.to_lowercase()));
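            // The flat_map is needed because `char::to_lowercase` returns an
            // iterator: some case mappings produce more than one character (e.g.
            // 'İ' lowercases to "i" followed by a combining dot above).

            // `split_word_bounds` also yields punctuation and whitespace segments,
            // so anything shorter than two graphemes (which also drops one-letter
            // words like "a"), or containing digits, control characters, or
            // whitespace, gets skipped here.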
            if buffers.normalise.graphemes(true).count() < 2
                || buffers
                    .normalise
                    .chars()
                    .any(|c| c.is_numeric() || c.is_control() || c.is_whitespace())
            {
                continue;
            }
            // This gets a little awkward, but the Entry API requires an owned key
            // as input, which would mean cloning the normalised string for every
            // single word. Doing the lookup by hand means we only allocate when a
            // new entry is actually needed.
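            // For comparison, the shorter Entry version would pay that cost up
            // front on every word:
            //     word_counts
            //         .entry(buffers.normalise.clone())
            //         .or_insert_with(|| WordCount { word: word.to_owned(), count: 0 });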
            let count = if let Some(v) = word_counts.get_mut(&buffers.normalise) {
                v
            } else {
                word_counts.insert(
                    buffers.normalise.clone(),
                    WordCount {
                        word: word.to_owned(),
                        count: 0,
                    },
                );
                word_counts.get_mut(&buffers.normalise).unwrap()
            };
            count.count += 1;
        }
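        // `read_line` appends to its buffer rather than overwriting it, so the
        // line buffer has to be cleared by hand before the next read.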
        buffers.line.clear();
    }

    Ok(())
}
fn main() {
    let mut word_counts: HashMap<String, WordCount> = HashMap::new();

    // Using these buffers allows us to re-use the allocations when reading a line,
    // and when normalising each word.
    // We just need to remember to clear the buffers each time.
    let mut buffers = Buffers::default();
    for entry in WalkDir::new(".") {
        let entry = match entry {
            Ok(e) => e,
            Err(err) => {
                eprintln!("Error querying files: {}", err);
                continue;
            }
        };
        // We should do a case-insensitive extension test. Fortunately, the
        // extension we're looking for is plain ASCII, which keeps this simple.
        if !entry.path().is_file()
            || !matches!(
                entry.path().extension().and_then(OsStr::to_str),
                Some(ext) if ext.eq_ignore_ascii_case("txt")
            )
        {
            continue;
        }
        if let Err(err) = process_file(entry.path(), &mut word_counts, &mut buffers) {
            eprintln!("Error processing file: `{}`", entry.path().display());
            eprintln!("{}", err);
        }
    }
    let mut word_array: Vec<_> = word_counts
        .into_iter()
        .map(|(_, count)| count)
        .collect();
    // Must remember to sort our array. What kind of pillock would forget that?
    word_array.sort_by_key(|w| w.count);
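    // The sort is ascending, so the most common words sit at the *end* of the
    // array; iterating in reverse gives the top of the list first. Sorting by
    // `std::cmp::Reverse(w.count)` would be the other way to write this.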
    for word in word_array.iter().rev().take(10) {
        println!("{} {}", word.count, word.word);
    }
}
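// A minimal Cargo.toml sketch to build this (the version numbers are a guess;
// pin whatever resolves for you):
//
//     [dependencies]
//     unicode-normalization = "0.1"
//     unicode-segmentation = "1"
//     walkdir = "2"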