Last active
October 14, 2023 20:54
-
-
Save passivedragon/673bf6898a12385075d28d5ea96a5f37 to your computer and use it in GitHub Desktop.
wc coding challenge DevOps
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env -S cargo +nightly -Zscript
//! ```cargo
//! [package]
//! authors = ["passivedragon"]
//! version = "0.0.1"
//! edition = "2021"
//! [dependencies]
//! clap = { version = "4.2", features = ["derive"] }
//! regex = "1.9.6"
//! unicode-segmentation = "1.10.1"
//! rayon = "1.8"
//! cli-table = "0.4.7"
//! ```
use std::collections::hash_map::HashMap;
use std::io::BufRead;

use clap::Parser;
use rayon::prelude::*;
#[derive(Parser, Debug)] | |
#[clap(version, long_about)] | |
/// a wc replacement written as an exercise for DevOps coding challenges | |
/// | |
/// the order of numbers returned when requested, is always: | |
/// newlines, words, characters, bytes | |
struct Args { | |
// #[clap(short, long, help = "Path to config")] | |
// config: Option<std::path::PathBuf>, | |
#[clap(short = 'l', long = "lines", help = "print newline counts")] | |
newlines: bool, | |
#[clap(short = 'w', long = "words", help = "print word counts")] | |
words: bool, | |
#[clap(short = 'm', long = "chars", help = "print character counts")] | |
characters: bool, | |
#[clap(short = 'c', long = "bytes", help = "print byte counts")] | |
bytes: bool, | |
#[clap(help = "paths of files to look at")] | |
files: Option<Vec<std::path::PathBuf>> | |
} | |
/// Identifies one of the counters tracked while scanning input.
///
/// Intended as the key type of [`CountMap`].
#[derive(Clone, Copy, Eq, Hash, PartialEq, Debug)]
enum CountType {
    /// Number of newline characters seen.
    NEWLINES,
    /// Number of words seen.
    WORDS,
    /// Number of characters seen.
    CHARS,
    /// Number of bytes seen.
    BYTES,
    /// Used for word counting: keeps track of whether the last character
    /// looked at was a word character.
    WasLastWord,
}

/// Map from a counter kind to its current value.
type CountMap = HashMap<CountType, usize>;
use cli_table::{format::Justify, print_stdout, Table, WithTitle}; | |
#[derive(Table, Default)] | |
struct Count { | |
#[table(title = "newlines", justify = "Justify::Right")] | |
newlines: usize, | |
#[table(title = "words", justify = "Justify::Right")] | |
words: usize, | |
#[table(title = "chars", justify = "Justify::Right")] | |
chars: usize, | |
#[table(title = "bytes", justify = "Justify::Right")] | |
bytes: usize, | |
#[table(title = "bytes", justify = "Justify::Right")] | |
was_last_word: bool, | |
#[table(title = "source")] | |
origin: String, | |
} | |
fn print_results(args: &Args, count: &[Count]){ | |
let mut s: String = Default::default(); | |
/* | |
if args.newlines { | |
s.push_str(&format!("{}\t", count[&CountType::NEWLINES])); | |
} | |
if args.words { | |
s.push_str(&format!("{}\t", count[&CountType::WORDS])); | |
} | |
if args.characters { | |
s.push_str(&format!("{}\t", count[&CountType::CHARS])); | |
} | |
if args.bytes { | |
s.push_str(&format!("{}\t", count[&CountType::BYTES])); | |
} | |
*/ | |
// println!("{}{1}", s, count.origin); | |
let _ = print_stdout(count.with_title()); | |
} | |
fn count_from_file(args: &Args, path: &std::path::PathBuf) -> Result<Count, Box<dyn std::error::Error + 'static>> { | |
use std::io::{Read, BufReader}; | |
use std::str::from_utf8; | |
let file = &std::fs::File::open(path)?; | |
const LIMIT: usize = 512*8; | |
// let mut handle = file.take(LIMIT.try_into().unwrap()); | |
let mut handle = BufReader::new(file); | |
let mut count: Count = Default::default(); | |
let mut buf: [u8; LIMIT] = [0; LIMIT]; | |
loop { | |
let read_bytes = handle.read(&mut buf[..]).unwrap(); | |
if 0 == read_bytes { | |
break; // reached EOF | |
} | |
let s = match from_utf8(&buf[0..read_bytes]) { | |
Ok(s) => s, | |
Err(e) => { | |
if read_bytes == 0 { | |
panic!("failed to read"); | |
} | |
// println!("caught multibyte"); | |
let _ = handle.seek_relative(- <usize as TryInto<i64>>::try_into(LIMIT-e.valid_up_to()).unwrap()); | |
from_utf8(&buf[0..e.valid_up_to()]).unwrap() | |
} | |
}; | |
count_from_string(&args, &s, &mut count); // could use from_utf8_unchecked instead | |
} | |
return Ok(count); | |
} | |
fn count_from_string(args: &Args, s: &str, count: &mut Count) { | |
if args.newlines { | |
count.newlines += s.matches('\n').count(); | |
} | |
if args.words { | |
use regex::Regex; | |
use unicode_segmentation::UnicodeSegmentation; | |
let is_word = Regex::new(r"\S+").unwrap(); // wc doesn't check like "\w+", so this is for compatibility | |
let mut words = is_word.find_iter(s).collect::<Vec<_>>().len(); | |
let graphemes = s.graphemes(true).collect::<Vec<&str>>(); | |
if is_word.captures(graphemes.first().unwrap()).is_some() && count.was_last_word { | |
words -= 1; | |
} | |
if is_word.captures(&graphemes.last().unwrap()).is_some() { | |
count.was_last_word = true; | |
} else { | |
count.was_last_word = false; | |
} | |
count.words += words; | |
} | |
if args.characters { | |
count.chars += s.chars().count(); | |
} | |
if args.bytes { | |
count.bytes += s.len(); | |
} | |
} | |
fn main() { | |
let args = Args::parse(); | |
// println!("{:?}", args); | |
if let Some(ref files) = args.files { | |
let results: Vec<Count> = files.par_iter() | |
.map(|i|{ | |
let mut count = count_from_file(&args, &i).unwrap(); | |
count.origin = format!("{:?}", i); | |
count | |
}).collect(); | |
print_results(&args, &results); | |
} else { | |
// might be getting input from stdin | |
let mut buf = String::new(); | |
let stdin = std::io::stdin(); | |
let mut count: Count = Default::default(); | |
loop { | |
let res = stdin.read_line(&mut buf); | |
if res.is_ok() && 0 == res.unwrap() { | |
// reached EOF | |
break; | |
} | |
count_from_string(&args, &buf, &mut count); | |
buf.clear(); | |
} | |
count.origin = "stdin".to_owned(); | |
print_results(&args, &[count]); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment