Skip to content

Instantly share code, notes, and snippets.

@kugland
Last active June 7, 2022 13:15
Show Gist options
  • Save kugland/71f405bf7bde3fafc1597e7082a085bc to your computer and use it in GitHub Desktop.
Save kugland/71f405bf7bde3fafc1597e7082a085bc to your computer and use it in GitHub Desktop.
Filter only printable characters from stdin. Like strings(1), but supporting UTF-8.
use std::io::{Read, Write, stdin, stdout};
/// Decoder state carried from one input byte to the next.
///
/// The common traits are derived so the state can be debug-printed, compared
/// and copied freely; it is only three machine words at most.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State {
    /// Not inside a multi-byte sequence.
    Normal,
    /// Inside a multi-byte UTF-8 sequence whose start byte has been consumed.
    Utf8(
        /// Total number of bytes in the sequence (start byte included).
        usize,
        /// Number of bytes left to read.
        usize,
        /// Code point bits accumulated so far.
        u32,
    ),
}
/// Returns `true` if `cp` is a code point this filter is willing to emit.
///
/// Rejected values:
/// * C0 control characters, except the whitespace controls U+0009..=U+000D
///   (tab, line feed, vertical tab, form feed, carriage return);
/// * DEL (U+007F) and the C1 control characters U+0080..=U+009F;
/// * surrogates (U+D800..=U+DFFF), which are not Unicode scalar values;
/// * noncharacters (U+FDD0..=U+FDEF and every code point ending in FFFE or
///   FFFF), cf. The Unicode Standard, Version 14.0, 23.7: "Noncharacters";
/// * anything above U+10FFFF.
///
/// Every value accepted here is a valid Unicode scalar value.
fn is_printable_codepoint(cp: u32) -> bool {
    match cp {
        // C0 controls other than the whitespace controls.
        0x00..=0x08 | 0x0E..=0x1F => false,
        // DEL and the C1 controls. The latter are encoded as well-formed
        // 2-byte UTF-8 sequences, so they must be rejected explicitly.
        0x7F..=0x9F => false,
        // Surrogates only have meaning in UTF-16.
        0xD800..=0xDFFF => false,
        // Arabic Presentation Forms-A noncharacters.
        0xFDD0..=0xFDEF => false,
        // Beyond the end of the Unicode code space.
        _ if cp > 0x10FFFF => false,
        // The last two code points of each plane are noncharacters
        // (anything ending in FFFE or FFFF).
        _ if (cp & 0xFFFE) == 0xFFFE => false,
        _ => true,
    }
}
/// Tells whether `cp` was encoded with more bytes than UTF-8 requires.
///
/// `len` is the length in bytes of the sequence that produced `cp`; the
/// encoding is overlong when `cp` is below the smallest code point that
/// needs that many bytes.
///
/// # Panics
///
/// Panics if `len` is not 2, 3 or 4.
fn is_overlong_utf8(cp: u32, len: usize) -> bool {
    // Smallest code point that genuinely needs a sequence of `len` bytes.
    let lowest_for_len: u32 = match len {
        2 => 0x80,
        3 => 0x800,
        4 => 0x10000,
        _ => unreachable!(),
    };
    cp < lowest_for_len
}
/// Copies stdin to stdout, keeping only printable text.
///
/// Printable ASCII bytes (plus the whitespace controls 0x09..=0x0d) are
/// written through unchanged. Multi-byte UTF-8 sequences are decoded with a
/// small state machine and written only when they are complete, not overlong
/// and accepted by `is_printable_codepoint`. Every other byte is dropped.
///
/// # Panics
///
/// Panics if reading from stdin or writing to stdout fails.
fn main() {
    // Lock both streams once instead of re-locking them for every byte.
    let input = stdin();
    let output = stdout();
    let mut out = output.lock();

    let mut state = State::Normal;
    for byte in input.lock().bytes() {
        // A read error must not look like a clean end of input.
        let b = byte.expect("failed to read from stdin");
        state = match b {
            // ASCII
            0x09..=0x0d | 0x20..=0x7e => {
                // `write_all` rather than `write`: a short write would
                // silently lose output.
                out.write_all(&[b]).expect("failed to write to stdout");
                State::Normal
            }
            // UTF-8 (start byte for 2-byte sequence)
            0b110_00000..=0b110_11111 => State::Utf8(2, 1, b as u32 & 0b11111),
            // UTF-8 (start byte for 3-byte sequence)
            0b1110_0000..=0b1110_1111 => State::Utf8(3, 2, b as u32 & 0b1111),
            // UTF-8 (start byte for 4-byte sequence)
            0b11110_000..=0b11110_111 => State::Utf8(4, 3, b as u32 & 0b111),
            // UTF-8 (continuation byte)
            0b10_000000..=0b10_111111 => match state {
                // Continuation byte without a start byte: drop it.
                State::Normal => State::Normal,
                State::Utf8(len, rem, cp) => {
                    let rem = rem - 1;
                    let cp = (cp << 6) | ((b as u32) & 0b00_111111);
                    if rem > 0 {
                        // Still more bytes to read, just accumulate.
                        State::Utf8(len, rem, cp)
                    } else {
                        if is_printable_codepoint(cp) && !is_overlong_utf8(cp, len) {
                            // The checked conversion keeps this block free of
                            // `unsafe`; it yields `None` for surrogates and
                            // for values above U+10FFFF.
                            if let Some(c) = char::from_u32(cp) {
                                let mut buf = [0u8; 4];
                                out.write_all(c.encode_utf8(&mut buf).as_bytes())
                                    .expect("failed to write to stdout");
                            }
                        }
                        State::Normal
                    }
                }
            },
            // Control bytes, DEL and bytes that cannot start a sequence
            // (0xf8..=0xff): drop them and abandon any pending sequence.
            _ => State::Normal,
        };
    }
    // Surface errors from the final flush instead of losing them at exit.
    out.flush().expect("failed to flush stdout");
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment