Last active
June 7, 2022 13:15
-
-
Save kugland/71f405bf7bde3fafc1597e7082a085bc to your computer and use it in GitHub Desktop.
Filter only printable characters from stdin. Like strings(1), but supporting UTF-8.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use std::io::{self, stdin, stdout, BufWriter, Read, Write};
/// Decoder state carried from one input byte to the next.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State {
    /// Not inside a multi-byte UTF-8 sequence.
    Normal,
    /// Inside a multi-byte UTF-8 sequence whose start byte has been seen.
    Utf8(
        /// total number of bytes in the sequence
        usize,
        /// number of bytes left to read
        usize,
        /// code point (accumulated)
        u32,
    ),
}
/// Returns `true` when `cp` is a code point that is safe to emit as text.
///
/// Rejected values:
/// * control characters: C0 controls other than TAB..CR (the same whitespace
///   the byte-level ASCII filter keeps), DEL, and the C1 controls
///   U+0080..=U+009F (which include U+009B, the 8-bit CSI);
/// * surrogates (U+D800..=U+DFFF), which are not Unicode scalar values;
/// * noncharacters (Cf. The Unicode Standard, Version 14.0, 23.7:
///   "Noncharacters"): U+FDD0..=U+FDEF and the last two code points of
///   every plane;
/// * anything above U+10FFFF.
fn is_printable_codepoint(cp: u32) -> bool {
    match cp {
        // C0 controls except TAB, LF, VT, FF and CR.
        0x00..=0x08 | 0x0E..=0x1F => false,
        // DEL and the C1 control block.
        0x7F..=0x9F => false,
        // Surrogates only have meaning in UTF-16.
        0xD800..=0xDFFF => false,
        // Arabic Presentation Forms-A noncharacters.
        0xFDD0..=0xFDEF => false,
        // The last two code points of each plane are noncharacters
        // (anything whose low 16 bits are FFFE or FFFF).
        _ if (cp & 0xFFFE) == 0xFFFE => false,
        // Beyond the end of the Unicode code space.
        _ if cp > 0x10FFFF => false,
        _ => true,
    }
}
/// Returns `true` when `cp` was encoded with more bytes than necessary.
///
/// `len` is the length in bytes of the sequence that produced `cp`. A
/// code point is overlong when it is smaller than the first code point that
/// actually requires a sequence of that length.
///
/// # Panics
///
/// Panics if `len` is not 2, 3 or 4.
fn is_overlong_utf8(cp: u32, len: usize) -> bool {
    // Smallest code point that needs `len` bytes.
    let min_cp = match len {
        2 => 0x80,
        3 => 0x800,
        4 => 0x1_0000,
        _ => unreachable!("multi-byte UTF-8 sequences are 2 to 4 bytes long, got {}", len),
    };
    cp < min_cp
}
fn main() { | |
let mut state = State::Normal; | |
let mut stdin = stdin().bytes(); | |
let mut stdout = stdout(); | |
while let Some(Ok(b)) = stdin.next() { | |
state = match b { | |
// ASCII | |
0x09..=0x0d | 0x20..=0x7e => { | |
stdout.write(&[b]).unwrap(); | |
State::Normal | |
}, | |
// UTF-8 (start byte for 2-byte sequence) | |
0b110_00000..=0b110_11111 => State::Utf8(2, 1, b as u32 & 0b11111), | |
// UTF-8 (start byte for 3-byte sequence) | |
0b1110_0000..=0b1110_1111 => State::Utf8(3, 2, b as u32 & 0b1111), | |
// UTF-8 (start byte for 4-byte sequence) | |
0b11110_000..=0b11110_111 => State::Utf8(4, 3, b as u32 & 0b111), | |
// UTF-8 (continuation byte) | |
0b10_000000..=0b10_111111 => match state { | |
State::Normal => State::Normal, | |
State::Utf8(len, rem, cp) => { | |
let rem = rem - 1; | |
let cp = (cp << 6) | ((b as u32) & 0b00_111111); | |
if rem > 0 { | |
// Still more bytes to read, just accumulate. | |
State::Utf8(len, rem, cp) | |
} else { | |
if is_printable_codepoint(cp) && !is_overlong_utf8(cp, len) { | |
// We can use from_u32_unchecked() because is_printable_codepoint() | |
// already checked that the code point wouldn't be greater than | |
// U+10FFFF, and also that it wouldn't be in the surrogate pair | |
// range. Those are the checks that Rust's std::char::from_u32() | |
// would perform. | |
let c = unsafe { char::from_u32_unchecked(cp) }; | |
let mut buf = [0; 4]; | |
c.encode_utf8(&mut buf); | |
stdout.write(&buf[..len]).unwrap(); | |
} | |
State::Normal | |
} | |
} | |
}, | |
_ => State::Normal, | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment