kugland · June 7, 2022 13:15
diff --git a/printable.rs b/printable.rs
 use std::io::{Read, Write, stdin, stdout};

 /// Decoder state carried from one input byte to the next while scanning
 /// the stream for UTF-8 sequences.
 enum State {
    /// Not inside a multi-byte sequence. This is the initial state, and the
    /// state returned to after an ASCII byte, an unrecognised byte, or a
    /// completed (accepted or rejected) multi-byte sequence.
    Normal,
    /// Inside a multi-byte sequence: a start byte has been seen and one or
    /// more continuation bytes are still expected.
    Utf8(
        /// Total number of bytes in the sequence (2, 3 or 4), as announced
        /// by the start byte.
        usize,
        /// Number of continuation bytes still left to read.
        usize,
        /// Code point accumulated so far: the payload bits of the start
        /// byte, shifted left by 6 for every continuation byte consumed.
        u32,
    ),
 }

 /// Returns `true` when `cp` is a code point this filter is willing to emit.
 ///
 /// A value is rejected when it is a surrogate, a Unicode noncharacter
 /// (cf. The Unicode Standard, Version 14.0, 23.7: "Noncharacters"), or
 /// lies beyond U+10FFFF; everything else is accepted.
 fn is_printable_codepoint(cp: u32) -> bool {
    // U+D800..=U+DFFF are reserved for UTF-16 surrogate pairs.
    let surrogate = (0xD800..=0xDFFF).contains(&cp);
    // The contiguous noncharacter range inside the Arabic presentation forms.
    let arabic_nonchar = (0xFDD0..=0xFDEF).contains(&cp);
    // The last two code points of every plane (xxFFFE and xxFFFF): clearing
    // bit 0 of the low 16 bits maps both of them onto 0xFFFE.
    let plane_tail = cp & 0xFFFE == 0xFFFE;
    // Nothing above U+10FFFF is a Unicode code point.
    let beyond_unicode = cp > 0x10FFFF;

    !(surrogate || arabic_nonchar || plane_tail || beyond_unicode)
 }

 /// Returns `true` when `cp` was encoded with more bytes than necessary,
 /// i.e. it is smaller than the smallest code point that requires a
 /// sequence of `len` bytes.
 ///
 /// # Panics
 ///
 /// Panics (via `unreachable!`) if `len` is not 2, 3 or 4.
 fn is_overlong_utf8(cp: u32, len: usize) -> bool {
    // Smallest code point that legitimately needs `len` bytes.
    let smallest_for_len: u32 = match len {
        2 => 0x80,
        3 => 0x800,
        4 => 0x10000,
        _ => unreachable!(),
    };
    cp < smallest_for_len
 }

 /// Copies `input` to `output`, keeping only printable ASCII
 /// (0x09..=0x0d and 0x20..=0x7e) and well-formed multi-byte UTF-8
 /// sequences whose code point passes `is_printable_codepoint` and is not
 /// an overlong encoding. Every other byte is dropped.
 ///
 /// # Errors
 ///
 /// Returns the first I/O error reported while reading `input` or writing
 /// `output`.
 fn copy_printable<R: Read, W: Write>(input: R, mut output: W) -> std::io::Result<()> {
    let mut state = State::Normal;
    for byte in input.bytes() {
        let b = byte?;
        state = match b {
            // ASCII
            0x09..=0x0d | 0x20..=0x7e => {
                // `write_all` rather than `write`: a bare `write` may accept
                // fewer bytes than requested and silently lose output.
                output.write_all(&[b])?;
                State::Normal
            }
            // UTF-8 (start byte for 2-byte sequence)
            0b110_00000..=0b110_11111 => State::Utf8(2, 1, u32::from(b) & 0b11111),
            // UTF-8 (start byte for 3-byte sequence)
            0b1110_0000..=0b1110_1111 => State::Utf8(3, 2, u32::from(b) & 0b1111),
            // UTF-8 (start byte for 4-byte sequence)
            0b11110_000..=0b11110_111 => State::Utf8(4, 3, u32::from(b) & 0b111),
            // UTF-8 (continuation byte)
            0b10_000000..=0b10_111111 => match state {
                // Stray continuation byte outside any sequence: drop it.
                State::Normal => State::Normal,
                State::Utf8(len, rem, cp) => {
                    let rem = rem - 1;
                    let cp = (cp << 6) | (u32::from(b) & 0b00_111111);
                    if rem > 0 {
                        // Still more bytes to read, just accumulate.
                        State::Utf8(len, rem, cp)
                    } else {
                        if is_printable_codepoint(cp) && !is_overlong_utf8(cp, len) {
                            // `char::from_u32` performs the surrogate and
                            // range checks itself, so no `unsafe` is needed;
                            // re-encoding yields exactly the bytes to emit.
                            if let Some(c) = char::from_u32(cp) {
                                let mut buf = [0; 4];
                                output.write_all(c.encode_utf8(&mut buf).as_bytes())?;
                            }
                        }
                        State::Normal
                    }
                }
            },
            // Anything else (other control bytes, 0x7f, 0xf8..=0xff) is
            // dropped and aborts any sequence in progress.
            _ => State::Normal,
        };
    }
    Ok(())
 }

 /// Filters standard input to standard output, letting through only
 /// printable ASCII and valid, printable UTF-8 sequences.
 ///
 /// On a read or write error a message is printed to standard error and
 /// the process exits with status 1.
 fn main() {
    // Lock both streams once instead of re-acquiring the lock per byte.
    let stdin_handle = stdin();
    let stdout_handle = stdout();
    let mut output = stdout_handle.lock();

    let result = copy_printable(stdin_handle.lock(), &mut output)
        // Flush explicitly so that a failure on the last buffered bytes
        // is reported rather than lost.
        .and_then(|()| output.flush());

    if let Err(err) = result {
        eprintln!("printable: {}", err);
        std::process::exit(1);
    }
 }
	use std::io::{Read, Write, stdin, stdout};

	/// Decoder state carried from one input byte to the next while scanning
	/// the stream for UTF-8 sequences.
	enum State {
	/// Not inside a multi-byte sequence. This is the initial state, and the
	/// state returned to after an ASCII byte, an unrecognised byte, or a
	/// completed (accepted or rejected) multi-byte sequence.
	Normal,
	/// Inside a multi-byte sequence: a start byte has been seen and one or
	/// more continuation bytes are still expected.
	Utf8(
	/// Total number of bytes in the sequence (2, 3 or 4), as announced
	/// by the start byte.
	usize,
	/// Number of continuation bytes still left to read.
	usize,
	/// Code point accumulated so far: the payload bits of the start
	/// byte, shifted left by 6 for every continuation byte consumed.
	u32,
	),
	}

	/// Decides whether the code point `cp` may be written to the output.
	///
	/// Surrogates, Unicode noncharacters (cf. The Unicode Standard,
	/// Version 14.0, 23.7: "Noncharacters") and values above U+10FFFF are
	/// refused; any other value is accepted.
	fn is_printable_codepoint(cp: u32) -> bool {
		// Beyond the end of the Unicode code space.
		if cp > 0x10FFFF {
			return false;
		}
		// Surrogate range, and the noncharacter range U+FDD0..=U+FDEF.
		if matches!(cp, 0xD800..=0xDFFF | 0xFDD0..=0xFDEF) {
			return false;
		}
		// The last two code points of each plane end in FFFE or FFFF;
		// accept only when the low 16 bits fall below that pair.
		(cp & 0xFFFF) < 0xFFFE
	}

	/// Reports whether `cp`, decoded from a sequence of `len` bytes, could
	/// have been encoded in fewer bytes (an overlong encoding).
	///
	/// # Panics
	///
	/// Panics (via `unreachable!`) if `len` is not 2, 3 or 4.
	fn is_overlong_utf8(cp: u32, len: usize) -> bool {
		// Each arm compares against the smallest code point that
		// genuinely needs that many bytes.
		match len {
			2 => cp < 0x80,
			3 => cp < 0x800,
			4 => cp < 0x10000,
			_ => unreachable!(),
		}
	}

	/// Copies `input` to `output`, keeping only printable ASCII
	/// (0x09..=0x0d and 0x20..=0x7e) and well-formed multi-byte UTF-8
	/// sequences whose code point passes `is_printable_codepoint` and is
	/// not an overlong encoding. Every other byte is dropped.
	///
	/// # Errors
	///
	/// Returns the first I/O error reported while reading `input` or
	/// writing `output`.
	fn copy_printable<R: Read, W: Write>(input: R, mut output: W) -> std::io::Result<()> {
		let mut state = State::Normal;
		for byte in input.bytes() {
			let b = byte?;
			state = match b {
				// ASCII
				0x09..=0x0d | 0x20..=0x7e => {
					// `write_all` rather than `write`: a bare `write` may
					// accept fewer bytes than requested and lose output.
					output.write_all(&[b])?;
					State::Normal
				}
				// UTF-8 (start byte for 2-byte sequence)
				0b110_00000..=0b110_11111 => State::Utf8(2, 1, u32::from(b) & 0b11111),
				// UTF-8 (start byte for 3-byte sequence)
				0b1110_0000..=0b1110_1111 => State::Utf8(3, 2, u32::from(b) & 0b1111),
				// UTF-8 (start byte for 4-byte sequence)
				0b11110_000..=0b11110_111 => State::Utf8(4, 3, u32::from(b) & 0b111),
				// UTF-8 (continuation byte)
				0b10_000000..=0b10_111111 => match state {
					// Stray continuation byte outside any sequence: drop it.
					State::Normal => State::Normal,
					State::Utf8(len, rem, cp) => {
						let rem = rem - 1;
						let cp = (cp << 6) | (u32::from(b) & 0b00_111111);
						if rem > 0 {
							// Still more bytes to read, just accumulate.
							State::Utf8(len, rem, cp)
						} else {
							if is_printable_codepoint(cp) && !is_overlong_utf8(cp, len) {
								// `char::from_u32` performs the surrogate
								// and range checks itself, so no `unsafe`
								// is needed; re-encoding yields exactly
								// the bytes to emit.
								if let Some(c) = char::from_u32(cp) {
									let mut buf = [0; 4];
									output.write_all(c.encode_utf8(&mut buf).as_bytes())?;
								}
							}
							State::Normal
						}
					}
				},
				// Anything else (other control bytes, 0x7f, 0xf8..=0xff)
				// is dropped and aborts any sequence in progress.
				_ => State::Normal,
			};
		}
		Ok(())
	}

	/// Filters standard input to standard output, letting through only
	/// printable ASCII and valid, printable UTF-8 sequences.
	///
	/// On a read or write error a message is printed to standard error and
	/// the process exits with status 1.
	fn main() {
		// Lock both streams once instead of re-acquiring the lock per byte.
		let stdin_handle = stdin();
		let stdout_handle = stdout();
		let mut output = stdout_handle.lock();

		let result = copy_printable(stdin_handle.lock(), &mut output)
			// Flush explicitly so that a failure on the last buffered
			// bytes is reported rather than lost.
			.and_then(|()| output.flush());

		if let Err(err) = result {
			eprintln!("printable: {}", err);
			std::process::exit(1);
		}
	}