Skip to content

Instantly share code, notes, and snippets.

@milkey-mouse
Created August 24, 2024 03:13
Show Gist options
  • Save milkey-mouse/d90982ee9e2d7d032f3e28e99cdbd858 to your computer and use it in GitHub Desktop.
Save milkey-mouse/d90982ee9e2d7d032f3e28e99cdbd858 to your computer and use it in GitHub Desktop.
// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
use std::{error::Error, fmt::Display};
/// Error reported when the bytes fed to the decoder are not well-formed UTF-8.
///
/// The type is a zero-sized marker: it carries no position or byte
/// information, only the fact that decoding failed. It is `Copy` and
/// comparable so that `Result<_, Utf8Error>` values can be checked with
/// `==` / `assert_eq!`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Utf8Error;

impl Display for Utf8Error {
    /// Writes the fixed message `invalid utf-8`.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        f.write_str("invalid utf-8")
    }
}

// No underlying cause to expose, so the default `Error` methods suffice.
impl Error for Utf8Error {}
/// DFA state meaning "at a character boundary": the decoder starts here and
/// returns here each time a complete code point has been decoded.
///
/// States are stored as pre-multiplied row offsets into the transition part
/// of `UTF8D` (the lookup is `UTF8D[256 + state + class]`), so this is row 0.
const ACCEPT: u8 = 0;
/// DFA state entered when the input is not valid UTF-8; `decode` reports
/// `Utf8Error` whenever the automaton lands here.
///
/// As a pre-multiplied offset, 12 selects the second 12-entry row of the
/// transition table, whose entries are all 12, so the state is absorbing.
const REJECT: u8 = 12;
/// Lookup table for Bjoern Hoehrmann's DFA-based UTF-8 decoder.
///
/// Layout (364 bytes total):
/// * entries `0..256`: character class (0..=11) of each possible input byte;
/// * entries `256..364`: transition table of 9 rows x 12 classes, indexed as
///   `UTF8D[256 + state + class]`, where `state` is already a row offset
///   (a multiple of 12) and the stored value is the next state.
#[rustfmt::skip]
const UTF8D: [u8; 364] = [
// The first part of the table maps bytes to character classes, which
// reduces the size of the transition table and is used to create bitmasks.
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
// The second part is a transition table that maps a combination
// of a state of the automaton and a character class to a state.
// Each group of 12 values below is one state's row (one entry per class).
0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
12,36,12,12,12,12,12,12,12,12,12,12,
];
/// Incremental UTF-8 decoder that consumes one byte at a time.
///
/// The decoder walks the DFA stored in `UTF8D`: every call to
/// [`Utf8Decoder::decode`] classifies the byte, folds its payload bits into
/// the code point being assembled and advances the automaton.
///
/// Once the automaton enters `REJECT` it stays there (the `REJECT` row of the
/// transition table only leads back to `REJECT`), so every later `decode`
/// call keeps failing until [`Utf8Decoder::reset`] is called or a fresh
/// decoder is created.
#[derive(Debug, Clone)]
pub struct Utf8Decoder {
    /// Payload bits accumulated so far for the sequence in progress.
    codep: u32,
    /// Current DFA state, stored as a row offset into the transition table.
    state: u8,
}

impl Default for Utf8Decoder {
    /// Equivalent to [`Utf8Decoder::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl Utf8Decoder {
    /// Creates a decoder positioned at a character boundary (`ACCEPT`).
    #[must_use]
    pub fn new() -> Self {
        Self {
            codep: 0,
            state: ACCEPT,
        }
    }

    /// Returns the decoder to its initial state, discarding any partially
    /// decoded sequence and clearing a previous rejection.
    #[inline]
    pub fn reset(&mut self) {
        *self = Self::new();
    }

    /// Feeds one byte to the decoder.
    ///
    /// Returns `Ok(Some(c))` when `byte` completes the character `c`, and
    /// `Ok(None)` when more bytes are needed to finish the current sequence.
    ///
    /// # Errors
    ///
    /// Returns [`Utf8Error`] when `byte` makes the input ill-formed. The
    /// decoder then remains in the rejecting state, so subsequent calls also
    /// fail until [`Utf8Decoder::reset`] is called.
    #[inline]
    pub fn decode(&mut self, byte: u8) -> Result<Option<char>, Utf8Error> {
        let class = UTF8D[usize::from(byte)];
        let bits = u32::from(byte);

        self.codep = if self.state == ACCEPT {
            // Lead (or ASCII) byte: the class doubles as the number of
            // high marker bits to mask off.
            (0xff_u32 >> class) & bits
        } else {
            // Continuation byte: append its low six payload bits.
            (bits & 0x3f) | (self.codep << 6)
        };

        // States are pre-multiplied row offsets, so no multiplication here.
        self.state = UTF8D[256 + usize::from(self.state) + usize::from(class)];

        match self.state {
            ACCEPT => {
                debug_assert!(char::from_u32(self.codep).is_some());
                // SAFETY: the DFA only returns to ACCEPT after a complete,
                // well-formed UTF-8 sequence. Hoehrmann's table rejects
                // overlong forms, surrogates (U+D800..=U+DFFF) and values
                // above U+10FFFF, so `codep` is a Unicode scalar value.
                Ok(Some(unsafe { char::from_u32_unchecked(self.codep) }))
            }
            REJECT => Err(Utf8Error),
            _ => Ok(None),
        }
    }

    /// Returns `true` when the decoder sits at a character boundary: no
    /// sequence is in progress and no error has been seen.
    #[inline]
    #[must_use]
    pub fn valid(&self) -> bool {
        self.state == ACCEPT
    }

    /// Returns `true` when the decoder has rejected the input.
    #[inline]
    #[must_use]
    pub fn invalid(&self) -> bool {
        self.state == REJECT
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment