Created
August 24, 2024 03:13
-
-
Save milkey-mouse/d90982ee9e2d7d032f3e28e99cdbd858 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
use std::{error::Error, fmt::Display}; | |
/// Error returned when the decoded byte stream is not well-formed UTF-8.
///
/// The type carries no payload; it only signals that the decoder reached
/// its reject state.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Utf8Error;

impl Display for Utf8Error {
    /// Writes the fixed, user-facing message `invalid utf-8`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // The message is constant, so no formatting machinery is needed.
        f.write_str("invalid utf-8")
    }
}

impl Error for Utf8Error {}
/// Automaton state meaning "at a character boundary": either nothing has been
/// consumed yet or the last byte completed a well-formed sequence.
const ACCEPT: u8 = 0;
/// Automaton state meaning "the input is not well-formed UTF-8".
const REJECT: u8 = 12;
/// Combined lookup table for the UTF-8 decoding automaton.
///
/// * Entries `0..256` map each possible byte value to a character class
///   in `0..=11`.
/// * Entries `256..364` form the transition table: 9 rows of 12 columns.
///   A state is stored as the offset of its row (a multiple of 12), so the
///   next state is found at `256 + state + class`.
#[rustfmt::skip]
const UTF8D: [u8; 364] = [
    // The first part of the table maps bytes to character classes, which
    // reduces the size of the transition table and creates bitmasks.
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
     7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
     8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

    // The second part is a transition table that maps a combination
    // of a state of the automaton and a character class to a state.
     0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
    12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
    12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
    12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
    12,36,12,12,12,12,12,12,12,12,12,12,
];
/// Incremental UTF-8 decoder that is fed one byte at a time.
///
/// The decoder keeps the state of the table-driven automaton together with
/// the code point bits gathered so far, so a multi-byte sequence may be split
/// across any number of calls.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Utf8Decoder {
    /// Code point bits accumulated from the bytes of the current sequence.
    codep: u32,
    /// Current automaton state, stored as a row offset into the transition table.
    state: u8,
}
impl Utf8Decoder { | |
pub fn new() -> Self { | |
Utf8Decoder { | |
state: ACCEPT, | |
codep: 0, | |
} | |
} | |
#[inline] | |
pub fn decode(&mut self, byte: u8) -> Result<Option<char>, Utf8Error> { | |
let byte = byte as u32; | |
let typ = UTF8D[byte as usize] as u32; | |
self.codep = if self.state != ACCEPT { | |
(byte & 0x3f) | (self.codep << 6) | |
} else { | |
(0xff >> typ) & byte | |
}; | |
self.state = UTF8D[256 + self.state as usize + typ as usize]; | |
match self.state { | |
ACCEPT => unsafe { Ok(Some(char::from_u32_unchecked(self.codep))) }, | |
REJECT => Err(Utf8Error), | |
_ => Ok(None), | |
} | |
} | |
#[inline] | |
pub fn valid(&self) -> bool { | |
self.state == ACCEPT | |
} | |
#[inline] | |
pub fn invalid(&self) -> bool { | |
self.state == REJECT | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment