Last active
June 14, 2020 13:24
-
-
Save Plecra/50fd6820fee61e28c2e4121a98ad8d3e to your computer and use it in GitHub Desktop.
A Rust Modified UTF-8 decoder
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// Error returned by [`from_modified_utf8`] when the input cannot be decoded.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ModifiedUtf8Error {
    /// The input ends in the middle of an encoded character; it is a proper
    /// prefix of something that could still become valid if more bytes followed.
    Incomplete,
    /// The input contains a byte sequence that is neither valid UTF-8 nor one
    /// of the two Modified UTF-8 extensions handled by the decoder.
    Malformed,
}

impl std::fmt::Display for ModifiedUtf8Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(match self {
            Self::Incomplete => "incomplete Modified UTF-8 sequence at end of input",
            Self::Malformed => "malformed Modified UTF-8 sequence",
        })
    }
}

impl std::error::Error for ModifiedUtf8Error {}

/// Decode a Modified UTF-8 encoded string.
///
/// This encoding is used by Java for fast serialization of its strings, and
/// has some minor deviations from standard UTF-8:
///
/// * U+0000 is encoded as the overlong two-byte sequence `0xC0 0x80`.
/// * Supplementary code points are encoded as a UTF-16 surrogate pair, with
///   each surrogate individually encoded as a three-byte sequence.
///
/// Everything that is already valid standard UTF-8 is copied through unchanged.
///
/// # Errors
///
/// * [`ModifiedUtf8Error::Incomplete`] if the input stops part-way through an
///   encoded character (including part-way through a surrogate pair).
/// * [`ModifiedUtf8Error::Malformed`] if the input contains any other invalid
///   byte sequence, such as an unpaired surrogate.
pub fn from_modified_utf8(mut bytes: &[u8]) -> Result<String, ModifiedUtf8Error> {
    // The decoded text is never longer than the input: the two-byte NUL
    // shrinks to one byte and a six-byte surrogate pair shrinks to four, so
    // the input length is a safe upper bound (and is well-defined for `b""`).
    let mut string = String::with_capacity(bytes.len());
    loop {
        // Treat it as normal UTF-8 - since the encodings are so similar, we can
        // reuse the original algorithm for the bulk of the work.
        let e = match std::str::from_utf8(bytes) {
            Ok(s) => {
                // We've reached the end of the string.
                string.push_str(s);
                return Ok(string);
            }
            Err(e) => e,
        };

        let (valid, invalid) = bytes.split_at(e.valid_up_to());
        // SAFETY: `Utf8Error::valid_up_to` guarantees that
        // `bytes[..valid_up_to()]` is valid UTF-8.
        string.push_str(unsafe { std::str::from_utf8_unchecked(valid) });

        // `invalid` starts with bytes that aren't standard UTF-8 (it is never
        // empty here). If they're valid Modified UTF-8, decode them.
        match invalid {
            // Overlong encoding of U+0000.
            [0b1100_0000, 0b1000_0000, rest @ ..] => {
                string.push('\0');
                bytes = rest;
            }
            // The surrogate code units match the bit mask 0b1101_1yxx_xxxx_xxxx,
            // where `y` is low for the high surrogate and high for the low one.
            // Encoded in UTF-8, that's 0b1110_1101 0b101y_xxxx 0b10xx_xxxx.
            // They must come as a (high, low) pair, so we can match on the
            // whole pattern.
            [
                0b1110_1101,
                second @ 0b1010_0000..=0b1010_1111,
                third @ 0b1000_0000..=0b1011_1111,
                0b1110_1101,
                fifth @ 0b1011_0000..=0b1011_1111,
                sixth @ 0b1000_0000..=0b1011_1111,
                rest @ ..
            ] => {
                // Extract the 10 payload bits carried by each surrogate.
                let high_surrogate =
                    (u32::from(*second & 0b0000_1111) << 6) | u32::from(*third & 0b0011_1111);
                let low_surrogate =
                    (u32::from(*fifth & 0b0000_1111) << 6) | u32::from(*sixth & 0b0011_1111);
                // Combine the surrogate pair. Both halves are 10 bits wide, so
                // the result lies in 0x10000..=0x10FFFF and the conversion
                // cannot actually fail; the checked call just avoids `unsafe`.
                let code_point = 0x10000 + ((high_surrogate << 10) | low_surrogate);
                let chr =
                    std::char::from_u32(code_point).ok_or(ModifiedUtf8Error::Malformed)?;
                string.push(chr);
                bytes = rest;
            }
            // The input stops part-way through one of the Modified UTF-8
            // sequences above. Plain UTF-8 validation reports these as hard
            // errors, so they have to be recognised explicitly.
            [0b1100_0000]
            | [0b1110_1101, 0b1010_0000..=0b1010_1111]
            | [0b1110_1101, 0b1010_0000..=0b1010_1111, 0b1000_0000..=0b1011_1111]
            | [0b1110_1101, 0b1010_0000..=0b1010_1111, 0b1000_0000..=0b1011_1111, 0b1110_1101]
            | [
                0b1110_1101,
                0b1010_0000..=0b1010_1111,
                0b1000_0000..=0b1011_1111,
                0b1110_1101,
                0b1011_0000..=0b1011_1111
            ] => return Err(ModifiedUtf8Error::Incomplete),
            // `error_len() == None` means a standard UTF-8 sequence was cut
            // off by the end of the input.
            _ if e.error_len().is_none() => return Err(ModifiedUtf8Error::Incomplete),
            _ => return Err(ModifiedUtf8Error::Malformed),
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment