Skip to content

Instantly share code, notes, and snippets.

@Plecra
Last active June 14, 2020 13:24
Show Gist options
  • Save Plecra/50fd6820fee61e28c2e4121a98ad8d3e to your computer and use it in GitHub Desktop.
Save Plecra/50fd6820fee61e28c2e4121a98ad8d3e to your computer and use it in GitHub Desktop.
A Rust Modified UTF-8 decoder
/// Reasons a byte slice could not be decoded as Modified UTF-8.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ModifiedUtf8Error {
    /// The input ended before an encoded sequence was complete.
    Incomplete,
    /// The input contains bytes that are not valid Modified UTF-8.
    Malformed,
}

impl std::fmt::Display for ModifiedUtf8Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(match self {
            ModifiedUtf8Error::Incomplete => "incomplete Modified UTF-8 sequence at end of input",
            ModifiedUtf8Error::Malformed => "malformed Modified UTF-8 sequence",
        })
    }
}

impl std::error::Error for ModifiedUtf8Error {}
/// Decode a Modified UTF-8 encoded string.
///
/// This encoding is used by Java for fast serialization of its strings, and
/// has some minor deviations from standard UTF-8:
///
/// * `U+0000` is written as the two-byte sequence `0xC0 0x80`.
/// * Supplementary code points are written as a UTF-16 surrogate pair, each
///   surrogate encoded as its own three-byte sequence (six bytes in total).
///
/// Any stretch of the input that is already valid standard UTF-8 is copied
/// through unchanged.
///
/// # Errors
///
/// * [`ModifiedUtf8Error::Incomplete`] if the input stops part-way through a
///   sequence that further bytes could still complete.
/// * [`ModifiedUtf8Error::Malformed`] if the input contains bytes that are
///   neither valid UTF-8 nor one of the two Modified UTF-8 forms above
///   (this includes unpaired surrogates).
pub fn from_modified_utf8(mut bytes: &[u8]) -> Result<String, ModifiedUtf8Error> {
    // Decoding never grows the data (an encoded NUL shrinks from 2 bytes to 1,
    // a surrogate pair from 6 bytes to 4), so the input length is an upper
    // bound. Note: no `- 1` here, which would underflow on empty input.
    let mut string = String::with_capacity(bytes.len());
    loop {
        // Treat it as normal UTF-8 - since the encodings are so similar, we can
        // reuse the original algorithm and only step in where it gives up.
        let err = match std::str::from_utf8(bytes) {
            Ok(s) => {
                // We've reached the end of the string
                string.push_str(s);
                return Ok(string);
            }
            Err(e) => e,
        };

        let (valid, tail) = bytes.split_at(err.valid_up_to());
        // SAFETY: `Utf8Error::valid_up_to` is documented as the index up to
        // which `from_utf8` succeeded, so `valid` is known to be valid UTF-8.
        string.push_str(unsafe { std::str::from_utf8_unchecked(valid) });

        // `tail` starts with bytes that aren't UTF-8. If they're valid
        // Modified UTF-8, decode them and carry on with what follows.
        bytes = match tail {
            // Encoded NUL.
            [0b1100_0000, 0b1000_0000, rest @ ..] => {
                string.push('\0');
                rest
            }
            // The surrogate code units match the bit mask 0b1101_1yxx_xxxx_xxxx,
            // where `y` is low for the high surrogate and high for the low one.
            // Encoded in UTF-8, that's 0b1110_1101 0b101y_xxxx 0b10xx_xxxx.
            // They must come as a (high, low) pair, so we can match on the
            // whole pattern.
            [0b1110_1101, second @ 0b1010_0000..=0b1010_1111, third @ 0b1000_0000..=0b1011_1111, 0b1110_1101, fifth @ 0b1011_0000..=0b1011_1111, sixth @ 0b1000_0000..=0b1011_1111, rest @ ..] =>
            {
                // Extract the 10 payload bits of each surrogate.
                let high_surrogate =
                    (u32::from(*second & 0b0000_1111) << 6) | u32::from(*third & 0b0011_1111);
                let low_surrogate =
                    (u32::from(*fifth & 0b0000_1111) << 6) | u32::from(*sixth & 0b0011_1111);
                // Combine the surrogate pair. Both halves are 10 bits wide, so
                // the result lies in 0x10000..=0x10FFFF and the conversion
                // cannot fail; the error mapping only avoids `unsafe`.
                let code_point = 0x10000 + ((high_surrogate << 10) | low_surrogate);
                let chr =
                    std::char::from_u32(code_point).ok_or(ModifiedUtf8Error::Malformed)?;
                string.push(chr);
                rest
            }
            // `error_len() == None` means the input ended inside a standard
            // UTF-8 sequence; the helper covers the Modified-only sequences.
            _ if err.error_len().is_none() || is_truncated_modified_sequence(tail) => {
                return Err(ModifiedUtf8Error::Incomplete)
            }
            _ => return Err(ModifiedUtf8Error::Malformed),
        };
    }
}

/// Returns `true` if `tail` is a proper prefix of an encoded NUL or of an
/// encoded surrogate pair, i.e. more input could still make it valid.
fn is_truncated_modified_sequence(tail: &[u8]) -> bool {
    matches!(
        tail,
        [0xC0]
            | [0xED]
            | [0xED, 0xA0..=0xAF]
            | [0xED, 0xA0..=0xAF, 0x80..=0xBF]
            | [0xED, 0xA0..=0xAF, 0x80..=0xBF, 0xED]
            | [0xED, 0xA0..=0xAF, 0x80..=0xBF, 0xED, 0xB0..=0xBF]
    )
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment