-
-
Save ssokolow/0d9f5c5e4a8a37a962875af205bcc723 to your computer and use it in GitHub Desktop.
/* POSIX paths in JSON via escaping which
doesn't alter valid UTF-8 paths.
The trick is recognizing that JSON can store binary nulls in strings
but nulls are the only character that can't occur in POSIX paths,
so we can use it as an escape character that won't change how existing
serialized paths get interpreted.
Copyright 2018-2020, Stephan Sokolow
This code is released under your choice of the MIT or Apache-2.0 licenses.
https://opensource.org/licenses/MIT
https://opensource.org/licenses/Apache-2.0
*/
use std::borrow::Cow; | |
use std::ffi::{OsStr, OsString}; | |
use std::str; | |
// Platform-specific imports | |
use std::os::unix::ffi::{OsStrExt, OsStringExt}; | |
/// Escape an OS path into something which can safely be stored in a valid UTF-8 string | |
fn escape_path<P: AsRef<OsStr> + ?Sized>(path: &P) -> Cow<'_, str> { | |
escape_path_inner(path.as_ref()) | |
} | |
/// Inner function for `escape_path` to avoid the risk of monomorphization bloat
///
/// Escaping rules (with `\0` acting as the escape character):
///
/// * Valid UTF-8 text is passed through unchanged.
/// * A literal `\0` in the input becomes `\0\0`.
/// * Each byte that is not part of valid UTF-8 becomes `\0` followed by the `char` whose
///   code point equals the byte's value (e.g. `b"\xe7"` becomes `"\0\u{00e7}"`).
///
/// Returns `Cow::Borrowed` when the input is valid UTF-8 with no `\0` in it (no allocation)
/// and `Cow::Owned` whenever any escaping had to be performed.
///
/// Adapted from the example code on the `std::str::Utf8Error` rustdoc page
///
/// TODO: Support Windows... ideally in a way that results in the same conversion logic
///       as ntfs-3g uses.
///
/// # Panics
///
/// Only if `str::from_utf8` were to report error offsets that are inconsistent with the
/// slice it was given, which would be a bug in the standard library.
fn escape_path_inner(path: &OsStr) -> Cow<'_, str> {
    // In the by-far most common case the input needs no escaping at all, so hand back a
    // borrow of the caller's data instead of paying for a copy.
    //
    // (Valid UTF-8 that contains `\0` deliberately falls through to the general loop below:
    // the original author's Criterion runs showed a dedicated `str::replace` branch for that
    // case to be noticeably slower.)
    if let Some(path_str) = path.to_str() {
        if !path_str.contains('\0') {
            return Cow::Borrowed(path_str);
        }
    }

    // In the very uncommon case, build a copy of the string with the problem bytes escaped
    let mut input = path.as_bytes();

    // Preallocate for four escapes
    // (Just a guess, based on four mojibake'd latin1 bytes, two UTF-16 surrogates,
    // or one UTF-32 character)
    let mut result = String::with_capacity(path.len().saturating_add(4));

    loop {
        match str::from_utf8(input) {
            Ok(valid) => {
                // Everything that is left is valid UTF-8, so this is the final span
                push_escaping_nulls(&mut result, valid);
                break;
            }
            Err(error) => {
                // Pass through the valid span that precedes the problem bytes
                let (valid, after_valid) = input.split_at(error.valid_up_to());

                // Allowed because `valid_up_to()` is documented as the length of the prefix
                // which *is* valid UTF-8, so this second check cannot fail.
                #[allow(clippy::expect_used)]
                let valid_str = str::from_utf8(valid)
                    .expect("from_utf8 on left-hand output of valid_up_to()");
                push_escaping_nulls(&mut result, valid_str);

                // `error_len()` is `None` when the input ended in the middle of a
                // sequence, in which case everything that remains is treated as invalid.
                let invalid_len = error.error_len().unwrap_or(after_valid.len());
                let (invalid, rest) = after_valid.split_at(invalid_len);

                // Escape the invalid bytes one by one
                for &byte in invalid {
                    result.push('\0');
                    result.push(char::from(byte));
                }

                // Step forward to the next span. When nothing is left, the next pass sees
                // an empty (and therefore valid) slice and ends the loop.
                input = rest;
            }
        }
    }

    Cow::Owned(result)
}

/// Append `valid` to `out`, doubling every `\0` so that literal nulls round-trip and can be
/// told apart from `\0` used as the escape character
fn push_escaping_nulls(out: &mut String, valid: &str) {
    for u_char in valid.chars() {
        if u_char == '\0' {
            out.push('\0');
        }
        out.push(u_char);
    }
}
/// Reverse `escape_path`, turning its output back into an OS string
///
/// Input without any `\0` is returned as a borrowed `OsStr` with no copying. Otherwise every
/// `\0` is treated as an escape marker: the marker is dropped and the single character after
/// it is emitted as one raw byte, while all other text is copied through as UTF-8.
///
/// Behaviour on input that `escape_path` would never produce:
///
/// * a `\0` that is the very last character is silently dropped;
/// * an escaped character above U+00FF is truncated to its low 8 bits by the `as u8` cast.
///
/// (`allow(dead_code)` because its purpose is to exist on standby and pass unit tests,
/// awaiting the need to unescape my emergency records.)
#[allow(dead_code)]
fn unescape_path(path: &str) -> Cow<'_, OsStr> {
    // In the by-far most common case there is nothing to unescape, so just borrow.
    // (According to the original author's Criterion runs, skipping the unescaping code here
    // gives a 360%+ speed-up in the common case with no statistically significant change in
    // the case where things need to be unescaped.)
    if !path.contains('\0') {
        return Cow::Borrowed(OsStr::new(path));
    }

    let mut bytes: Vec<u8> = Vec::with_capacity(path.len());
    let mut remaining = path;

    // Walk from one escape marker to the next, copying the literal text in between verbatim
    while let Some(marker_at) = remaining.find('\0') {
        let (literal, escaped) = remaining.split_at(marker_at);
        bytes.extend_from_slice(literal.as_bytes());

        let mut tail = escaped.chars();
        // Discard the `\0` marker itself
        let _ = tail.next();
        // The character after the marker stands for exactly one raw byte
        if let Some(raw) = tail.next() {
            bytes.push(raw as u8);
        }
        remaining = tail.as_str();
    }

    // Whatever follows the last escape is plain text
    bytes.extend_from_slice(remaining.as_bytes());

    Cow::Owned(OsString::from_vec(bytes))
}
#[cfg(test)]
mod tests {
    use super::{escape_path, unescape_path};
    use std::ffi::OsString;
    use std::os::unix::ffi::OsStringExt;

    /// Pairs of (raw path bytes, expected output of `escape_path`)
    const TEST_STRINGS: &[(&[u8], &str)] = &[
        // all valid utf-8
        (b"string with no invalid utf-8", "string with no invalid utf-8"),
        // typical string with invalid utf-8
        (b"/un/fichier/fran\xe7ais", "/un/fichier/fran\0\u{00e7}ais"),
        // starting with invalid utf-8
        (b"\xe7a va", "\0\u{00e7}a va"),
        // invalid span length > 1
        (b"foo\xe7\xe7bar", "foo\0\u{00e7}\0\u{00e7}bar"),
        // only invalid characters
        (b"\xe7\xe7", "\0\u{00e7}\0\u{00e7}"),
        // empty string
        (b"", ""),
        // ending with invalid utf-8 less than 3 characters (see `Utf8Error::error_len`)
        (b"foo\xe7", "foo\0\u{00e7}"),
        (b"foo\xe7\xe7", "foo\0\u{00e7}\0\u{00e7}"),
        // ending with invalid utf-8 more than 3 characters (see `Utf8Error::error_len`)
        (b"foo\xe7\xe7\xe7\xe7", "foo\0\u{00e7}\0\u{00e7}\0\u{00e7}\0\u{00e7}"),
        // all valid utf-8, but with nulls
        (b"\0string with no\0\0invalid utf-8\0", "\0\0string with no\0\0\0\0invalid utf-8\0\0"),
    ];

    /// Build an owned OS string from raw bytes
    fn os_string_from(raw: &[u8]) -> OsString {
        OsString::from_vec(raw.to_vec())
    }

    /// `escape_path` must produce exactly the expected escaped text
    #[test]
    fn test_escape_path() {
        for &(raw, expected) in TEST_STRINGS {
            let original = os_string_from(raw);
            assert_eq!(escape_path(&original), expected);
        }
    }

    /// `unescape_path` must undo whatever `escape_path` produced
    #[test]
    fn test_unescape_path() {
        for &(raw, _) in TEST_STRINGS {
            let original = os_string_from(raw);
            let escaped = escape_path(&original);
            let restored = unescape_path(&escaped);
            assert_eq!(&restored, &original.as_os_str());
        }
    }

    /// Nulls, alone or mixed with invalid bytes, must escape as expected and round-trip
    #[test]
    fn test_null_round_tripping() {
        let cases: &[(&[u8], &str)] = &[
            (b"\0foo", "\0\0foo"),
            (b"foo\0bar", "foo\0\0bar"),
            (b"foo\0\0bar", "foo\0\0\0\0bar"),
            (b"foo\0", "foo\0\0"),
            (b"\0foo\0bar\xe7baz\0\0quux\0", "\0\0foo\0\0bar\0\u{00e7}baz\0\0\0\0quux\0\0"),
        ];

        for &(raw, expected_escaped) in cases {
            let original = os_string_from(raw);

            let escaped = escape_path(&original);
            assert_eq!(escaped, expected_escaped);

            let round_tripped = unescape_path(&escaped);
            assert_eq!(original, round_tripped);
        }
    }
}
Fixed. It now escapes `\0` as `\0\0` so it will...
- Successfully round-trip all valid `OsStr`/`OsString` contents.
- Leave all POSIX filesystem paths which are valid UTF-8 unchanged.
- Use `\0` as an escape character to store `b"\xe7"` as `\0\u{00e7}`.
- Encode `\0` as `\0\0` so it can be distinguished from use of `\0` as an escape character.
This should be fully backwards compatible with `serde_json`'s existing behaviour, since `serde_json` fails if a `Path` or `PathBuf` contains invalid UTF-8.
Super interested in this gist, as I am facing the same issue but with msgpack.
If I am not wrong, the `let mut input = path.as_bytes();` is Unix-specific. How do you intend to handle the path on Windows?
On second thought... maybe it does not need special handling on Windows, because I think Windows enforces UTF-16.
> If I am not wrong, the `let mut input = path.as_bytes();` is Unix-specific. How do you intend to handle the path on Windows?
>
> On second thought... maybe it does not need special handling on Windows, because I think Windows enforces UTF-16.
Windows allows un-paired surrogates, which are also forbidden in UTF-8. (For compatibility with filenames generated back when Unicode was expected to only be a 16-bit fixed-width encoding, and the encoding used was UCS-2 rather than UTF-16.)
The Windows equivalent to `as_bytes()` is `encode_wide()`, provided by the `std::os::windows::ffi::OsStrExt` trait.
The plan I keep not having time to enact is to look up how to generate such ill-formed paths if I borrow my brother's Windows 10 PC for a couple of minutes, put them on an NTFS-formatted flash drive, stick it into my Linux PC, and then replicate whatever behaviour the `ntfs-3g` NTFS driver implements for translating such filenames between the Linux and Windows worlds.
In the interest of interoperable, panic-free storage of POSIX paths in JSON or other UTF-8-requiring formats, I'm also willing to release this under other licenses if you need that.
Bear in mind that, as-is, it assumes it's receiving a path, so it does not escape `\0` as `\0\0`. I'm willing to add that if anyone wants something suitable for all `OsStr` and `OsString` values, where a `\0` is unlikely but can't be ruled out.