Last active
January 10, 2020 08:18
-
-
Save madig/c56d88abc71705235309cbdaee882035 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
fn quoted_string_literal(&mut self, quote_char: u8) -> Result<Option<Event>, Error> { | |
let mut acc: Vec<u8> = Vec::new(); | |
let mut cur_char = quote_char; | |
while { | |
match self.peeked_char { | |
// do not stop if the quote is escaped | |
Some(c) => c != quote_char || cur_char == b'\\', | |
None => false, | |
} | |
} { | |
// consuming the string itself | |
match self.advance()? { | |
Some(c) => { | |
cur_char = c; | |
// interpret escaped char | |
if cur_char == b'\\' { | |
match self.advance()? { | |
Some(c) => match c as char { | |
'0'..='7' => match self.advance()? { | |
// read 3 chars | |
Some(c2) => match c as char { | |
'0'..='7' => match self.advance()? { | |
Some(c3) => match c as char { | |
'0'..='7' => { | |
let p1 = (c as u8) - b'0'; | |
let p2 = (c2 as u8) - b'0'; | |
let p3 = (c3 as u8) - b'0'; | |
let num: u8 = (((p1 << 3) + p2) << 3) + p3; | |
if num < 128 { | |
// ASCII character | |
acc.push(num) | |
} else { | |
// character in NextStep encoding | |
let uchar = NEXT_STEP_DECODING_TABLE | |
[(num - 128) as usize]; | |
let mut uchar_utf8 = [0u8; 4]; | |
for char_byte in uchar | |
.encode_utf8(&mut uchar_utf8) | |
.bytes() | |
{ | |
acc.push(char_byte) | |
} | |
}; | |
cur_char = 0; // clear slash so that e.g. "\377" does not scan past the closing quote. | |
} | |
_ => { | |
return Err(self | |
.error(ErrorKind::InvalidUtf8AsciiStream)) | |
} | |
}, | |
None => { | |
return Err(self.error(ErrorKind::UnclosedString)) | |
} | |
}, | |
_ => { | |
return Err( | |
self.error(ErrorKind::InvalidUtf8AsciiStream) | |
) | |
} | |
}, | |
None => return Err(self.error(ErrorKind::UnclosedString)), | |
}, | |
'U' => { | |
let mut uchar_num: u32 = 0; | |
for _ in 1..=4 { | |
match self.advance()? { | |
Some(c) => cur_char = c, | |
None => { | |
return Err( | |
self.error(ErrorKind::InvalidUtf8AsciiStream) | |
) | |
} | |
} | |
if !cur_char.is_ascii_hexdigit() { | |
return Err( | |
self.error(ErrorKind::InvalidUtf8AsciiStream) | |
); | |
} | |
uchar_num = (uchar_num << 4) | |
+ (cur_char as char).to_digit(16).unwrap(); | |
} | |
let mut uchar_utf8 = [0u8; 4]; | |
for char_byte in char::from_u32(uchar_num) | |
.unwrap() | |
.encode_utf8(&mut uchar_utf8) | |
.bytes() | |
{ | |
acc.push(char_byte) | |
} | |
} | |
'a' => acc.push(0x07), | |
'b' => acc.push(0x08), | |
'f' => acc.push(0x0c), | |
'n' => acc.push('\n' as u8), | |
'r' => acc.push('\r' as u8), | |
't' => acc.push('\t' as u8), | |
'v' => acc.push(0x0B), | |
'"' => acc.push('"' as u8), | |
'\n' => acc.push('\n' as u8), | |
_ => acc.push(c as u8), | |
}, | |
None => return Err(self.error(ErrorKind::UnclosedString)), | |
}; | |
} else { | |
acc.push(cur_char) | |
} | |
} | |
None => return Err(self.error(ErrorKind::UnclosedString)), | |
}; | |
} | |
// Match the closing quote. | |
match self.advance()? { | |
Some(c) => { | |
if c == quote_char { | |
let string_literal = String::from_utf8(acc) | |
.map_err(|_e| self.error(ErrorKind::InvalidUtf8AsciiStream))?; | |
Ok(Some(Event::String(string_literal))) | |
} else { | |
Err(self.error(ErrorKind::UnclosedString)) | |
} | |
} | |
None => Err(self.error(ErrorKind::UnclosedString)), | |
} | |
} | |
// | |
b'"' | b'\'' => return self.quoted_string_literal(c), | |
// | |
/// Lookup table translating the upper half of the NextStep encoding into
/// Unicode scalar values.
///
/// Octal escapes inside quoted plist strings denote NextStep bytes. Bytes
/// `0x00..=0x7F` coincide with ASCII and need no translation, so only the
/// bytes `0x80..=0xFF` are stored here: entry `i` is the character for
/// NextStep byte `0x80 + i`. The last two entries are U+FFFD (replacement
/// character).
///
/// Source: ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/NEXT/NEXTSTEP.TXT
#[rustfmt::skip]
static NEXT_STEP_DECODING_TABLE: [char; 128] = [
    // 0x80
    '\u{A0}', '\u{C0}', '\u{C1}', '\u{C2}', '\u{C3}', '\u{C4}', '\u{C5}', '\u{C7}',
    // 0x88
    '\u{C8}', '\u{C9}', '\u{CA}', '\u{CB}', '\u{CC}', '\u{CD}', '\u{CE}', '\u{CF}',
    // 0x90
    '\u{D0}', '\u{D1}', '\u{D2}', '\u{D3}', '\u{D4}', '\u{D5}', '\u{D6}', '\u{D9}',
    // 0x98
    '\u{DA}', '\u{DB}', '\u{DC}', '\u{DD}', '\u{DE}', '\u{B5}', '\u{D7}', '\u{F7}',
    // 0xA0
    '\u{A9}', '\u{A1}', '\u{A2}', '\u{A3}', '\u{2044}', '\u{A5}', '\u{192}', '\u{A7}',
    // 0xA8
    '\u{A4}', '\u{2019}', '\u{201C}', '\u{AB}', '\u{2039}', '\u{203A}', '\u{FB01}', '\u{FB02}',
    // 0xB0
    '\u{AE}', '\u{2013}', '\u{2020}', '\u{2021}', '\u{B7}', '\u{A6}', '\u{B6}', '\u{2022}',
    // 0xB8
    '\u{201A}', '\u{201E}', '\u{201D}', '\u{BB}', '\u{2026}', '\u{2030}', '\u{AC}', '\u{BF}',
    // 0xC0
    '\u{B9}', '\u{2CB}', '\u{B4}', '\u{2C6}', '\u{2DC}', '\u{AF}', '\u{2D8}', '\u{2D9}',
    // 0xC8
    '\u{A8}', '\u{B2}', '\u{2DA}', '\u{B8}', '\u{B3}', '\u{2DD}', '\u{2DB}', '\u{2C7}',
    // 0xD0
    '\u{2014}', '\u{B1}', '\u{BC}', '\u{BD}', '\u{BE}', '\u{E0}', '\u{E1}', '\u{E2}',
    // 0xD8
    '\u{E3}', '\u{E4}', '\u{E5}', '\u{E7}', '\u{E8}', '\u{E9}', '\u{EA}', '\u{EB}',
    // 0xE0
    '\u{EC}', '\u{C6}', '\u{ED}', '\u{AA}', '\u{EE}', '\u{EF}', '\u{F0}', '\u{F1}',
    // 0xE8
    '\u{141}', '\u{D8}', '\u{152}', '\u{BA}', '\u{F2}', '\u{F3}', '\u{F4}', '\u{F5}',
    // 0xF0
    '\u{F6}', '\u{E6}', '\u{F9}', '\u{FA}', '\u{FB}', '\u{131}', '\u{FC}', '\u{FD}',
    // 0xF8
    '\u{142}', '\u{F8}', '\u{153}', '\u{DF}', '\u{FE}', '\u{FF}', '\u{FFFD}', '\u{FFFD}',
];
// | |
/// Feeds a dictionary whose values exercise the quoted-string escape handling
/// (escaped quotes, control-character escapes, octal escapes, escaped
/// backslashes and `\U` surrogate pairs) through `AsciiReader` and compares
/// the produced event stream with the expected decoded strings.
#[test]
fn escaped_sequences_in_strings() {
    let source = r#"{
key0 = "";
key1 = "va\"lue";
key2 = 'va"lue';
key3 = "va\a\b\f\n\r\t\v\"\nlue";
key4 = "a\012b\200\377";
key5 = "\\UD83D\\UDCA9";
key6 = "\UD83D\UDCA9";
}"#;
    // Shorthand for building a string event from a literal.
    let text = |s: &str| String(s.to_owned());

    let reader = AsciiReader::new(Cursor::new(source.as_bytes()));
    let events: Vec<Event> = reader.map(|event| event.unwrap()).collect();

    let expected = &[
        StartDictionary(None),
        text("key0"),
        text(""),
        text("key1"),
        text(r#"va"lue"#),
        text("key2"),
        text(r#"va"lue"#),
        text("key3"),
        text("va\u{07}\u{08}\u{0C}\n\r\t\u{0B}\"\nlue"),
        text("key4"),
        text("a\nb\u{A0}\u{FFFD}"),
        text("key5"),
        text("\\UD83D\\UDCA9"),
        text("key6"),
        text("💩"),
        EndCollection,
    ];
    assert_eq!(events, expected);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment