Last active
August 31, 2025 08:18
-
-
Save micolous/e8ab00d741bc27fa3b1657ab9ce41199 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//! Workaround for <https://github.com/rust-bakery/nom/issues/1679>: an
//! `escaped_transform` whose control character is any `AsChar` value (e.g. a `u8`).
use core::marker::PhantomData; | |
use nom::{ | |
error::{ErrorKind, ParseError}, | |
AsChar, Err, ExtendInto, Input, IsStreaming, Mode, Needed, Offset, OutputMode, Parser, | |
}; | |
/// Matches a byte string with escaped characters. | |
/// | |
/// * The first argument matches the normal characters (it must not match the control character) | |
/// * The second argument is the control character (like `\` in most languages) | |
/// * The third argument matches the escaped characters and transforms them | |
/// | |
/// As an example, the chain `abc\tdef` could be `abc def` (it also consumes the control character) | |
/// | |
/// ```compile_fail | |
/// # use nom::{Err, error::ErrorKind, Needed, IResult}; | |
/// # use std::str::from_utf8; | |
/// use nom::bytes::streaming::{escaped_transform, tag}; | |
/// use nom::character::streaming::alpha1; | |
/// use nom::branch::alt; | |
/// use nom::combinator::value; | |
/// | |
/// fn parser(input: &[u8]) -> IResult<&[u8], Vec<u8>> { | |
/// escaped_transform( | |
/// alpha1, | |
/// b'\\', | |
/// alt(( | |
/// value(b"\\", tag(b"\\".as_slice())), | |
/// value(b"\"", tag(b"\"".as_slice())), | |
/// value(b"\n", tag(b"n".as_slice())), | |
/// )) | |
/// )(input) | |
/// } | |
/// | |
/// assert_eq!(parser(b"ab\\\"cd\""), Ok((b"\"".as_slice(), b"ab\"cd".to_vec()))); | |
/// ``` | |
/// | |
/// This is a modified version of [nom's `escaped_transform`][nom::bytes::escaped_transform] | |
/// which takes an `control_character` as `u8`, rather than `char`. | |
pub fn escaped_transform<I, Error, F, G, ExtendItem, Output>( | |
normal: F, | |
control_char: impl AsChar, | |
transform: G, | |
) -> impl Parser<I, Output = Output, Error = Error> | |
where | |
I: Clone + Offset + Input, | |
I: ExtendInto<Item = ExtendItem, Extender = Output>, | |
<F as Parser<I>>::Output: ExtendInto<Item = ExtendItem, Extender = Output>, | |
<G as Parser<I>>::Output: ExtendInto<Item = ExtendItem, Extender = Output>, | |
<I as Input>::Item: AsChar, | |
F: Parser<I, Error = Error>, | |
G: Parser<I, Error = Error>, | |
Error: ParseError<I>, | |
{ | |
EscapedTransform { | |
normal, | |
control_char, | |
transform, | |
e: PhantomData, | |
extend: PhantomData, | |
o: PhantomData, | |
} | |
} | |
/// Parser state behind [`escaped_transform`].
///
/// Holds the two sub-parsers and the control character; the `PhantomData`
/// fields carry the type parameters that no stored value mentions.
pub struct EscapedTransform<F, G, E, ExtendItem, Output, C> {
    /// Parser applied to the unescaped stretches of input.
    normal: F,
    /// Parser applied to the input immediately after a control character.
    transform: G,
    /// The character that introduces an escape sequence.
    control_char: C,
    /// Marker for the error type.
    e: PhantomData<E>,
    /// Marker for the item type appended to the output builder.
    extend: PhantomData<ExtendItem>,
    /// Marker for the output (builder) type.
    o: PhantomData<Output>,
}
impl<I, Error: ParseError<I>, F, G, ExtendItem, Output, C> Parser<I> | |
for EscapedTransform<F, G, Error, ExtendItem, Output, C> | |
where | |
I: Clone + Offset + Input, | |
I: ExtendInto<Item = ExtendItem, Extender = Output>, | |
<F as Parser<I>>::Output: ExtendInto<Item = ExtendItem, Extender = Output>, | |
<G as Parser<I>>::Output: ExtendInto<Item = ExtendItem, Extender = Output>, | |
<I as Input>::Item: AsChar, | |
C: AsChar, | |
F: Parser<I, Error = Error>, | |
G: Parser<I, Error = Error>, | |
Error: ParseError<I>, | |
{ | |
type Output = Output; | |
type Error = Error; | |
fn process<OM: OutputMode>( | |
&mut self, | |
input: I, | |
) -> nom::PResult<OM, I, Self::Output, Self::Error> { | |
let mut index = 0; | |
let mut res = OM::Output::bind(|| input.new_builder()); | |
while index < input.input_len() { | |
let current_len = input.input_len(); | |
let remainder = input.take_from(index); | |
match self.normal.process::<OM>(remainder.clone()) { | |
Ok((i2, o)) => { | |
res = OM::Output::combine(o, res, |o, mut res| { | |
o.extend_into(&mut res); | |
res | |
}); | |
if i2.input_len() == 0 { | |
if OM::Incomplete::is_streaming() { | |
return Err(Err::Incomplete(Needed::Unknown)); | |
} else { | |
let index = input.input_len(); | |
return Ok((input.take_from(index), res)); | |
} | |
} else if i2.input_len() == current_len { | |
return Ok((remainder, res)); | |
} else { | |
index = input.offset(&i2); | |
} | |
} | |
Err(Err::Error(_)) => { | |
// unwrap() should be safe here since index < $i.input_len() | |
if remainder.iter_elements().next().unwrap().as_char() | |
== self.control_char.as_char() | |
{ | |
let next = index + self.control_char.len(); | |
let input_len = input.input_len(); | |
if next >= input_len { | |
if OM::Incomplete::is_streaming() { | |
return Err(Err::Incomplete(Needed::Unknown)); | |
} else { | |
return Err(Err::Error(OM::Error::bind(|| { | |
Error::from_error_kind(remainder, ErrorKind::EscapedTransform) | |
}))); | |
} | |
} else { | |
match self.transform.process::<OM>(input.take_from(next)) { | |
Ok((i2, o)) => { | |
res = OM::Output::combine(o, res, |o, mut res| { | |
o.extend_into(&mut res); | |
res | |
}); | |
if i2.input_len() == 0 { | |
if OM::Incomplete::is_streaming() { | |
return Err(Err::Incomplete(Needed::Unknown)); | |
} else { | |
return Ok((input.take_from(input.input_len()), res)); | |
} | |
} else { | |
index = input.offset(&i2); | |
} | |
} | |
Err(Err::Error(e)) => return Err(Err::Error(e)), | |
Err(Err::Failure(e)) => { | |
return Err(Err::Failure(e)); | |
} | |
Err(Err::Incomplete(i)) => { | |
return Err(Err::Incomplete(i)); | |
} | |
} | |
} | |
} else { | |
if index == 0 { | |
return Err(Err::Error(OM::Error::bind(|| { | |
Error::from_error_kind(remainder, ErrorKind::EscapedTransform) | |
}))); | |
} | |
return Ok((remainder, res)); | |
} | |
} | |
Err(Err::Failure(e)) => { | |
return Err(Err::Failure(e)); | |
} | |
Err(Err::Incomplete(i)) => { | |
return Err(Err::Incomplete(i)); | |
} | |
} | |
} | |
if OM::Incomplete::is_streaming() { | |
Err(Err::Incomplete(Needed::Unknown)) | |
} else { | |
Ok((input.take_from(index), res)) | |
} | |
} | |
} | |
pub mod complete { | |
use super::*; | |
use nom::{Complete, Emit, IResult, OutputM}; | |
pub fn escaped_transform<I, Error, F, G, O1, O2, ExtendItem, Output>( | |
normal: F, | |
control_char: impl AsChar, | |
transform: G, | |
) -> impl FnMut(I) -> IResult<I, Output, Error> | |
where | |
I: Clone + Offset + Input, | |
I: ExtendInto<Item = ExtendItem, Extender = Output>, | |
O1: ExtendInto<Item = ExtendItem, Extender = Output>, | |
O2: ExtendInto<Item = ExtendItem, Extender = Output>, | |
<I as Input>::Item: AsChar, | |
F: Parser<I, Output = O1, Error = Error>, | |
G: Parser<I, Output = O2, Error = Error>, | |
Error: ParseError<I>, | |
{ | |
let mut parser = super::escaped_transform(normal, control_char, transform); | |
move |i: I| parser.process::<OutputM<Emit, Emit, Complete>>(i) | |
} | |
} | |
pub mod streaming { | |
use super::*; | |
use nom::{Emit, IResult, OutputM, Streaming}; | |
pub fn escaped_transform<I, Error, F, G, O1, O2, ExtendItem, Output>( | |
normal: F, | |
control_char: impl AsChar, | |
transform: G, | |
) -> impl FnMut(I) -> IResult<I, Output, Error> | |
where | |
I: Clone + Offset + Input, | |
I: ExtendInto<Item = ExtendItem, Extender = Output>, | |
O1: ExtendInto<Item = ExtendItem, Extender = Output>, | |
O2: ExtendInto<Item = ExtendItem, Extender = Output>, | |
<I as Input>::Item: AsChar, | |
F: Parser<I, Output = O1, Error = Error>, | |
G: Parser<I, Output = O2, Error = Error>, | |
Error: ParseError<I>, | |
{ | |
let mut parser = super::escaped_transform(normal, control_char, transform); | |
move |i: I| parser.process::<OutputM<Emit, Emit, Streaming>>(i) | |
} | |
} | |
#[cfg(test)]
mod tests {
    // Tests adapted from https://github.com/rust-bakery/nom/issues/1679
    use super::complete::escaped_transform;
    use nom::{
        branch::alt,
        bytes::complete::{is_not, tag},
        character::streaming::alpha1,
        combinator::{map, value},
        error::{Error, ErrorKind},
        error_node_position, error_position, IResult, Parser,
    };

    // Framing bytes used by the byte-oriented tests. FESC introduces an
    // escape; TFEND / TFESC are the escaped spellings of FEND / FESC.
    // NOTE(review): names and values look like the KISS framing bytes — confirm.
    const FEND: u8 = 0xC0;
    const FESC: u8 = 0xDB;
    const TFEND: u8 = 0xDC;
    const TFESC: u8 = 0xDD;

    /// Unescapes `FESC TFEND` to `FEND` and `FESC TFESC` to `FESC`.
    fn unescape(input: &[u8]) -> IResult<&[u8], Vec<u8>> {
        escaped_transform(
            is_not([FESC].as_slice()),
            FESC,
            alt((
                value(&[FEND][..], tag(&[TFEND][..])),
                value(&[FESC][..], tag(&[TFESC][..])),
            )),
        )(input)
    }

    #[test]
    fn try_fesc() {
        let res = unescape(&[0x61, 0x62, FESC, TFEND, 0x63, 0x64, 0x65]);
        assert_eq!(res, Ok((&[][..], vec![0x61, 0x62, FEND, 0x63, 0x64, 0x65])))
    }

    #[test]
    fn try_fesczerozero() {
        let input = [0x61, FESC, 0x00, TFEND, 0x63, 0x64];
        let res = unescape(&input);
        // Notes carried over from the upstream issue: 0xDB as char internally
        // gets turned into 0xDB00, it seems, so the unfixed parser accepted
        // this input and produced [0x61, FEND, 0x63, 0x64]. That was *not*
        // desired behavior; it is recorded here for insight into the
        // implementation details.
        //
        // With the bug fix, this now errors out after hitting the null byte.
        assert_eq!(
            res,
            Err(nom::Err::Error(Error::new(&input[2..], ErrorKind::Tag))),
        );
    }

    #[test]
    fn try_noesc() {
        let res = unescape(&[0x61, 0x62, 0x63]);
        assert_eq!(res, Ok((&[][..], vec![0x61, 0x62, 0x63])));
    }

    // From https://github.com/rust-bakery/nom/blob/main/src/bytes/tests.rs
    #[test]
    fn escape_transform() {
        fn to_s(i: Vec<u8>) -> String {
            String::from_utf8_lossy(&i).into_owned()
        }

        fn esc(i: &[u8]) -> IResult<&[u8], String> {
            map(
                escaped_transform(
                    alpha1,
                    b'\\',
                    alt((
                        value(&b"\\"[..], tag(b"\\".as_slice())),
                        value(&b"\""[..], tag(b"\"".as_slice())),
                        value(&b"\n"[..], tag(b"n".as_slice())),
                    )),
                ),
                to_s,
            )
            .parse(i)
        }

        assert_eq!(esc(&b"abcd;"[..]), Ok((&b";"[..], String::from("abcd"))));
        assert_eq!(
            esc(&b"ab\\\"cd;"[..]),
            Ok((&b";"[..], String::from("ab\"cd")))
        );
        assert_eq!(
            esc(&b"\\\"abcd;"[..]),
            Ok((&b";"[..], String::from("\"abcd")))
        );
        assert_eq!(esc(&b"\\n;"[..]), Ok((&b";"[..], String::from("\n"))));
        assert_eq!(
            esc(&b"ab\\\"12"[..]),
            Ok((&b"12"[..], String::from("ab\"")))
        );
        // A trailing control character has nothing to transform.
        assert_eq!(
            esc(&b"AB\\"[..]),
            Err(nom::Err::Error(error_position!(
                &b"\\"[..],
                ErrorKind::EscapedTransform
            )))
        );
        // An unknown escape surfaces the error from `transform`.
        assert_eq!(
            esc(&b"AB\\A"[..]),
            Err(nom::Err::Error(error_node_position!(
                &b"AB\\A"[..],
                ErrorKind::EscapedTransform,
                error_position!(&b"A"[..], ErrorKind::Tag)
            )))
        );

        // HTML-entity style escapes: `&` is the control character and the
        // entity name (with its `;`) is what `transform` matches.
        fn esc2(i: &[u8]) -> IResult<&[u8], String> {
            map(
                escaped_transform(
                    alpha1,
                    b'&',
                    alt((
                        value("è".as_bytes(), tag(b"egrave;".as_slice())),
                        value("à".as_bytes(), tag(b"agrave;".as_slice())),
                    )),
                ),
                to_s,
            )
            .parse(i)
        }

        assert_eq!(
            esc2(&b"ab&egrave;DEF;"[..]),
            Ok((&b";"[..], String::from("abèDEF")))
        );
        assert_eq!(
            esc2(&b"ab&egrave;D&agrave;EF;"[..]),
            Ok((&b";"[..], String::from("abèDàEF")))
        );
    }

    #[test]
    fn escape_transform_str() {
        fn esc(i: &str) -> IResult<&str, String> {
            escaped_transform(
                alpha1,
                '\\',
                alt((
                    value("\\", tag("\\")),
                    value("\"", tag("\"")),
                    value("\n", tag("n")),
                )),
            )(i)
        }

        assert_eq!(esc("abcd;"), Ok((";", String::from("abcd"))));
        assert_eq!(esc("ab\\\"cd;"), Ok((";", String::from("ab\"cd"))));
        assert_eq!(esc("\\\"abcd;"), Ok((";", String::from("\"abcd"))));
        assert_eq!(esc("\\n;"), Ok((";", String::from("\n"))));
        assert_eq!(esc("ab\\\"12"), Ok(("12", String::from("ab\""))));
        assert_eq!(
            esc("AB\\"),
            Err(nom::Err::Error(error_position!(
                "\\",
                ErrorKind::EscapedTransform
            )))
        );
        assert_eq!(
            esc("AB\\A"),
            Err(nom::Err::Error(error_node_position!(
                "AB\\A",
                ErrorKind::EscapedTransform,
                error_position!("A", ErrorKind::Tag)
            )))
        );

        // Same entity-style escapes as the byte test, on `&str` input.
        fn esc2(i: &str) -> IResult<&str, String> {
            escaped_transform(
                alpha1,
                '&',
                alt((value("è", tag("egrave;")), value("à", tag("agrave;")))),
            )(i)
        }

        assert_eq!(esc2("ab&egrave;DEF;"), Ok((";", String::from("abèDEF"))));
        assert_eq!(
            esc2("ab&egrave;D&agrave;EF;"),
            Ok((";", String::from("abèDàEF")))
        );

        // A multi-byte control character must be skipped by its full length.
        fn esc3(i: &str) -> IResult<&str, String> {
            escaped_transform(
                alpha1,
                '␛',
                alt((value("\0", tag("0")), value("\n", tag("n")))),
            )(i)
        }

        assert_eq!(esc3("a␛0bc␛n"), Ok(("", String::from("a\0bc\n"))));
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment