Skip to content

Instantly share code, notes, and snippets.

@micolous
Last active August 31, 2025 08:18
Show Gist options
  • Save micolous/e8ab00d741bc27fa3b1657ab9ce41199 to your computer and use it in GitHub Desktop.
Save micolous/e8ab00d741bc27fa3b1657ab9ce41199 to your computer and use it in GitHub Desktop.
//! Workaround for https://github.com/rust-bakery/nom/issues/1679
use core::marker::PhantomData;
use nom::{
error::{ErrorKind, ParseError},
AsChar, Err, ExtendInto, Input, IsStreaming, Mode, Needed, Offset, OutputMode, Parser,
};
/// Parses input that contains escape sequences and yields the unescaped result.
///
/// * `normal` matches the ordinary characters (it must not match the control character)
/// * `control_char` is the character that introduces an escape (like `\` in most languages)
/// * `transform` matches whatever follows the control character and produces its replacement
///
/// As an example, the chain `abc\tdef` could be `abc def` (the control character is consumed
/// as well).
///
/// The example below is inherited from nom: it imports nom's own `escaped_transform` rather
/// than this function, and is marked `compile_fail`.
///
/// ```compile_fail
/// # use nom::{Err, error::ErrorKind, Needed, IResult};
/// # use std::str::from_utf8;
/// use nom::bytes::streaming::{escaped_transform, tag};
/// use nom::character::streaming::alpha1;
/// use nom::branch::alt;
/// use nom::combinator::value;
///
/// fn parser(input: &[u8]) -> IResult<&[u8], Vec<u8>> {
///   escaped_transform(
///     alpha1,
///     b'\\',
///     alt((
///       value(b"\\", tag(b"\\".as_slice())),
///       value(b"\"", tag(b"\"".as_slice())),
///       value(b"\n", tag(b"n".as_slice())),
///     ))
///   )(input)
/// }
///
/// assert_eq!(parser(b"ab\\\"cd\""), Ok((b"\"".as_slice(), b"ab\"cd".to_vec())));
/// ```
///
/// This is a modified version of [nom's `escaped_transform`][nom::bytes::escaped_transform]
/// whose `control_char` may be any [`AsChar`] value (for example a `u8`), rather than only a
/// `char`.
pub fn escaped_transform<I, Error, F, G, ExtendItem, Output>(
    normal: F,
    control_char: impl AsChar,
    transform: G,
) -> impl Parser<I, Output = Output, Error = Error>
where
    I: Clone + Offset + Input,
    I: ExtendInto<Item = ExtendItem, Extender = Output>,
    <F as Parser<I>>::Output: ExtendInto<Item = ExtendItem, Extender = Output>,
    <G as Parser<I>>::Output: ExtendInto<Item = ExtendItem, Extender = Output>,
    <I as Input>::Item: AsChar,
    F: Parser<I, Error = Error>,
    G: Parser<I, Error = Error>,
    Error: ParseError<I>,
{
    // The three `PhantomData` fields only pin the type parameters that the struct does not
    // otherwise store; the actual work happens in the struct's `Parser` implementation.
    EscapedTransform {
        control_char,
        normal,
        transform,
        o: PhantomData,
        extend: PhantomData,
        e: PhantomData,
    }
}
/// Parser implementation for [escaped_transform]
///
/// Holds the two sub-parsers and the control character; the remaining fields are
/// zero-sized markers for type parameters that are not stored directly.
pub struct EscapedTransform<F, G, E, ExtendItem, Output, C> {
/// Parser run on unescaped input.
normal: F,
/// Parser run on the input that follows a control character.
transform: G,
/// The character that introduces an escape sequence.
control_char: C,
/// Marker for the error type `E`.
e: PhantomData<E>,
/// Marker for the `ExtendItem` type parameter.
extend: PhantomData<ExtendItem>,
/// Marker for the `Output` type parameter.
o: PhantomData<Output>,
}
impl<I, Error: ParseError<I>, F, G, ExtendItem, Output, C> Parser<I>
for EscapedTransform<F, G, Error, ExtendItem, Output, C>
where
I: Clone + Offset + Input,
I: ExtendInto<Item = ExtendItem, Extender = Output>,
<F as Parser<I>>::Output: ExtendInto<Item = ExtendItem, Extender = Output>,
<G as Parser<I>>::Output: ExtendInto<Item = ExtendItem, Extender = Output>,
<I as Input>::Item: AsChar,
C: AsChar,
F: Parser<I, Error = Error>,
G: Parser<I, Error = Error>,
Error: ParseError<I>,
{
type Output = Output;
type Error = Error;
fn process<OM: OutputMode>(
&mut self,
input: I,
) -> nom::PResult<OM, I, Self::Output, Self::Error> {
let mut index = 0;
let mut res = OM::Output::bind(|| input.new_builder());
while index < input.input_len() {
let current_len = input.input_len();
let remainder = input.take_from(index);
match self.normal.process::<OM>(remainder.clone()) {
Ok((i2, o)) => {
res = OM::Output::combine(o, res, |o, mut res| {
o.extend_into(&mut res);
res
});
if i2.input_len() == 0 {
if OM::Incomplete::is_streaming() {
return Err(Err::Incomplete(Needed::Unknown));
} else {
let index = input.input_len();
return Ok((input.take_from(index), res));
}
} else if i2.input_len() == current_len {
return Ok((remainder, res));
} else {
index = input.offset(&i2);
}
}
Err(Err::Error(_)) => {
// unwrap() should be safe here since index < $i.input_len()
if remainder.iter_elements().next().unwrap().as_char()
== self.control_char.as_char()
{
let next = index + self.control_char.len();
let input_len = input.input_len();
if next >= input_len {
if OM::Incomplete::is_streaming() {
return Err(Err::Incomplete(Needed::Unknown));
} else {
return Err(Err::Error(OM::Error::bind(|| {
Error::from_error_kind(remainder, ErrorKind::EscapedTransform)
})));
}
} else {
match self.transform.process::<OM>(input.take_from(next)) {
Ok((i2, o)) => {
res = OM::Output::combine(o, res, |o, mut res| {
o.extend_into(&mut res);
res
});
if i2.input_len() == 0 {
if OM::Incomplete::is_streaming() {
return Err(Err::Incomplete(Needed::Unknown));
} else {
return Ok((input.take_from(input.input_len()), res));
}
} else {
index = input.offset(&i2);
}
}
Err(Err::Error(e)) => return Err(Err::Error(e)),
Err(Err::Failure(e)) => {
return Err(Err::Failure(e));
}
Err(Err::Incomplete(i)) => {
return Err(Err::Incomplete(i));
}
}
}
} else {
if index == 0 {
return Err(Err::Error(OM::Error::bind(|| {
Error::from_error_kind(remainder, ErrorKind::EscapedTransform)
})));
}
return Ok((remainder, res));
}
}
Err(Err::Failure(e)) => {
return Err(Err::Failure(e));
}
Err(Err::Incomplete(i)) => {
return Err(Err::Incomplete(i));
}
}
}
if OM::Incomplete::is_streaming() {
Err(Err::Incomplete(Needed::Unknown))
} else {
Ok((input.take_from(index), res))
}
}
}
/// Complete-input flavour of [`escaped_transform`](super::escaped_transform).
pub mod complete {
    use super::*;
    use nom::{Complete, Emit, IResult, OutputM};

    /// Wraps [`super::escaped_transform`] in a closure that runs the parser with
    /// `OutputM<Emit, Emit, Complete>` and returns a plain [`IResult`].
    ///
    /// The arguments have the same meaning as for [`super::escaped_transform`].
    pub fn escaped_transform<I, Error, F, G, O1, O2, ExtendItem, Output>(
        normal: F,
        control_char: impl AsChar,
        transform: G,
    ) -> impl FnMut(I) -> IResult<I, Output, Error>
    where
        I: Clone + Offset + Input,
        I: ExtendInto<Item = ExtendItem, Extender = Output>,
        O1: ExtendInto<Item = ExtendItem, Extender = Output>,
        O2: ExtendInto<Item = ExtendItem, Extender = Output>,
        <I as Input>::Item: AsChar,
        F: Parser<I, Output = O1, Error = Error>,
        G: Parser<I, Output = O2, Error = Error>,
        Error: ParseError<I>,
    {
        // Emit both output and error values; treat the input as complete.
        type CompleteMode = OutputM<Emit, Emit, Complete>;

        let mut inner = super::escaped_transform(normal, control_char, transform);
        move |input: I| inner.process::<CompleteMode>(input)
    }
}
/// Streaming-input flavour of [`escaped_transform`](super::escaped_transform).
pub mod streaming {
    use super::*;
    use nom::{Emit, IResult, OutputM, Streaming};

    /// Wraps [`super::escaped_transform`] in a closure that runs the parser with
    /// `OutputM<Emit, Emit, Streaming>` and returns a plain [`IResult`].
    ///
    /// The arguments have the same meaning as for [`super::escaped_transform`].
    pub fn escaped_transform<I, Error, F, G, O1, O2, ExtendItem, Output>(
        normal: F,
        control_char: impl AsChar,
        transform: G,
    ) -> impl FnMut(I) -> IResult<I, Output, Error>
    where
        I: Clone + Offset + Input,
        I: ExtendInto<Item = ExtendItem, Extender = Output>,
        O1: ExtendInto<Item = ExtendItem, Extender = Output>,
        O2: ExtendInto<Item = ExtendItem, Extender = Output>,
        <I as Input>::Item: AsChar,
        F: Parser<I, Output = O1, Error = Error>,
        G: Parser<I, Output = O2, Error = Error>,
        Error: ParseError<I>,
    {
        // Emit both output and error values; treat the input as a stream.
        type StreamingMode = OutputM<Emit, Emit, Streaming>;

        let mut inner = super::escaped_transform(normal, control_char, transform);
        move |input: I| inner.process::<StreamingMode>(input)
    }
}
#[cfg(test)]
mod tests {
// Regression tests for the `complete::escaped_transform` wrapper: first with a
// non-ASCII `u8` control character, then with nom's own byte-slice and `&str` cases.
// Tests adapted from https://github.com/rust-bakery/nom/issues/1679
use super::complete::escaped_transform;
use nom::{
branch::alt,
bytes::complete::{is_not, tag},
// NOTE(review): `alpha1` comes from `character::streaming` although the wrapper under
// test is the complete one - confirm this is intentional.
character::streaming::alpha1,
combinator::{map, value},
error::{Error, ErrorKind},
error_node_position, error_position, IResult, Parser,
};
// Framing bytes used by the `unescape` parser below. The names and values look like the
// KISS/SLIP framing constants - TODO confirm against the protocol this was written for.
const FEND: u8 = 0xC0;
const FESC: u8 = 0xDB;
const TFEND: u8 = 0xDC;
const TFESC: u8 = 0xDD;
// Replaces `FESC TFEND` with `FEND` and `FESC TFESC` with `FESC`; every byte other than
// `FESC` is copied through unchanged.
fn unescape(input: &[u8]) -> IResult<&[u8], Vec<u8>> {
escaped_transform(
is_not([FESC].as_slice()),
FESC,
alt((
value(&[FEND][..], tag(&[TFEND][..])),
value(&[FESC][..], tag(&[TFESC][..])),
)),
)(input)
}
// A single escape sequence in the middle of the input is replaced.
#[test]
fn try_fesc() {
let res = unescape(&[0x61, 0x62, FESC, TFEND, 0x63, 0x64, 0x65]);
assert_eq!(res, Ok((&[][..], vec![0x61, 0x62, FEND, 0x63, 0x64, 0x65])))
}
// A stray 0x00 between the control byte and the escaped byte must be an error.
#[test]
fn try_fesczerozero() {
let input = [0x61, FESC, 0x00, TFEND, 0x63, 0x64];
let res = unescape(&input);
// 0xDB as char internally gets turned into 0xDB00, it seems
// this test case is *not* desired behavior, but I put it here
// for insight into the implementation details
// assert_eq!(res, Ok((&[][..], vec![0x61, FEND, 0x63, 0x64])));
// With the bug fix, this now errors out after hitting the null byte.
assert_eq!(
res,
Err(nom::Err::Error(Error::new(
&input[2..],
nom::error::ErrorKind::Tag
))),
);
}
// Input without any control byte is passed through untouched.
#[test]
fn try_noesc() {
let res = unescape(&[0x61, 0x62, 0x63]);
assert_eq!(res, Ok((&[][..], vec![0x61, 0x62, 0x63])));
}
// From https://github.com/rust-bakery/nom/blob/main/src/bytes/tests.rs
// Byte-slice input with ASCII control characters given as `u8`.
#[test]
fn escape_transform() {
// Converts the parser's byte output into a `String` for easier comparison.
fn to_s(i: Vec<u8>) -> String {
String::from_utf8_lossy(&i).into_owned()
}
// Backslash escapes: `\\`, `\"` and `\n`.
fn esc(i: &[u8]) -> IResult<&[u8], String> {
map(
escaped_transform(
alpha1,
b'\\',
alt((
value(&b"\\"[..], tag(b"\\".as_slice())),
value(&b"\""[..], tag(b"\"".as_slice())),
value(&b"\n"[..], tag(b"n".as_slice())),
)),
),
to_s,
)
.parse(i)
}
assert_eq!(esc(&b"abcd;"[..]), Ok((&b";"[..], String::from("abcd"))));
assert_eq!(
esc(&b"ab\\\"cd;"[..]),
Ok((&b";"[..], String::from("ab\"cd")))
);
assert_eq!(
esc(&b"\\\"abcd;"[..]),
Ok((&b";"[..], String::from("\"abcd")))
);
assert_eq!(esc(&b"\\n;"[..]), Ok((&b";"[..], String::from("\n"))));
assert_eq!(
esc(&b"ab\\\"12"[..]),
Ok((&b"12"[..], String::from("ab\"")))
);
// A trailing control character with nothing after it is an error.
assert_eq!(
esc(&b"AB\\"[..]),
Err(nom::Err::Error(error_position!(
&b"\\"[..],
ErrorKind::EscapedTransform
)))
);
// An escape that `transform` does not recognise is an error.
assert_eq!(
esc(&b"AB\\A"[..]),
Err(nom::Err::Error(error_node_position!(
&b"AB\\A"[..],
ErrorKind::EscapedTransform,
error_position!(&b"A"[..], ErrorKind::Tag)
)))
);
// Multi-byte escape sequences introduced by `&`, each replaced by a UTF-8 string.
fn esc2(i: &[u8]) -> IResult<&[u8], String> {
map(
escaped_transform(
alpha1,
b'&',
alt((
value("è".as_bytes(), tag(b"egrave;".as_slice())),
value("à".as_bytes(), tag(b"agrave;".as_slice())),
)),
),
to_s,
)
.parse(i)
}
assert_eq!(
esc2(&b"ab&egrave;DEF;"[..]),
Ok((&b";"[..], String::from("abèDEF")))
);
assert_eq!(
esc2(&b"ab&egrave;D&agrave;EF;"[..]),
Ok((&b";"[..], String::from("abèDàEF")))
);
}
// The same cases as `escape_transform`, on `&str` input with `char` control characters.
#[test]
fn escape_transform_str() {
// Backslash escapes: `\\`, `\"` and `\n`.
fn esc(i: &str) -> IResult<&str, String> {
escaped_transform(
alpha1,
'\\',
alt((
value("\\", tag("\\")),
value("\"", tag("\"")),
value("\n", tag("n")),
)),
)(i)
}
assert_eq!(esc("abcd;"), Ok((";", String::from("abcd"))));
assert_eq!(esc("ab\\\"cd;"), Ok((";", String::from("ab\"cd"))));
assert_eq!(esc("\\\"abcd;"), Ok((";", String::from("\"abcd"))));
assert_eq!(esc("\\n;"), Ok((";", String::from("\n"))));
assert_eq!(esc("ab\\\"12"), Ok(("12", String::from("ab\""))));
// A trailing control character with nothing after it is an error.
assert_eq!(
esc("AB\\"),
Err(nom::Err::Error(error_position!(
"\\",
ErrorKind::EscapedTransform
)))
);
// An escape that `transform` does not recognise is an error.
assert_eq!(
esc("AB\\A"),
Err(nom::Err::Error(error_node_position!(
"AB\\A",
ErrorKind::EscapedTransform,
error_position!("A", ErrorKind::Tag)
)))
);
// Multi-character escape sequences introduced by `&`.
fn esc2(i: &str) -> IResult<&str, String> {
escaped_transform(
alpha1,
'&',
alt((value("è", tag("egrave;")), value("à", tag("agrave;")))),
)(i)
}
assert_eq!(esc2("ab&egrave;DEF;"), Ok((";", String::from("abèDEF"))));
assert_eq!(
esc2("ab&egrave;D&agrave;EF;"),
Ok((";", String::from("abèDàEF")))
);
// A control character that takes more than one byte in UTF-8.
fn esc3(i: &str) -> IResult<&str, String> {
escaped_transform(
alpha1,
'␛',
alt((value("\0", tag("0")), value("\n", tag("n")))),
)(i)
}
assert_eq!(esc3("a␛0bc␛n"), Ok(("", String::from("a\0bc\n"))));
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment