Last active
October 5, 2023 18:40
-
-
Save benaryorg/9d4f2aec58c35a06adf3d61bfd9eec7a to your computer and use it in GitHub Desktop.
Markdown parsing (link and parenthesis)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// see fediverse thread: https://astolfo.social/notes/9kgqfhd4f9gqlh39
//
// Written by @benaryorg with the idea of enforcing balanced parens in links, following this guideline:
//
// > I think the easiest, best, and most incorrect solution would be to enforce balanced parens in the links.
use :: | |
{ | |
nom:: | |
{ | |
IResult, | |
Finish, | |
bytes::complete:: | |
{ | |
tag, | |
}, | |
character::complete:: | |
{ | |
anychar, | |
char, | |
one_of, | |
none_of, | |
alphanumeric1, | |
}, | |
combinator:: | |
{ | |
complete, | |
map, | |
recognize, | |
opt, | |
}, | |
multi:: | |
{ | |
many0, | |
many1, | |
}, | |
branch:: | |
{ | |
alt, | |
}, | |
sequence:: | |
{ | |
delimited, | |
}, | |
}, | |
std:: | |
{ | |
io::stdin, | |
}, | |
}; | |
/// One element produced by the Markdown parser.
///
/// The derived ordering follows declaration order, so every `Char` sorts
/// before every `Link`, and links compare by `text` first, then by `url`.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
enum Markdown
{
    /// A single character of plain text.
    Char(char),
    /// An inline link written as `[text](url)`.
    Link
    {
        /// The text found between the square brackets.
        text: String,
        /// The link target found between the parentheses.
        url: String,
    },
}
fn markdown_link_ref_balanced_parens(input: &str) -> IResult<&str, ()> | |
{ | |
// does not support: | |
// unicode | |
let (input, _) = many1(alt( | |
( alphanumeric1 | |
, recognize(one_of("-._~:/?#[]@!$&'*+,;%= ")) | |
, recognize(delimited(char('('), opt(markdown_link_ref_balanced_parens), char(')'))) | |
)))(input)?; | |
Ok((input, ())) | |
} | |
fn markdown_link_ref(input: &str) -> IResult<&str, ()> | |
{ | |
// schema | |
let (input, _) = alt((tag("https://"), tag("http://")))(input)?; | |
// domain | |
// does not support IDN | |
// does not validate for multiple of: '@', ':', etc. | |
let (input, _) = many1(alt((alphanumeric1, recognize(one_of("-.:@")))))(input)?; | |
// path | |
let (input, _) = opt(markdown_link_ref_balanced_parens)(input)?; | |
Ok((input, ())) | |
} | |
fn markdown_link(input: &str) -> IResult<&str, Markdown> | |
{ | |
let (input, _) = char('[')(input)?; | |
let (input, text) = recognize(many1(none_of("\\]")))(input)?; | |
let (input, _) = char(']')(input)?; | |
let (input, _) = char('(')(input)?; | |
let (input, url) = recognize(markdown_link_ref)(input)?; | |
let (input, _) = char(')')(input)?; | |
Ok((input, Markdown::Link | |
{ | |
text: text.into(), | |
url: url.into(), | |
})) | |
} | |
fn markdown_escape(input: &str) -> IResult<&str, Markdown> | |
{ | |
let (input, _) = char('\\')(input)?; | |
map(anychar, Markdown::Char)(input) | |
} | |
fn markdown_element(input: &str) -> IResult<&str, Markdown> | |
{ | |
alt( | |
( markdown_escape | |
, markdown_link | |
// always fall backto reading raw chars, this way the parser cannot hard-fail it will just produce plaintext | |
, |input| map(anychar, Markdown::Char)(input) | |
) | |
)(input) | |
} | |
fn markdown(input: &str) -> IResult<&str, Vec<Markdown>> | |
{ | |
many0(markdown_element)(input) | |
} | |
fn main() | |
{ | |
for line in stdin().lines().take_while(Result::is_ok).map(Result::unwrap) | |
{ | |
match complete(markdown)(&line).finish() | |
{ | |
Ok((_, out)) => println!("{:?}", out), | |
Err(err) => eprintln!("{:?}", err), | |
} | |
} | |
} | |
#[cfg(test)]
mod test
{
    use crate::{markdown, Markdown};

    /// Every character of `text` as its own `Markdown::Char`.
    fn plain(text: &str) -> Vec<Markdown>
    {
        text.chars().map(Markdown::Char).collect()
    }

    /// A single link with the text "something" pointing at `url`.
    fn link(url: &str) -> Vec<Markdown>
    {
        vec![Markdown::Link { text: "something".into(), url: url.into() }]
    }

    /// Asserts that `input` is consumed completely and yields `expected`.
    #[track_caller]
    fn assert_parses(input: &str, expected: Vec<Markdown>)
    {
        assert_eq!(markdown(input), Ok(("", expected)));
    }

    #[test]
    fn parse_markdown()
    {
        // no parens in link with parens around text
        assert_parses(
            "foo (see: [something](https://en.wikipedia.org/wiki/Text)) bar",
            [
                // opening text, parens, and more text
                plain("foo (see: "),
                // link (no parens)
                link("https://en.wikipedia.org/wiki/Text"),
                // closing parens, and following text
                plain(") bar"),
            ].concat(),
        );

        // balanced parens in link with parens around text
        assert_parses(
            "foo (see: [something](https://en.wikipedia.org/wiki/Text_(literary_theory))) bar",
            [
                // opening text, parens, and more text
                plain("foo (see: "),
                // link (balanced parens)
                link("https://en.wikipedia.org/wiki/Text_(literary_theory)"),
                // closing parens, and following text
                plain(") bar"),
            ].concat(),
        );

        // trailing parens, balanced and recognized
        assert_parses(
            "foo (see: [something](https://en.wikipedia.org/wiki/Text()) bar",
            [
                // opening text, parens, and more text
                plain("foo (see: "),
                // link (balanced parens)
                link("https://en.wikipedia.org/wiki/Text()"),
                // following text; the last paren closed the link, so none is left over
                plain(" bar"),
            ].concat(),
        );

        // trailing parens, unbalanced and ignored
        assert_parses(
            "[something](https://en.wikipedia.org/wiki/Halting_problem))",
            [
                // link (balanced parens, meaning no parens)
                link("https://en.wikipedia.org/wiki/Halting_problem"),
                // closing parens which doesn't match the link
                plain(")"),
            ].concat(),
        );

        // spaces in link
        assert_parses(
            "[something](https://en.wikipedia.org/wiki/Among Us)",
            // link (balanced parens, meaning no parens), does include space
            link("https://en.wikipedia.org/wiki/Among Us"),
        );

        // spaces and unbalanced parens
        assert_parses(
            "[something](https://en.wikipedia.org/wiki/Amogus) Us)",
            [
                // link (balanced parens, meaning no parens), does not include space
                link("https://en.wikipedia.org/wiki/Amogus"),
                // trailing " Us)" that doesn't belong to the link
                plain(" Us)"),
            ].concat(),
        );

        // parens and spaces
        assert_parses(
            "[something](https://en.wikipedia.org/wiki/Amogus (meme)) Us)",
            [
                // link (balanced parens, meaning it contains the "(meme)"), does include space
                link("https://en.wikipedia.org/wiki/Amogus (meme)"),
                // trailing " Us)" that doesn't belong to the link
                plain(" Us)"),
            ].concat(),
        );

        // the failing `(` Wikipedia article
        assert_parses(
            "[something](https://en.wikipedia.org/wiki/()",
            [
                // turns into plaintext, the link name first
                plain("[something]"),
                // link ref next
                plain("(https://en.wikipedia.org/wiki/()"),
            ].concat(),
        );
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment