Last active
February 2, 2021 11:54
-
-
Save kelvinmo/a349d30e8854b24813eea1ff3040a8c6 to your computer and use it in GitHub Desktop.
Loose-tight text format
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//! The loose-tight text format (lttxt)
//!
//! Contains helpers to parse and write to files using this format.
//!
//! # Overview
//!
//! The *loose-tight text format* is a loose, semi-structured text file format
//! for simple uses such as configuration files. The format is defined in the
//! [specification](#specification) below.
//!
//! # Reading and writing
//!
//! To read, use a reader that can return the input as individual lines (such as
//! [`std::io::BufReader`]). Call the [`get_tokens`] function to parse each line
//! into a set of tokens.
//!
//! To write, use the [`ltwriteln`] macro to convert a line of tokens into a string.
//!
//! # Specification
//!
//! * An lttxt file is divided into *lines*, which are further divided into string *tokens*.
//! * Lines are delimited in the same way as [`str::lines`], i.e. either a newline
//!   (`\n`) or a carriage return with a line feed (`\r\n`).
//! * Tokens are delimited by a SPACE character (unless the space character is in
//!   a quoted token).
//! * Comments are denoted by the hash (`#`) character (unless it appears in a quoted
//!   token). The hash character and all subsequent characters are ignored until the
//!   end of the line.
//! * Tokens can be *quoted* or *unquoted*.
//! * Quoted tokens are surrounded by quotation marks `"`. They MAY contain special
//!   characters (defined below). Apart from the SPACE and hash characters, special
//!   characters MUST be escaped when appearing within a quoted token. In addition,
//!   the backslash character `\` MUST be escaped as `\\`.
//! * Unquoted tokens are not surrounded by quotation marks. They MUST NOT contain
//!   special characters.
//! * *Special characters* are as follows, with the escape sequence in parentheses:
//!   - SPACE
//!   - hash
//!   - tab (`\t`)
//!   - newline (`\n`)
//!   - carriage return (`\r`)
//!   - quotation marks (`\"`)
//! * When writing an lttxt file, quoted tokens SHOULD only be used if the token contains
//!   special characters. Otherwise, unquoted tokens SHOULD be used.
//!
//! [`get_tokens`]: ./fn.get_tokens.html
//! [`std::io::BufReader`]: https://doc.rust-lang.org/nightly/std/io/struct.BufReader.html
//! [`str::lines`]: https://doc.rust-lang.org/nightly/std/primitive.str.html#method.lines
//! [`ltwriteln`]: ./macro.ltwriteln!.html
use std::fmt; | |
/// The error returned when the input does not conform to the expected format.
///
/// Variants that carry a `usize` record the position within the line at which
/// the problem was detected. Use the `Display` implementation for a
/// human-readable message, or the `Debug` implementation for the raw variant.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ParseError {
    /// An unescaped quote is encountered when parsing a quoted token
    UnescapedQuote(usize),
    /// A quotation mark is encountered when parsing an unquoted token
    UnexpectedQuote(usize),
    /// End of line is encountered when parsing a quoted token
    UnmatchedQuote,
    /// An unknown escape sequence is encountered; carries the character that
    /// followed the backslash
    UnexpectedEscapeSequence(char, usize),
}
impl fmt::Display for ParseError { | |
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | |
match *self { | |
ParseError::UnescapedQuote(i) => write!(f, "Unescaped quote in quoted token at position {}", i), | |
ParseError::UnexpectedQuote(i) => write!(f, "Unexpected quote in unquoted token at position {}", i), | |
ParseError::UnmatchedQuote => write!(f, "End of line without closing quote"), | |
ParseError::UnexpectedEscapeSequence(c, i) => write!(f, "Unexpected escape sequence '\\{}' at position {}", c, i) | |
} | |
} | |
} | |
impl std::error::Error for ParseError { | |
fn description(&self) -> &str { | |
match *self { | |
ParseError::UnescapedQuote(_) => "Unescaped quote in quoted token", | |
ParseError::UnexpectedQuote(_) => "Unexpected quote in unquoted token", | |
ParseError::UnmatchedQuote => "End of line without closing quote", | |
ParseError::UnexpectedEscapeSequence(_, _) => "Unexpected escape sequence" | |
} | |
} | |
fn cause(&self) -> Option<&dyn std::error::Error> { | |
None | |
} | |
} | |
/// An iterator over tokens in a line.
///
/// This is created by calling [`get_tokens`]. See the documentation for [`get_tokens`]
/// for further details.
///
/// [`get_tokens`]: ./fn.get_tokens.html
#[derive(Debug, Clone)]
pub struct Tokens<'a> {
    /// The line being tokenized.
    s: &'a str,
    /// Offset into `s` at which the next token search starts.
    pos: usize,
}
impl Iterator for Tokens<'_> { | |
type Item = Result<String, ParseError>; | |
fn next(&mut self) -> Option<Result<String, ParseError>> { | |
if self.pos >= self.s.len() { | |
return None; | |
} | |
let mut token = String::new(); | |
let mut chars = self.s[self.pos..].chars().enumerate(); | |
let mut in_token = false; | |
let mut in_quote = false; | |
// Return None at the end | |
while let Some((i, c)) = chars.next() { | |
if c == '#' { | |
if in_quote { | |
token.push(c); | |
} else { | |
// Advance position so that it's none on next call | |
self.pos = self.s.len(); | |
if in_token { | |
return Some(Ok(token)); | |
} else { | |
return None; | |
} | |
} | |
} else if c == '"' { | |
if in_quote { | |
// Closing quote | |
match chars.next() { | |
None | Some((_, ' ')) | Some((_, '#')) => { | |
// Ok, return the token | |
self.pos += i + 2; | |
return Some(Ok(token)); | |
}, | |
Some((j, _)) => { | |
// Characters after quote | |
let result = Some(Err(ParseError::UnescapedQuote(self.pos + j))); | |
// Advance position so that it's none on next call | |
self.pos = self.s.len(); | |
return result; | |
} | |
}; | |
} else if in_token { | |
// Quote in the middle of unquoted token | |
let result = Some(Err(ParseError::UnexpectedQuote(self.pos + i))); | |
// Advance position so that it's none on next call | |
self.pos = self.s.len(); | |
return result; | |
} else { | |
in_token = true; | |
in_quote = true; | |
} | |
} else if c == '\\' { | |
if in_quote { | |
let (_, n) = chars.next().unwrap_or((i, c)); | |
match n { | |
'\\' | '\"' => token.push(n), | |
't' => token.push('\t'), | |
'n' => token.push('\n'), | |
'r' => token.push('\r'), | |
unexpected => { | |
let result = Some(Err(ParseError::UnexpectedEscapeSequence(unexpected, self.pos + i))); | |
self.pos = self.s.len(); | |
return result; | |
} | |
}; | |
} else { | |
token.push(c); | |
} | |
} else if c == ' ' { | |
if in_quote { | |
token.push(c); | |
} else if in_token { | |
self.pos += i + 1; | |
return Some(Ok(token)); | |
} | |
} else { | |
in_token = true; | |
token.push(c); | |
} | |
} | |
// Advance position so that it's none on next call | |
self.pos = self.s.len(); | |
if in_quote { | |
return Some(Err(ParseError::UnmatchedQuote)); | |
} | |
if in_token { | |
return Some(Ok(token)); | |
} | |
None | |
} | |
} | |
/// Parses a line in a lttxt file and returns an iterator over tokens in that | |
/// line. | |
/// | |
/// The iterator returned from this function will yield instances of | |
/// [`Result`]`<`[`String`]`, `[`ParseError`]`>`. A [`ParseError`] is returned if an | |
/// error was encountered while trying to parse the next token. | |
/// | |
/// [`Result`]: https://doc.rust-lang.org/nightly/std/result/enum.Result.html | |
/// [`String`]: https://doc.rust-lang.org/nightly/std/string/struct.String.html | |
/// [`ParseError`]: ./enum.ParseError.html | |
pub fn get_tokens(s: &str) -> Tokens { | |
Tokens { | |
s: s, | |
pos: 0 | |
} | |
} | |
/// Macro for printing an lttxt formatted line to the standard output.
///
/// Every argument is a token (`&str`); each one is passed through
/// `quote_token` and the results are joined with single spaces. A trailing
/// comma after the last argument is optional.
///
/// See [`ltwriteln!`] for more information on the syntax.
///
/// [`ltwriteln!`]: ./macro.ltwriteln!.html
#[macro_export]
macro_rules! ltprintln {
    ($($arg:expr),* $(,)?) => {
        println!(
            "{}",
            vec![$($arg),*]
                .into_iter()
                // `$crate` keeps the path valid when the macro is used from another crate.
                .map(|token| { $crate::lttxt::quote_token(token) })
                .collect::<Vec<String>>()
                .join(" ")
        )
    };
}
/// Macro for formatting a set of arguments into an lttxt formatted line.
///
/// The first argument is the output buffer. Every following argument is a
/// token (`&str`); each one is passed through `quote_token` and the results
/// are joined with single spaces. A trailing comma after the last token is
/// optional.
///
/// The macro expands to a `writeln!` call and evaluates to its result, so the
/// appropriate `Write` trait must be in scope at the call site.
#[macro_export]
macro_rules! ltwriteln {
    ($dst:expr, $($arg:expr),* $(,)?) => {
        writeln!(
            $dst,
            "{}",
            vec![$($arg),*]
                .into_iter()
                // `$crate` keeps the path valid when the macro is used from another crate.
                .map(|token| { $crate::lttxt::quote_token(token) })
                .collect::<Vec<String>>()
                .join(" ")
        )
    };
}
/// Returns a quoted token if the string contains a special character, or an
/// unquoted token otherwise.
///
/// A token is quoted when it contains a space, hash, tab, newline, carriage
/// return or quotation mark, or when it is empty (an empty token has no
/// unquoted representation and would otherwise vanish from the line).
///
/// Inside a quoted token, tab, newline, carriage return, quotation mark and
/// backslash are written as `\t`, `\n`, `\r`, `\"` and `\\`. Unquoted tokens
/// are returned unchanged, because escape sequences are only interpreted
/// inside quotes.
pub fn quote_token(s: &str) -> String {
    let needs_quotes = s.is_empty()
        || s.chars()
            .any(|c| matches!(c, ' ' | '#' | '\t' | '\n' | '\r' | '"'));
    if !needs_quotes {
        return s.to_owned();
    }

    // Two extra bytes for the surrounding quotes; escapes may grow it further.
    let mut result = String::with_capacity(s.len() + 2);
    result.push('"');
    for c in s.chars() {
        match c {
            '\t' => result.push_str("\\t"),
            '\n' => result.push_str("\\n"),
            '\r' => result.push_str("\\r"),
            '"' => result.push_str("\\\""),
            // A bare backslash inside quotes would be read back as the start
            // of an escape sequence, so it has to be doubled.
            '\\' => result.push_str("\\\\"),
            other => result.push(other),
        }
    }
    result.push('"');
    result
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment