Last active
December 19, 2015 21:32
-
-
Save maciej/f8da7d2db8b686cdd76e to your computer and use it in GitHub Desktop.
Parboiled2 CSV parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* based on comments in https://github.com/sirthias/parboiled2/issues/61 */
/** A parboiled2 grammar for delimiter-separated text (CSV and friends).
  *
  * Fields may be bare or wrapped in double quotes. Inside a quoted field the
  * delimiter and line breaks are taken literally, and a doubled quote (`""`)
  * stands for a single quote character.
  *
  * @param input     the text to parse
  * @param delimeter the field separator; it is matched as a whole string
  *                  between fields, while each of its characters individually
  *                  terminates an unquoted run of text
  */
case class Parboiled2CsvParser(input: ParserInput, delimeter: String) extends Parser {

  /** The quote character that wraps escaped fields. */
  def DQUOTE = '"'

  /** The delimiter, captured so that it becomes part of a quoted field's text. */
  def DELIMITER_TOKEN: Rule1[String] = rule(capture(delimeter))

  /** A doubled quote inside a quoted field; yields one literal quote. */
  def DQUOTE2: Rule1[String] = rule("\"\"" ~ push("\""))

  /** A line break that is captured, i.e. kept as part of a quoted field's text. */
  def CRLF: Rule1[String] = rule(capture(NON_CAPTURING_CRLF))

  /** A line break that is consumed without being captured (row separator).
    *
    * "\r\n" must be tried first: it is the Windows line ending, and none of the
    * other alternatives can start on '\r'. "\n\r" is kept only so that input
    * accepted by earlier versions of this grammar still parses the same way.
    */
  def NON_CAPTURING_CRLF: Rule0 = rule("\r\n" | "\n\r" | "\n")

  /** Every character that ends a run of plain text: the delimiter's characters,
    * CR, LF and the double quote.
    */
  val delims = s"$delimeter\r\n$DQUOTE"

  /** A single character of plain text, i.e. any character not in `delims`. */
  def TXT: Rule1[String] = rule(capture(!anyOf(delims) ~ ANY))

  val WHITESPACE = CharPredicate(" \t")

  /** One or more spaces or tabs. */
  def SPACES: Rule0 = rule(oneOrMore(WHITESPACE))

  /** A quoted field. Blanks outside the quotes are skipped; everything between
    * the quotes is concatenated into the field value.
    */
  def escaped: Rule1[String] = rule(
    optional(SPACES) ~ DQUOTE ~
      (zeroOrMore(DELIMITER_TOKEN | TXT | CRLF | DQUOTE2) ~ DQUOTE ~ optional(SPACES)) ~>
      (_.mkString(""))
  )

  /** An unquoted field: plain text and stray quotes, possibly empty. */
  def nonEscaped: Rule1[String] = rule(zeroOrMore(TXT | capture(DQUOTE)) ~> (_.mkString("")))

  /** A single field; the quoted form is tried first. */
  def field: Rule1[String] = rule(escaped | nonEscaped)

  /** One line of input: one or more fields separated by the delimiter. */
  def row: Rule1[Seq[String]] = rule(oneOrMore(field).separatedBy(delimeter))

  /** The whole input: rows separated by line breaks.
    *
    * This rule does not itself require end-of-input, so it succeeds on the
    * longest prefix that forms valid rows.
    */
  def file: Rule1[Seq[Seq[String]]] = rule(zeroOrMore(row).separatedBy(NON_CAPTURING_CRLF))
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
def nonEscaped = rule(zeroOrMore(TXT | capture(DQUOTE)) -> (_.mkString("")))
should it not be
def nonEscaped = rule(zeroOrMore(TXT | capture(DQUOTE)) ~> (_.mkString("")))