Skip to content

Instantly share code, notes, and snippets.

@batakpout
Forked from ArtemGr/alternative, using regex
Created April 4, 2016 06:27
Show Gist options
  • Save batakpout/28539ac50857ecde54961ce037390e43 to your computer and use it in GitHub Desktop.
Save batakpout/28539ac50857ecde54961ce037390e43 to your computer and use it in GitHub Desktop.
CSV parser in Scala
// Alternative to the combinator parser: reads two-column, semicolon-separated rows with one regex.
// (?x) allows the layout whitespace inside the pattern; (?s) lets `.` match line breaks, so a
// quoted value may span several lines. Each column is either a double-quoted string or absent,
// and a row ends at a line break or at the end of the input.
val pattern = java.util.regex.Pattern.compile ("""(?xs) ("(.*?)"|) ; ("(.*?)"|) (?: \r?\n | \z ) """)
val matcher = pattern.matcher (input)
while (matcher.find) {
  // Groups 2 and 4 hold the text between the quotes. `group` returns null when the column is
  // absent (the empty alternative matched), so map that to "" instead of leaking null.
  val col1 = Option (matcher.group (2)).getOrElse ("")
  val col2 = Option (matcher.group (4)).getOrElse ("")
  // ...
}
import scala.util.parsing.combinator._
import scala.util.parsing.combinator.syntactical._
/** CSV parser for two-column, semicolon-separated input whose values are optionally wrapped
  * in double quotes. Built on `RegexParsers`.
  *
  * Loading the object runs [[test]] once, so a failing self-check surfaces on first use.
  */
object csvParser extends RegexParsers {
  test // Perform unit-testing on first load.

  // Turns off whitespace removal: line separators are an important part of the CSV format...
  override def skipWhitespace: Boolean = false

  /** Line separator: either `\r\n` or a bare `\n`. */
  def CRLF: Parser[String] = "\r\n" | "\n"

  /** Regex matching only at the very end of the input (zero-width). */
  def EOF: scala.util.matching.Regex = "\\z".r

  // Any number of columns, but no backtracking over accidental double-quotes.

  /** A double-quoted value (quotes removed) or nothing at all, which yields `""`.
    * (?x) ignores the layout whitespace in the pattern; (?s) lets `.` match line breaks.
    */
  def stringInQuotes: Parser[String] =
    """(?xs) ".*?" |""".r ^^ { quoted =>
      if (quoted.length != 0) quoted.substring (1, quoted.length - 1) else ""
    }

  /** One row made of two `stringInQuotes` values separated by `;` and ended by a line
    * separator or the end of the input.
    */
  def line: Parser[List[String]] =
    stringInQuotes ~ ';' ~ stringInQuotes ~ (CRLF | EOF) ^^ {
      case first ~ _ ~ second ~ _ => first :: second :: Nil
    }

  // Fixed number of columns, but backtracking over accidental double-quotes works.

  /** Removes the opening quote and the closing quote from a matched column.
    * When the match ends with `;`, `\r` or `\n`, that trailing separator is dropped too.
    *
    * @param str the raw text matched by the column regex, starting with a double quote
    * @return the text between the quotes
    */
  def unquote (str: String): String = {
    val trailing = str.charAt (str.length - 1) match {
      case ';' | '\r' | '\n' => 2 // closing quote plus the separator after it
      case _ => 1 // closing quote only
    }
    str.substring (1, str.length - trailing)
  }

  /** First column: a quoted value followed by `;`, or a lone `;` for an absent value. */
  def col1: Parser[String] = ("(?s)\".*?\";".r ^^ unquote _) | (";" ^^ (_ => ""))

  /** Second column: a quoted value optionally followed by `\r` or `\n`, or an absent value
    * signalled by a line-break character or the end of the input.
    */
  def col2: Parser[String] =
    ("(?s)\".*?\"(\\r|\\n)?".r ^^ unquote _) | (("\r" | "\n" | EOF) ^^ (_ => ""))

  /** One row as a two-element list. The optional `\n` picks up the second half of a `\r\n`
    * separator, whose `\r` has already been consumed by `col2`.
    */
  def twoColumns: Parser[List[String]] =
    col1 ~ col2 ~ opt ("\n") ^^ { case v1 ~ v2 ~ _ => v1 :: v2 :: Nil }

  /** The whole document: one or more rows. */
  def csv: Parser[List[List[String]]] = rep1 (twoColumns)

  /** Extracts the value of a successful parse.
    *
    * @param result the outcome returned by `parse`
    * @return the parsed data when `result` is a `Success`
    * @throws Exception carrying the parser's own description when `result` is a `Failure` or `Error`
    */
  def unwrap[T] (result: ParseResult[T]): T = result match {
    case Success (data, _) => data
    case noSuccess: NoSuccess => throw new Exception (noSuccess.toString)
  }

  /** Self-check executed when the object is first loaded; an `assert` fails on any mismatch. */
  def test: Unit = {
    // Parses `s` with `csv` and compares the rows structurally with `expect`.
    def check (s: String, expect: List[List[String]]): Unit = {
      val result = unwrap (parse (csv, s))
      assert (result == expect, "expected: \n" + expect + "\n; got: \n" + result)
    }
    check (";", List (List ("", ""))) // One string with both columns absent.
    check ("\"qq\nqq\";", List (List ("qq\nqq", "")))
    check (";\" name1 \n name2 \"", List (List ("", " name1 \n name2 ")))
    check ("\"qq\";\"zz\nzz\"", List (List ("qq", "zz\nzz")))
    check ("\"qq\";\"zz\"\n", List (List ("qq", "zz")))
    check (";\n;\n;", List (List ("", ""), List ("", ""), List ("", "")))
    check ("\"qq\";\"zz\"\n\"qq\";\"zz\"\n\"qq\";\"zz\"", List (List ("qq", "zz"), List ("qq", "zz"), List ("qq", "zz")))
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment