Last active
February 3, 2016 10:48
-
-
Save ElectricCoffee/fe655e0990d49cdb162d to your computer and use it in GitHub Desktop.
Primitive tokenizer in Scala
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package eu.wausoft.lex | |
| import java.lang.{String => JString} // renames the built-in string to JString to avoid token clashing | |
/** A lexical token. The hierarchy is sealed so the compiler can check matches for exhaustiveness. */
sealed trait Token

/**
  * A string-literal token.
  *
  * NOTE: this deliberately shadows `java.lang.String` inside this package; use the
  * `JString` alias wherever the built-in string type is meant.
  *
  * @param value the raw token text as it appeared in the input
  */
final case class String(value: JString) extends Token

/**
  * A numeric-literal token.
  *
  * @param value the raw, unparsed token text (no numeric conversion is performed)
  */
final case class Number(value: JString) extends Token

/**
  * An atom token.
  *
  * @param value the raw token text as it appeared in the input
  */
final case class Atom(value: JString) extends Token

/**
  * A keyword token.
  *
  * @param value the raw token text as it appeared in the input
  */
final case class Keyword(value: JString) extends Token

/** Placeholder for input that matched none of the other token kinds; carries no text. */
case object Unknown extends Token
/**
  * Primitive regex-based tokenizer.
  *
  * Input is cut into chunks on whitespace that lies outside double-quoted strings, and each
  * chunk is classified by matching it in full against the patterns below.
  */
object Lexer {
  // Triple-quoted literals are raw strings: backslashes reach the regex engine untouched
  // (comparable to @"" in C#).

  // A double-quoted string such as "hello world". The captured value keeps the surrounding
  // quotes. [^"]* (rather than .*) lets the body span line breaks and rejects stray inner quotes.
  private val string = """("[^"]*")""".r

  // Numbers in the forms 123, 123.456 and .123. The dot is escaped so it only matches a literal
  // '.', and at least one digit is required so the empty string is never a number.
  private val number = """(\d+(?:\.\d+)?|\.\d+)""".r

  // Atoms: a leading apostrophe followed by word characters or dashes, e.g. 'hello 'foo-bar 'baz_quux
  private val atom = """('[\w\-]+)""".r

  // Keywords: a leading colon followed by word characters or dashes, e.g. :hello :foo-bar :baz_quux
  private val keywd = """(:[\w\-]+)""".r

  // Whitespace that is followed by an EVEN number of double quotes up to the end of the input,
  // i.e. whitespace that is not inside a "" pair. The lookahead rescans the rest of the input
  // for every whitespace run, which is quadratic in the worst case.
  private val separator = """\s+(?=(?:[^"]*"[^"]*")*[^"]*$)"""

  /**
    * Splits `str` into tokens.
    *
    * Whitespace inside a double-quoted string does not separate tokens. If the input contains
    * an unbalanced quote, the quote-parity check cannot tell inside from outside, and affected
    * chunks will typically come out as [[Unknown]].
    *
    * @param str the text to tokenize
    * @return the tokens in input order; empty for empty or whitespace-only input
    */
  def tokenize(str: JString): List[Token] =
    str.split(separator).toList
      .filter(_.nonEmpty) // leading whitespace (or empty input) yields an empty first chunk
      .map {
        case string(v) => String(v)
        case number(v) => Number(v)
        case atom(v)   => Atom(v)
        case keywd(v)  => Keyword(v)
        case _         => Unknown
      }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment