@ElectricCoffee
Last active February 3, 2016 10:48
Primitive tokenizer in Scala
package eu.wausoft.lex
import java.lang.{String => JString} // renames java.lang.String to JString to avoid clashing with the String token defined below
trait Token
case class String(value: JString) extends Token
case class Number(value: JString) extends Token
case class Atom(value: JString) extends Token
case class Keyword(value: JString) extends Token
case object Unknown extends Token
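// Hypothetical example, not part of the original gist: a sketch of how downstream code
// might consume the Token ADT by pattern matching on its variants.
object TokenShow {
  def describe(t: Token): JString = t match {
    case String(v)  => s"string  $v"
    case Number(v)  => s"number  $v"
    case Atom(v)    => s"atom    $v"
    case Keyword(v) => s"keyword $v"
    case Unknown    => "unknown token"
  }
}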
object Lexer {
  // triple-quoted strings (""") are raw strings: escape sequences are not processed, much like @"" strings in C#
  private val string = """(".*")""".r          // matches strings: any characters between a pair of ""
  private val number = """(\d*\.\d+|\d+)""".r  // matches numbers in the following formats: 123 123.456 .123
  private val atom   = """('[\w\-]+)""".r      // matches expressions like 'hello 'foo-bar 'baz_quux
  private val keywd  = """(:[\w\-]+)""".r      // matches expressions like :hello :foo-bar :baz_quux
  // \s+(?=([^"]*"[^"]*")*[^"]*$) matches only whitespace followed by an even number of ",
  // i.e. whitespace outside of a "" pair, allowing strings with spaces inside
  def tokenize(str: JString): List[Token] = str.split("""\s+(?=([^"]*"[^"]*")*[^"]*$)""").map {
    case string(v) => String(v)
    case number(v) => Number(v)
    case atom(v)   => Atom(v)
    case keywd(v)  => Keyword(v)
    case _         => Unknown
  }.toList
}
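// A minimal usage sketch, not part of the original gist; the sample input below is just an
// assumption to illustrate how tokenize classifies each whitespace-separated chunk.
object LexerDemo extends App {
  val tokens = Lexer.tokenize("""say "hello world" 42 .5 'foo-bar :baz-quux""")
  tokens.foreach(println)
  // expected output, assuming the regexes above:
  //   Unknown                  ("say" matches none of the patterns)
  //   String("hello world")
  //   Number(42)
  //   Number(.5)
  //   Atom('foo-bar)
  //   Keyword(:baz-quux)
}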