TokenFSM - a simple token-based FSM for efficient token matching from a lexicon, etc.
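A minimal usage sketch, based on the apply/matches API defined in the source below (the lexicon and sentence here are purely illustrative):

val lexicon = Seq(Seq("white", "gold"), Seq("diamonds"))
val fsm = TokenFSM(lexicon)
// matches returns (phrase, (start, end)) pairs, where start/end are inclusive token indices
val found = fsm.matches("a white gold bracelet with diamonds".split(" ").toSeq)
// found == Seq((Seq("white", "gold"), (1, 2)), (Seq("diamonds"), (5, 5)))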
package tokenfsm

import scala.annotation.tailrec
import scala.collection.immutable.SortedSet

/**
 * Created by retnuh on 04/07/2016.
 */
object TokenFSM {

  // Orders candidate phrases by length first and then element-wise, giving the SortedSets
  // below a stable ordering from shortest to longest phrase.
  private implicit def lengthOrdering[T](implicit ordering: Ordering[T]): Ordering[Seq[T]] = new Ordering[Seq[T]] {
    override def compare(x: Seq[T], y: Seq[T]): Int = {
      val lc = x.lengthCompare(y.length)
      if (lc != 0) lc
      else if (x.isEmpty) 0 // same length and both exhausted: the phrases are equal
      else {
        val fc = ordering.compare(x.head, y.head)
        if (fc != 0) fc else compare(x.tail, y.tail)
      }
    }
  }

  // Builds the FSM from a lexicon of token phrases, grouped by their first token.
  def apply[T](states: Seq[Seq[T]])(implicit ordering: Ordering[T]) = {
    val initialStates = states.foldLeft(Map.empty[T, SortedSet[Seq[T]]])((m, ts) => {
      m + (ts.head -> (m.getOrElse(ts.head, SortedSet.empty[Seq[T]](lengthOrdering)) + ts))
    })
    new TokenFSM(initialStates)
  }
}
class TokenFSM[T](initialStates: Map[T, SortedSet[Seq[T]]])(implicit ordering: Ordering[T]) {

  import TokenFSM._
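  // One step per input token: spawn a Matching state for every lexicon phrase that begins with
  // the current token (longer phrases dominate shorter ones, and states already in progress
  // dominate the newly spawned candidates), then advance every state with nextToken. When the
  // input is exhausted, complete collects the surviving Matched states as (phrase, (start, end)).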
  @tailrec
  private def matchEach(tokens: Seq[T], pos: Int, inProgress: MachineState[T]): Seq[(Seq[T], (Int, Int))] = {
    if (tokens.isEmpty) {
      println(inProgress)
      println(inProgress.complete)
      return inProgress.complete.collect({ case Matched(t, s, e) => (t, (s, e)) })
    }
    println("=================")
    val head = tokens.head
    println(s"head: $head")
    val dominant: MachineState[T] = initialStates.getOrElse(head, SortedSet.empty[Seq[T]](lengthOrdering))
      .foldRight(inProgress)((ts: Seq[T], dominant: MachineState[T]) => dominant.dominate(Matching(ts, 0, pos)))
    println(s"dominant before update: $dominant")
    println("------------------")
    val updated: MachineState[T] = dominant.nextToken(head, pos)
    println(s"updated: $updated")
    matchEach(tokens.tail, pos + 1, updated)
  }

  def matches(tokens: Seq[T]): Seq[(Seq[T], (Int, Int))] = {
    matchEach(tokens, 0, NonMatch[T]())
  }
}
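// State hierarchy used by the matcher:
//   Matching           - a phrase partially matched so far (pos tokens consumed, begun at start)
//   Matched            - a phrase fully matched over the inclusive input positions [start, end]
//   NonMatch           - a dead state: the phrase being tracked failed on some token
//   DominatingMatching - pairs a dominant state with a subjugated one, so that a completed
//                        dominant match suppresses the shorter subjugated matches it covers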
sealed trait MachineState[T] {
  def nextToken(token: T, offset: Int): MachineState[T]

  def complete: Seq[MachineState[T]]

  def dominate(subject: MachineState[T]): MachineState[T] = (this, subject) match {
    case (NonMatch(), s) => s
    case (t, NonMatch()) => t
    case (DominatingMatching(d, os), ns) => println(s"$d already dominates $os now dominating $ns"); DominatingMatching(d, os.dominate(ns))
    case (t, s) => println(s"$t dominating $s"); DominatingMatching(t, s)
  }
}
trait InProgress[T] extends MachineState[T]

case class DominatingMatching[T](dominant: MachineState[T], subjugated: MachineState[T]) extends InProgress[T] {

  override def nextToken(token: T, offset: Int): MachineState[T] = {
    val d = dominant.nextToken(token, offset)
    val subj = subjugated.nextToken(token, offset)
    d match {
      // If the dominant no longer matches, just fall back to the subjugated match
      case NonMatch() => subj
      // Otherwise keep both going: a subjugated match may still fail at a later point, or may
      // outlast the dominant one, so the final resolution is deferred until complete.
      case _ => DominatingMatching(d, subj)
    }
  }

  override def complete: Seq[MachineState[T]] = {
    val d = dominant.complete
    val subjs = subjugated.complete
    d match {
      // If the dominant has matched, it overrides any subjugated matches that end at or before
      // its own end; subjugated matches that extend past it are kept.
      case Seq(x: Matched[T]) => x +: subjs.collect({ case s: Matched[T] if s.end > x.end => s })
      // If the dominant hasn't matched, just fall back to the subjugated matches
      case _ => subjs
    }
  }
}
case class Matching[T](tokens: Seq[T], pos: Int, start: Int) extends InProgress[T] {

  override def nextToken(token: T, offset: Int): MachineState[T] = {
    val nextPos = pos + 1
    if (tokens(pos) == token) {
      if (nextPos == tokens.length)
        Matched(tokens, start, offset)
      else
        Matching(tokens, nextPos, start)
    } else {
      NonMatch[T]()
    }
  }

  override def complete: Seq[MachineState[T]] = Seq.empty
}
case class Matched[T](tokens: Seq[T], start: Int, end: Int) extends MachineState[T] {
  override def nextToken(token: T, offset: Int): MachineState[T] = this

  override def complete: Seq[MachineState[T]] = Seq(this)
}

case class NonMatch[T]() extends MachineState[T] {
  override def nextToken(token: T, offset: Int): MachineState[T] = this

  override def complete: Seq[MachineState[T]] = Seq.empty
}
TokenFSMSpecs: a ScalaTest spec exercising the matcher.
package tokenfsm

import org.scalatest.Matchers

/**
 * Created by retnuh on 04/07/2016.
 */
class TokenFSMSpecs extends org.scalatest.FlatSpec with Matchers {

  val ws = """\s+""".r

  def split(s: String) = ws.pattern.split(s).to[Seq]

  val sentence = split("the quick brown fox jumped over the lazy dog")
"A TokenFSM" should "match the tokens and offsets from a seq of tokens" in { | |
val fsm = TokenFSM(Seq(Seq("quick", "brown"), Seq("lazy", "dog"), Seq("the", "quick", "fail"))) | |
val matches = fsm.matches(sentence) | |
println(matches) | |
matches.length shouldBe 2 | |
matches(0) shouldBe(Seq("quick", "brown"), (1, 2)) | |
matches(1) shouldBe(Seq("lazy", "dog"), (7, 8)) | |
} | |
it should "allow overlapping matches" in { | |
val fsm = TokenFSM(Seq(split("fluffy pink"), split("pink bunny"), split("pink"))) | |
val matches = fsm.matches(split("the fluffy pink bunny is awesome")) | |
matches.length shouldBe 2 | |
matches(0) shouldBe(split("fluffy pink"), (1, 2)) | |
matches(1) shouldBe(split("pink bunny"), (2, 3)) | |
} | |
it should "not match smaller token phrases that are contained in larger matches" in { | |
val fsm = TokenFSM(Seq(split("gold"), split("white gold"), split("yellow"))) | |
val matches = fsm.matches(split("yellow and white gold")) | |
println(matches) | |
matches.length shouldBe 2 | |
matches(0) shouldBe(split("yellow"), (0, 0)) | |
matches(1) shouldBe(split("white gold"), (2, 3)) | |
} | |
it should "have smaller phrases match if a larger phrase does not complete by the end of the sentence" in { | |
val fsm = TokenFSM(Seq(split("a"), split("b c"), split("b c d"), split("a b c d e f"))) | |
val matches = fsm.matches(split("a b c d e")) | |
println(matches) | |
matches.length shouldBe 2 | |
matches(0) shouldBe(split("a"), (0, 0)) | |
matches(1) shouldBe(split("b c d"), (1, 3)) | |
} | |
it should "remove duplicate matches" in { | |
val fsm = TokenFSM(Seq(split("a"), split("a b x"), split("a b y"), split("a b z"))) | |
val matches = fsm.matches(split("a b c")) | |
matches.length shouldBe 1 | |
matches(0) shouldBe(split("a"), (0, 0)) | |
} | |
it should "not include smaller matches even when one dominant match fails but other matches" in { | |
val text = "yellow rose and white gold bracelet with diamonds Cartier, price upon request. Spike cuff, Giles & Brother by Philip Crangi, $88." | |
val fsm = TokenFSM(Seq(split("yellow"), split("rose"), split("white gold"), split("white pearls"), split("gold puree"), | |
split("gold"), split("white"), split("diamonds"))) | |
val matches = fsm.matches(split(text)) | |
println(s"matches: $matches") | |
matches shouldNot contain((Seq("white"), (3, 3))) | |
matches shouldNot contain((Seq("gold"), (4, 4))) | |
matches should contain((split("white gold"), (3, 4))) | |
matches.length shouldBe 4 | |
} | |
} |
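For reference, a minimal build.sbt sketch that should compile the gist and run the spec; the Scala and ScalaTest versions are assumptions chosen to match the 2016-era APIs used above (FlatSpec/Matchers at the org.scalatest package root, and .to[Seq]):

name := "tokenfsm"
scalaVersion := "2.11.8" // assumption: any 2.11.x/2.12.x release should work
libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.1" % Test // assumption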