Skip to content

Instantly share code, notes, and snippets.

@piyo7
Last active March 4, 2016 07:43
Show Gist options
  • Save piyo7/f5479b62fc920baaf5e0 to your computer and use it in GitHub Desktop.
Save piyo7/f5479b62fc920baaf5e0 to your computer and use it in GitHub Desktop.
メモ化再帰で始める構文解析 ref: http://qiita.com/piyo7/items/3bec96ee2b5c052c3435
sbt.version=0.13.11
name := "pcfg"
version := "0.0.0"
scalaVersion := "2.11.7"
libraryDependencies ++= Seq(
"com.atilika.kuromoji" % "kuromoji-ipadic" % "0.9.0",
"org.scalaz" %% "scalaz-core" % "7.2.1"
)
S 0.2 (8.00e-04)
|
+- 名詞句 1.0 (1.00e-01)
| |
| +- 名詞 1.0 (1.00e-01)
| | |
| | +- 形容詞 0.1 (1.00e-01)
| | | |
| | | +- 名詞 - 隣
| | | |
| | | `- 助詞 - の
| | |
| | `- 名詞 - 客
| |
| `- 助詞 - は
|
`- 動詞 0.5 (4.00e-02)
|
+- 名詞 1.0 (8.00e-02)
| |
| +- 形容詞 0.4 (8.00e-02)
| | |
| | +- 副詞 - よく
| | |
| | `- 形容詞 0.2 (2.00e-01)
| | |
| | +- 名詞 - 柿
| | |
| | `- 動詞 - 食う
| |
| `- 名詞 - 客
|
`- 助動詞 - だ
case class Rule(left: String, right1: String, right2: String, prob: Double)
case class PcfgNode(label: Either[Rule, Token], prob: Double)
def calcPcfgTree: ((Seq[Token], Seq[Rule], String)) => Option[Tree[PcfgNode]] =
Memo.mutableHashMapMemo { case (inputs: Seq[Token], rules: Seq[Rule], target: String) =>
inputs.size match {
case 0 =>
None
case 1 =>
if (inputs.head.getPartOfSpeechLevel1 == target) {
Some(PcfgNode(Right(inputs.head), 1.0).leaf)
} else {
None
}
case size =>
val trees =
for {
rule <- rules if rule.left == target
i <- Range(1, size)
child1 <- calcPcfgTree(inputs.take(i), rules, rule.right1)
child2 <- calcPcfgTree(inputs.takeRight(size - i), rules, rule.right2)
} yield {
val rootProb = rule.prob * child1.rootLabel.prob * child2.rootLabel.prob
PcfgNode(Left(rule), rootProb).node(child1, child2)
}
trees.reduceOption(Ordering.by((_: Tree[PcfgNode]).rootLabel.prob).max)
}
}
implicit val showPcfgNode = Show.show { node: PcfgNode =>
node.label match {
case Left(rule) => f"${rule.left} ${rule.prob} (${node.prob}%.2e)"
case Right(token) => f"${token.getPartOfSpeechLevel1} - ${token.getBaseForm}"
}
}
for (r <- result) println(r.drawTree)
import com.atilika.kuromoji.ipadic.{Token, Tokenizer}
import scala.collection.JavaConverters._
import scalaz.Scalaz.ToTreeOps
import scalaz.{Memo, Show, Tree}
case class Rule(left: String, right1: String, right2: String, prob: Double)
case class PcfgNode(label: Either[Rule, Token], prob: Double)
object Main extends App {
val rules = Seq(
Rule("S", "名詞句", "形容詞", 0.5),
Rule("S", "名詞句", "名詞", 0.3),
Rule("S", "名詞句", "動詞", 0.2),
Rule("名詞", "形容詞", "名詞", 1),
Rule("名詞句", "名詞", "助詞", 1),
Rule("形容詞", "名詞", "助詞", 0.1),
Rule("形容詞", "名詞", "動詞", 0.2),
Rule("形容詞", "副詞", "形容詞", 0.4),
Rule("形容詞", "副詞", "形容詞", 0.3),
Rule("動詞", "副詞", "動詞句", 0.5),
Rule("動詞", "名詞", "助動詞", 0.5)
)
val tokenizer = new Tokenizer()
val tokens = tokenizer.tokenize("隣の客はよく柿食う客だ").asScala.toVector
val result = calcPcfgTree(tokens, rules, "S")
implicit val showPcfgNode = Show.show { node: PcfgNode =>
node.label match {
case Left(rule) => f"${rule.left} ${rule.prob} (${node.prob}%.2e)"
case Right(token) => f"${token.getPartOfSpeechLevel1} - ${token.getBaseForm}"
}
}
for (r <- result) println(r.drawTree)
def calcPcfgTree: ((Seq[Token], Seq[Rule], String)) => Option[Tree[PcfgNode]] =
Memo.mutableHashMapMemo { case (inputs: Seq[Token], rules: Seq[Rule], target: String) =>
inputs.size match {
case 0 =>
None
case 1 =>
if (inputs.head.getPartOfSpeechLevel1 == target) {
Some(PcfgNode(Right(inputs.head), 1.0).leaf)
} else {
None
}
case size =>
val trees =
for {
rule <- rules if rule.left == target
i <- Range(1, size)
child1 <- calcPcfgTree(inputs.take(i), rules, rule.right1)
child2 <- calcPcfgTree(inputs.takeRight(size - i), rules, rule.right2)
} yield {
val rootProb = rule.prob * child1.rootLabel.prob * child2.rootLabel.prob
PcfgNode(Left(rule), rootProb).node(child1, child2)
}
trees.reduceOption(Ordering.by((_: Tree[PcfgNode]).rootLabel.prob).max)
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment