Last active
March 4, 2016 07:43
-
-
Save piyo7/f5479b62fc920baaf5e0 to your computer and use it in GitHub Desktop.
メモ化再帰で始める構文解析 ref: http://qiita.com/piyo7/items/3bec96ee2b5c052c3435
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
sbt.version=0.13.11 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name := "pcfg" | |
version := "0.0.0" | |
scalaVersion := "2.11.7" | |
libraryDependencies ++= Seq( | |
"com.atilika.kuromoji" % "kuromoji-ipadic" % "0.9.0", | |
"org.scalaz" %% "scalaz-core" % "7.2.1" | |
) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
S 0.2 (8.00e-04) | |
| | |
+- 名詞句 1.0 (1.00e-01) | |
| | | |
| +- 名詞 1.0 (1.00e-01) | |
| | | | |
| | +- 形容詞 0.1 (1.00e-01) | |
| | | | | |
| | | +- 名詞 - 隣 | |
| | | | | |
| | | `- 助詞 - の | |
| | | | |
| | `- 名詞 - 客 | |
| | | |
| `- 助詞 - は | |
| | |
`- 動詞 0.5 (4.00e-02) | |
| | |
+- 名詞 1.0 (8.00e-02) | |
| | | |
| +- 形容詞 0.4 (8.00e-02) | |
| | | | |
| | +- 副詞 - よく | |
| | | | |
| | `- 形容詞 0.2 (2.00e-01) | |
| | | | |
| | +- 名詞 - 柿 | |
| | | | |
| | `- 動詞 - 食う | |
| | | |
| `- 名詞 - 客 | |
| | |
`- 助動詞 - だ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
case class Rule(left: String, right1: String, right2: String, prob: Double) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
case class PcfgNode(label: Either[Rule, Token], prob: Double) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def calcPcfgTree: ((Seq[Token], Seq[Rule], String)) => Option[Tree[PcfgNode]] = | |
Memo.mutableHashMapMemo { case (inputs: Seq[Token], rules: Seq[Rule], target: String) => | |
inputs.size match { | |
case 0 => | |
None | |
case 1 => | |
if (inputs.head.getPartOfSpeechLevel1 == target) { | |
Some(PcfgNode(Right(inputs.head), 1.0).leaf) | |
} else { | |
None | |
} | |
case size => | |
val trees = | |
for { | |
rule <- rules if rule.left == target | |
i <- Range(1, size) | |
child1 <- calcPcfgTree(inputs.take(i), rules, rule.right1) | |
child2 <- calcPcfgTree(inputs.takeRight(size - i), rules, rule.right2) | |
} yield { | |
val rootProb = rule.prob * child1.rootLabel.prob * child2.rootLabel.prob | |
PcfgNode(Left(rule), rootProb).node(child1, child2) | |
} | |
trees.reduceOption(Ordering.by((_: Tree[PcfgNode]).rootLabel.prob).max) | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
implicit val showPcfgNode = Show.show { node: PcfgNode => | |
node.label match { | |
case Left(rule) => f"${rule.left} ${rule.prob} (${node.prob}%.2e)" | |
case Right(token) => f"${token.getPartOfSpeechLevel1} - ${token.getBaseForm}" | |
} | |
} | |
for (r <- result) println(r.drawTree) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import com.atilika.kuromoji.ipadic.{Token, Tokenizer} | |
import scala.collection.JavaConverters._ | |
import scalaz.Scalaz.ToTreeOps | |
import scalaz.{Memo, Show, Tree} | |
case class Rule(left: String, right1: String, right2: String, prob: Double) | |
case class PcfgNode(label: Either[Rule, Token], prob: Double) | |
object Main extends App { | |
val rules = Seq( | |
Rule("S", "名詞句", "形容詞", 0.5), | |
Rule("S", "名詞句", "名詞", 0.3), | |
Rule("S", "名詞句", "動詞", 0.2), | |
Rule("名詞", "形容詞", "名詞", 1), | |
Rule("名詞句", "名詞", "助詞", 1), | |
Rule("形容詞", "名詞", "助詞", 0.1), | |
Rule("形容詞", "名詞", "動詞", 0.2), | |
Rule("形容詞", "副詞", "形容詞", 0.4), | |
Rule("形容詞", "副詞", "形容詞", 0.3), | |
Rule("動詞", "副詞", "動詞句", 0.5), | |
Rule("動詞", "名詞", "助動詞", 0.5) | |
) | |
val tokenizer = new Tokenizer() | |
val tokens = tokenizer.tokenize("隣の客はよく柿食う客だ").asScala.toVector | |
val result = calcPcfgTree(tokens, rules, "S") | |
implicit val showPcfgNode = Show.show { node: PcfgNode => | |
node.label match { | |
case Left(rule) => f"${rule.left} ${rule.prob} (${node.prob}%.2e)" | |
case Right(token) => f"${token.getPartOfSpeechLevel1} - ${token.getBaseForm}" | |
} | |
} | |
for (r <- result) println(r.drawTree) | |
def calcPcfgTree: ((Seq[Token], Seq[Rule], String)) => Option[Tree[PcfgNode]] = | |
Memo.mutableHashMapMemo { case (inputs: Seq[Token], rules: Seq[Rule], target: String) => | |
inputs.size match { | |
case 0 => | |
None | |
case 1 => | |
if (inputs.head.getPartOfSpeechLevel1 == target) { | |
Some(PcfgNode(Right(inputs.head), 1.0).leaf) | |
} else { | |
None | |
} | |
case size => | |
val trees = | |
for { | |
rule <- rules if rule.left == target | |
i <- Range(1, size) | |
child1 <- calcPcfgTree(inputs.take(i), rules, rule.right1) | |
child2 <- calcPcfgTree(inputs.takeRight(size - i), rules, rule.right2) | |
} yield { | |
val rootProb = rule.prob * child1.rootLabel.prob * child2.rootLabel.prob | |
PcfgNode(Left(rule), rootProb).node(child1, child2) | |
} | |
trees.reduceOption(Ordering.by((_: Tree[PcfgNode]).rootLabel.prob).max) | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment