Skip to content

Instantly share code, notes, and snippets.

@meglio
Created November 28, 2012 11:22
Show Gist options
  • Save meglio/4160578 to your computer and use it in GitHub Desktop.
Save meglio/4160578 to your computer and use it in GitHub Desktop.
Parse the OpenCorpora dictionary
package com.frazemet
import scala.io.Source
import xml.pull._
import collection.mutable.ListBuffer
/**
 * Importer for an OpenCorpora-style dictionary XML file.
 *
 * The document is processed event by event through an `XMLEventReader`;
 * every completed `<lemma>` and `<link>` element is handed to `save()`.
 */
object OpenCorpora {

  /**
   * Reads the XML file at `file` (decoded as UTF-8) and calls `save()` on each
   * lemma and link found in it. `save()` currently only prints the entity to
   * standard output.
   *
   * The underlying file is always closed, whether parsing succeeds or fails.
   *
   * @param file path of the dictionary XML file
   * @throws Error if a `<lemma>`, `<f>` or `<link>` element lacks a required attribute
   */
  def importFromXml(file: String): Unit = {

    /** The entity currently being assembled from the event stream. */
    sealed abstract class Parsed {
      /** Emits the finished entity; for now this just prints it. */
      def save(): Unit = {
        println(s"Saving: $toString")
      }
    }

    /** Marker state: the parser is not inside a `<lemma>` or `<link>` element. */
    object NoInterest extends Parsed

    /** An entity that accumulates the `v` attribute values of its `<g>` children. */
    trait WithGrammems extends Parsed {
      val g: ListBuffer[String] = ListBuffer.empty
    }

    /** A single word form (`<f t="...">`) belonging to `lemma`. */
    case class WordForm(lemma: Lemma, text: String) extends WithGrammems {
      override def toString = s"""WordForm("$text", g:${g.mkString(",")})"""
    }

    /**
     * A lemma (`<lemma id="..." rev="...">`). Its text is filled in later from the
     * nested `<l t="...">` element and its word forms from nested `<f>` elements.
     */
    case class Lemma(id: BigInt, rev: BigInt) extends WithGrammems {
      var text = ""
      val f: ListBuffer[WordForm] = ListBuffer.empty
      override def toString = s"""Lemma("$text", ID: $id, rev: $rev, g:${g.mkString(",")}, f:\n${f.mkString("- ", "\n- ", "")} )"""
    }

    /** A link between two lemmas (`<link id from to type>`). */
    case class Link(id: BigInt, from: BigInt, to: BigInt, typeNum: Int) extends Parsed

    /** Builds entities from XML attribute maps, validating required attributes first. */
    object Factory {

      /** Throws an `Error` naming the first key of `keys` that is absent from `map`. */
      def ensureKeys(entityName: String, keys: List[String], map: Map[String, String]): Unit = {
        keys.find(key => !map.contains(key)).foreach { s =>
          throw new Error(s"$entityName: '$s' missing")
        }
      }

      def newWordForm(lemma: Lemma, xmlAttrs: Map[String, String]) = {
        ensureKeys("Word Form (<f>)", List("t"), xmlAttrs)
        WordForm(lemma, xmlAttrs("t"))
      }

      def newLemma(xmlAttrs: Map[String, String]) = {
        ensureKeys("Lemma (<lemma>)", List("id", "rev"), xmlAttrs)
        Lemma(BigInt(xmlAttrs("id")), BigInt(xmlAttrs("rev")))
      }

      def newLink(xmlAttrs: Map[String, String]) = {
        // Links come from <link> elements; <l> is the lemma-text element.
        ensureKeys("Link (<link>)", List("id", "from", "to", "type"), xmlAttrs)
        Link(BigInt(xmlAttrs("id")), BigInt(xmlAttrs("from")), BigInt(xmlAttrs("to")), xmlAttrs("type").toInt)
      }
    }

    /** Drives a small state machine over the event stream until it is exhausted. */
    def parse(xml: XMLEventReader): Unit = {
      // The dictionary may be large, so the recursion must compile to a loop.
      @scala.annotation.tailrec
      def loop(curParsed: Parsed): Unit = {
        if (xml.hasNext) {
          val nextParsed: Parsed = xml.next() match {
            case EvElemStart(_, label, attrs, _) =>
              val attrMap = attrs.asAttrMap
              (label, curParsed) match {
                case ("lemma", NoInterest) =>
                  Factory.newLemma(attrMap)
                case ("link", NoInterest) =>
                  Factory.newLink(attrMap)
                case ("f", p: Lemma) =>
                  // Descend into the word form; the matching </f> returns to the lemma.
                  val form = Factory.newWordForm(p, attrMap)
                  p.f += form
                  form
                case ("l", p: Lemma) if attrMap.contains("t") =>
                  p.text = attrMap("t")
                  p
                case ("g", p: WithGrammems) if attrMap.contains("v") =>
                  // Attaches to whichever entity is current: the lemma (inside <l>)
                  // or the word form (inside <f>).
                  p.g += attrMap("v")
                  p
                case _ =>
                  curParsed
              }
            case EvElemEnd(_, label) =>
              (label, curParsed) match {
                case ("lemma", p: Lemma) =>
                  p.save()
                  NoInterest
                case ("link", p: Link) =>
                  p.save()
                  NoInterest
                case ("f", p: WordForm) =>
                  p.lemma
                case _ =>
                  curParsed
              }
            case _ =>
              curParsed
          }
          loop(nextParsed)
        }
      }
      loop(NoInterest)
    }

    val source = Source.fromFile(file, "UTF-8")
    try {
      parse(new XMLEventReader(source))
    } finally {
      // Release the file handle even when parsing aborts with an exception.
      source.close()
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment