Created
November 28, 2012 11:22
-
-
Save meglio/4160578 to your computer and use it in GitHub Desktop.
Parse opencorpora dictionary
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.frazemet
import scala.io.Source
import xml.pull._
import collection.mutable.ListBuffer
/** Streaming importer for an OpenCorpora dictionary XML file.
  *
  * The file is read with a pull parser, so the document is never held in
  * memory as a whole. Only `<lemma>` and `<link>` elements (and, inside a
  * lemma, `<l>`, `<f>` and `<g>`) are interpreted; everything else is ignored.
  */
object OpenCorpora {

  /** Parses `file` as UTF-8 XML and "saves" (currently: prints) every completed
    * lemma and link.
    *
    * @param file path of the XML file to read
    * @throws Error if a recognised element lacks one of its required attributes
    * @throws NumberFormatException if a numeric attribute (`id`, `rev`, `from`,
    *                               `to`, `type`) cannot be parsed
    */
  def importFromXml(file: String): Unit = {

    /** Base of everything the parser can currently be "inside of". */
    sealed abstract class Parsed {
      /** Persists the entity; for now it is only printed to stdout. */
      def save(): Unit = {
        println(s"Saving: $toString")
      }
    }

    /** Parser state used while outside any element we care about. */
    object NoInterest extends Parsed

    /** An entity that accumulates grammeme values taken from `<g v="...">`. */
    trait WithGrammems extends Parsed {
      val g: ListBuffer[String] = ListBuffer.empty
    }

    /** A word form (`<f t="...">`) belonging to `lemma`. */
    case class WordForm(lemma: Lemma, text: String) extends WithGrammems {
      override def toString = s"""WordForm("$text", g:${g.mkString(",")})"""
    }

    /** A lemma (`<lemma id="..." rev="...">`); `text` is filled from `<l t="...">`
      * and `f` collects the word forms seen before the closing tag.
      */
    case class Lemma(id: BigInt, rev: BigInt) extends WithGrammems {
      var text = ""
      val f: ListBuffer[WordForm] = ListBuffer.empty
      override def toString = s"""Lemma("$text", ID: $id, rev: $rev, g:${g.mkString(",")}, f:\n${f.mkString("- ", "\n- ", "")} )"""
    }

    /** A link between two lemma ids (`<link id from to type>`). */
    case class Link(id: BigInt, from: BigInt, to: BigInt, typeNum: Int) extends Parsed

    /** Builds entities from raw XML attribute maps, validating required keys first. */
    object Factory {

      /** Throws `Error` naming the first key of `keys` that is absent from `map`. */
      def ensureKeys(entityName: String, keys: List[String], map: Map[String, String]): Unit = {
        // `Error` (not an Exception subtype) is kept on purpose: it is what
        // existing callers of importFromXml already observe.
        keys.find(k => !map.contains(k)).foreach { s =>
          throw new Error(s"$entityName: '$s' missing")
        }
      }

      def newWordForm(lemma: Lemma, xmlAttrs: Map[String, String]): WordForm = {
        ensureKeys("Word Form (<f>)", List("t"), xmlAttrs)
        WordForm(lemma, xmlAttrs("t"))
      }

      def newLemma(xmlAttrs: Map[String, String]): Lemma = {
        ensureKeys("Lemma (<lemma>)", List("id", "rev"), xmlAttrs)
        Lemma(BigInt(xmlAttrs("id")), BigInt(xmlAttrs("rev")))
      }

      def newLink(xmlAttrs: Map[String, String]): Link = {
        ensureKeys("Link (<l>)", List("id", "from", "to", "type"), xmlAttrs)
        Link(BigInt(xmlAttrs("id")), BigInt(xmlAttrs("from")), BigInt(xmlAttrs("to")), xmlAttrs("type").toInt)
      }
    }

    /** Consumes all events from `xml`, saving each lemma/link when its end tag arrives. */
    def parse(xml: XMLEventReader): Unit = {

      /** Computes the parser state after `event`, given the state before it. */
      def step(current: Parsed, event: XMLEvent): Parsed = event match {
        case EvElemStart(_, label, attrs, _) =>
          val attrMap = attrs.asAttrMap
          (label, current) match {
            case ("lemma", NoInterest) =>
              Factory.newLemma(attrMap)
            case ("link", NoInterest) =>
              Factory.newLink(attrMap)
            case ("f", lemma: Lemma) =>
              // Descend into the word form; its end tag returns us to the lemma.
              val form = Factory.newWordForm(lemma, attrMap)
              lemma.f += form
              form
            case ("l", lemma: Lemma) if attrMap.contains("t") =>
              lemma.text = attrMap("t")
              lemma
            case ("g", holder: WithGrammems) if attrMap.contains("v") =>
              // Grammemes attach to whichever entity is currently open
              // (the lemma itself or one of its word forms).
              holder.g += attrMap("v")
              holder
            case _ =>
              current
          }

        case EvElemEnd(_, label) =>
          (label, current) match {
            case ("lemma", lemma: Lemma) =>
              lemma.save()
              NoInterest
            case ("link", link: Link) =>
              link.save()
              NoInterest
            case ("f", form: WordForm) =>
              form.lemma
            case _ =>
              current
          }

        case _ =>
          current
      }

      // foldLeft drives the iterator with a plain loop, so arbitrarily long
      // event streams cannot exhaust the stack. The final state is discarded:
      // an element whose end tag never arrives is not saved.
      xml.foldLeft[Parsed](NoInterest)(step)
      ()
    }

    val source = Source.fromFile(file)("UTF-8")
    // Always release the underlying file handle, including when parsing fails.
    try parse(new XMLEventReader(source))
    finally source.close()
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment