Skip to content

Instantly share code, notes, and snippets.

@DarkDimius
Created April 15, 2013 21:13
Show Gist options
  • Save DarkDimius/5391334 to your computer and use it in GitHub Desktop.
Save DarkDimius/5391334 to your computer and use it in GitHub Desktop.
/**
 * Trains a word-translation probability table from a sentence-aligned corpus
 * with an expectation-maximisation loop (the structure matches IBM Model 1
 * training; presumably the object name is a transposition of "IBM1").
 *
 * Reads two line-aligned UTF-8 files, runs six EM iterations (`iter` 0 to 5)
 * and writes one tab-separated line per word pair (`fr`, `eng`, probability)
 * to a file named `out<currentTimeMillis>` in the working directory.
 *
 * Iteration 0 uses the relative frequency of each `eng` word as the initial
 * translation weight; later iterations use the table from the previous pass.
 */
object IMB1 extends App {
  type French = String
  type English = String

  case class SentencePair(eng: English, fr: French)
  case class Translation(eng: English, prob: Double)
  case class WordPair(eng: English, fr: French)

  /** Words whose per-sentence normalisation total is not above this are skipped. */
  val EPS = 1e-6

  /** Index of the EM iteration currently running. */
  var iter = 0

  /** True once iterations 0 to 5 have all been run. */
  def finished: Boolean = iter > 5

  /** Memoised tokenisations, keyed by the full sentence string. */
  val splitterCache = collection.mutable.Map[String, Array[String]]()

  /**
   * Splits a sentence on runs of spaces, dropping empty tokens (a leading
   * space would otherwise yield one). Results are cached per sentence string.
   *
   * @param s the sentence to tokenise
   * @return the non-empty tokens of `s`, in order
   */
  def split(s: String): Array[String] =
    splitterCache.getOrElseUpdate(s, s.split(" +").filter(_ != ""))

  /** The aligned corpus, fully loaded into memory; both input files are closed afterwards. */
  val sentencePairs: List[SentencePair] = {
    val frSource =
      scala.io.Source.fromFile("/home/dark/workspace/scala-games/src/main/resources/fr", "UTF-8")
    try {
      val enSource =
        scala.io.Source.fromFile("/home/dark/workspace/scala-games/src/main/resources/en", "UTF-8")
      try {
        // NOTE(review): the line from the "fr" file is stored in
        // SentencePair.eng and the line from the "en" file in SentencePair.fr.
        // This matches the original behaviour (the zip order and the binding
        // names were crossed); confirm whether the swap is intentional before
        // relying on the field names or on the output column order.
        frSource.getLines().zip(enSource.getLines())
          .map { case (frLine, enLine) => SentencePair(frLine, enLine) }
          .toList // materialise before the sources are closed
      } finally enSource.close()
    } finally frSource.close()
  }

  val sentencePairsSize = sentencePairs.size

  /** Expected co-occurrence counts, rebuilt on every iteration. */
  val count = collection.mutable.Map[WordPair, Double]().withDefaultValue(0.0)
  /** Per-`fr`-word sum of the expected counts, rebuilt on every iteration. */
  val totalF = collection.mutable.Map[French, Double]().withDefaultValue(0.0)
  /** Current translation table: count(pair) / totalF(pair.fr). */
  val translations = collection.mutable.Map[WordPair, Double]()
  /** Per-sentence normalisation totals for each `eng` word. */
  val sTotal = collection.mutable.Map[English, Double]().withDefaultValue(0.0)
  /** Relative frequency of each `eng` token; only used to seed iteration 0. */
  val engFreq = collection.mutable.Map[English, Double]().withDefaultValue(0.0)

  var totalEng = 0
  sentencePairs.foreach { pair =>
    for (word <- split(pair.eng)) {
      engFreq(word) = engFreq(word) + 1
      totalEng += 1
    }
  }
  // Turn raw token counts into relative frequencies. The key list is
  // snapshotted so the map is not traversed while its values are rewritten.
  for (key <- engFreq.keys.toList) engFreq(key) = engFreq(key) / totalEng
  println("totalEng = " + totalEng)

  // Report progress roughly 30 times per iteration. The step is clamped to 1
  // because `sentencePairsSize / 30` is 0 for corpora with fewer than 30
  // pairs, which made the modulo below throw ArithmeticException.
  private val progressStep = math.max(1, sentencePairsSize / 30)

  val s = System.nanoTime
  while (!finished) {
    println("iteration " + iter)
    count.clear()
    totalF.clear()
    var done = 0
    sentencePairs.foreach { case SentencePair(es, fs) =>
      done += 1
      if (done % progressStep == 0) {
        val elapsedMs = (System.nanoTime - s) / 1e6
        val progress = done * 1.0 / sentencePairsSize
        println(s"$iter\t$progress\tt\t$elapsedMs\tms\t${elapsedMs / (iter + progress)}\tms mean")
      }

      // Tokenise once per sentence instead of once per inner loop.
      val eWords = split(es)
      val fWords = split(fs)

      // Pass 1: normalisation total for every `eng` word of this sentence.
      sTotal.clear()
      for (e <- eWords) {
        var toAdd = 0.0
        if (iter > 0) {
          // A pair can be missing from the table when the EPS guard skipped it
          // in the previous iteration; treat that as probability 0 rather than
          // failing with NoSuchElementException.
          for (f <- fWords) toAdd += translations.getOrElse(WordPair(e, f), 0.0)
        } else {
          for (_ <- fWords) toAdd += engFreq(e)
        }
        sTotal(e) = sTotal(e) + toAdd
      }

      // Pass 2: distribute each word's normalised weight over the pairs.
      for (e <- eWords) {
        val etotal = sTotal(e)
        if (etotal > EPS) {
          val efreq = engFreq(e)
          for (f <- fWords) {
            val pair = WordPair(e, f)
            val weight =
              (if (iter > 0) translations.getOrElse(pair, 0.0) else efreq) / etotal
            count(pair) = count(pair) + weight
            totalF(f) = totalF(f) + weight
          }
        }
      }
    }

    translations.clear()
    // The frequencies only seed iteration 0; release them once they are no
    // longer read (the original clears them after iteration 1).
    if (iter == 1) engFreq.clear()
    for ((pair, pairCount) <- count) {
      val total = totalF(pair.fr)
      // Guard against 0/0 producing NaN when every contribution was zero.
      translations(pair) = if (total > 0.0) pairCount / total else 0.0
    }
    iter += 1
  }

  totalF.clear()
  count.clear()
  sTotal.clear()

  /** The full report text: one "fr<TAB>eng<TAB>prob" line per word pair. */
  val output = new StringBuilder
  for ((pair, prob) <- translations) {
    output.append(pair.fr).append('\t').append(pair.eng).append('\t').append(prob).append('\n')
  }

  // Write with the same encoding the inputs were read in, and always close
  // the writer, even if the write fails.
  private val writer =
    new java.io.PrintWriter("out" + java.lang.System.currentTimeMillis(), "UTF-8")
  try writer.write(output.toString) finally writer.close()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment