
import breeze.linalg._

import scala.collection.mutable.Map
import scala.io.Source

/** Builds a sparse document-by-word count matrix from a CSV of
  * `doc,word,count` records and prints how strongly a small synthetic "test"
  * document overlaps with every other document.
  *
  * Steps, in order:
  *  1. read up to 10,000,000 lines from the CSV and append three synthetic
  *     records for the document named `test`;
  *  2. assign a dense, zero-based index to every distinct document and word
  *     in order of first appearance ([[init]]);
  *  3. fill a [[breeze.linalg.CSCMatrix]] with one row per document and one
  *     column per word ([[initMatrix]]);
  *  4. multiply the matrix by its transpose (documents x documents) and time it;
  *  5. print the dense product and the positive entries of its last row,
  *     largest first.
  */
object FrequencyTable extends App {

  /** All input records: the CSV lines followed by the synthetic `test` records.
    *
    * The file is read eagerly and the underlying source is closed before the
    * list is returned, so no file handle outlives this initializer.
    */
  val lines: List[String] = {
    val source = Source.fromFile("t:/frequency.csv")
    val fromFile =
      try source.getLines().take(10000000).toList
      finally source.close()
    fromFile ++ List("washington", "taxes", "treasury").map(w => s"test,$w,1")
  }

  /** Document name -> zero-based row index. */
  val docs = Map.empty[String, Int]

  /** Word -> zero-based column index. */
  val words = Map.empty[String, Int]

  /** Highest document index handed out so far (-1 while no document is known). */
  var docCount = -1

  /** Highest word index handed out so far (-1 while no word is known). */
  var wordCount = -1

  init()

  val matrix: CSCMatrix[Int] = initMatrix()

  // Wall-clock timing of the sparse product, in milliseconds.
  val time = System.nanoTime / 1000000
  val prod = matrix * matrix.t
  val duration = System.nanoTime / 1000000 - time
  println(s"computed in $duration ms")

  val rows = prod.rows
  val cols = prod.cols
  // The product is documents x documents, so its width must equal the number of documents.
  assert(cols == docs.size)

  // Entries of the last row of the product, sorted by descending value.
  // The synthetic `test` records are appended last, so `test` owns the last row
  // as long as the CSV itself contains no document named `test`.
  val lastRow = prod(rows - 1 until rows, 0 until cols).iterator.toList.sortBy(-_._2)

  println(prod.toDense)

  // `doc` is the column position within the slice, i.e. the other document's index.
  for (((_, doc), value) <- lastRow if value > 0)
    println(doc -> value)

  /** Registers every document and word found in [[lines]] and prints how many
    * distinct ones were seen.
    *
    * @throws scala.MatchError if a line does not split on `,` into exactly
    *                          three fields
    */
  def init(): Unit = {
    // Plain foreach: a for-comprehension with value definitions would first
    // materialise an intermediate list of tuples as long as `lines`.
    lines.foreach { line =>
      val Array(doc, word, _) = line.split(",")
      addDoc(doc)
      addWord(word)
    }

    println(s"${docs.size} docs")
    println(s"${words.size} words")
  }

  /** Builds the documents x words matrix from [[lines]].
    *
    * Must run after [[init]], because every document and word is looked up in
    * the index maps that `init` populates.
    *
    * @return a sparse matrix with `docs.size` rows and `words.size` columns
    * @throws scala.MatchError if a line does not split on `,` into exactly
    *                          three fields
    * @throws java.lang.NumberFormatException if a count field is not an integer
    * @throws java.util.NoSuchElementException if a document or word has not
    *                                          been registered
    */
  def initMatrix(): CSCMatrix[Int] = {
    val builder = new CSCMatrix.Builder[Int](rows = docs.size, cols = words.size)
    lines.foreach { line =>
      val Array(doc, word, value) = line.split(",")
      builder.add(docs(doc), words(word), value.toInt)
    }
    builder.result()
  }

  /** Returns the index of `doc`, assigning the next free index on first sight.
    *
    * @param doc document name
    * @return the zero-based row index of the document
    */
  def addDoc(doc: String): Int =
    docs.getOrElseUpdate(doc, { docCount += 1; docCount })

  /** Returns the index of `w`, assigning the next free index on first sight.
    *
    * @param w word
    * @return the zero-based column index of the word
    */
  def addWord(w: String): Int =
    words.getOrElseUpdate(w, { wordCount += 1; wordCount })

}