Last active
December 16, 2015 06:19
-
-
Save thomasjungblut/5390600 to your computer and use it in GitHub Desktop.
Inverted Index in less than 50 lines of code (and I was verbose!)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package de.jungblut.index | |
import scala.collection.mutable.ArrayBuffer | |
import scala.collection.mutable.MultiMap | |
import scala.collection.mutable.HashMap | |
import scala.collection.mutable.Set | |
final class Index[T](nGramSize: Int) { | |
private val index = new HashMap[String, Set[T]] with MultiMap[String, T] | |
def build(content: Map[String, T]) = { | |
for (docTerm <- content) yield tokenize(docTerm._1).foreach(index.addBinding(_, docTerm._2)) | |
this // return this so people can chain calls to construction and build | |
} | |
private def tokenize(sx: String, index: Int = 0): ArrayBuffer[String] = { | |
if (index >= sx.length() - 1) { | |
ArrayBuffer[String]() // at the end of the string, get our buffer | |
} else { | |
// recurse further into the string | |
val tokens = tokenize(sx, index + 1) | |
// if we are far away from the end, add to the tokens | |
if (index + nGramSize <= sx.length()) | |
return (tokens :+ sx.substring(index, index + nGramSize)) | |
tokens | |
} | |
} | |
def query(query: String, minScore: Double = 0.0) = { | |
val tokens = tokenize(query) | |
val documentSets = tokens.flatMap(tokenize(_)).distinct.flatMap(index.get(_)) | |
val counts = documentSets.flatMap(_.toSet).groupBy(token => token).mapValues(_.size) | |
// now calculate the match score by dividing the counts by the tokens | |
counts.map(Function.tupled((k, v) => (k, v / tokens.size.doubleValue()))).filter(_._2 > minScore).toList.sortBy(-_._2) | |
} | |
} | |
object Main { | |
def main(args: Array[String]): Unit = { | |
val index = new Index(3).build(Map("I'm cool" -> "cool doc", "rofl!!! that was funny" -> "rofl doc", "omg that was funny!11 lol" -> "what a funny doc!")) | |
println(index.query("funny I'm")) | |
println(index.query("that was so funny lol"),0.5) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment