Created
December 30, 2014 04:58
-
-
Save kaja47/3050c5ad64f3da973eef to your computer and use it in GitHub Desktop.
How to compute similar movies from CSFD data in 10 minutes and find love of your life
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import breeze.linalg._ | |
import breeze.stats | |
import breeze.numerics._ | |
val dataFile = new File(???) | |
val userItems: Array[SparseVector[Double]] = loaderUserItemsWithRatings(dataFile, """[ ,:]""".r) | |
val itemUsers: Array[SparseVector[Double]] = transpose(userItems) map { vec => normalize(vec, 2) } | |
// weights | |
val N = DenseVector.fill[Double](itemIndex.size)(userIndex.size) // vector where total numbers of users is repeated | |
val ratings = DenseVector[Double](itemUsers.map(_.activeSize.toDouble).toArray) // number of ratings of every item | |
// weights of every item, most popular items have lesser weight | |
// log(2.75) ~= 1, items with more than N/2.75/const ratings have weight smaller that 1 | |
val w = max(min(log(N :/ ratings :/ 2.0), 1.0), 0.05) | |
// count only movies with at least this number or ratings | |
val minRatings = 50 | |
// length of sketch in bits | |
val sketchLength = 512 | |
// generate random hyperplanes and create sketches | |
val _sketches: Array[java.util.BitSet] = { | |
val randomHyperplanes = (0 until sketchLength).par map { _ => DenseVector.rand[Double](userIndex.size) map (x => if (x < 0.5) -1.0 else 1.0) } seq | |
itemUsers.par map { vec => BitVector(randomHyperplanes map { rhp => (rhp dot vec) > 0.0 }: _*).data } toArray | |
} | |
// select candidate movies with at least minRatings ratings | |
val candidates = 0 until itemUsers.size filter (i => itemUsers(i).activeSize > minRatings) | |
// copy sketches into one big array for that sweet sweet cache locality | |
val sketches: Array[Long] = { | |
val arr = new Array[Long](candidates.size * sketchLength / 64) | |
candidates.zipWithIndex foreach { case (i, idx) => | |
val bits = _sketches(i).toLongArray | |
java.lang.System.arraycopy(bits, 0, arr, idx*sketchLength/64, sketchLength/64) | |
} | |
arr | |
} | |
// Estimate similarity from sketches, it's not estimate of cosine similatity | |
// but only estimate of similarity based on angle, but oh well, i don't really | |
// care and it still works when one is interested only in thresholds. | |
// This can be speed up by precomputing thresholds in bits and compare it to | |
// number of different bits in two sketches. | |
def estimateSimilarity(arr: Array[Long], sketchLength: Int, a: Int, b: Int): Double = { | |
var diffBits = 0 | |
var i = 0 | |
val longsLen = sketchLength / 64 | |
while (i < longsLen) { | |
diffBits += java.lang.Long.bitCount(arr(a*longsLen+i) ^ arr(b*longsLen+i)) | |
i += 1 | |
} | |
1.0 - (diffBits / sketchLength.toDouble)*2.0 | |
} | |
// compute similarities | |
(for { | |
i <- (0 until candidates.size).par | |
} yield { | |
for { | |
j <- i+1 until candidates.size | |
est = estimateSimilarity(sketches, sketchLength, i, j) | |
if est > 0.16 // aim low | |
sim = (itemUsers(candidates(i)) dot itemUsers(candidates(j))) * min(w(candidates(i)), w(candidates(j))) // cosine similarity in this case don't really make sense and Euclidean distance would be better, but it still works | |
if sim > 0.2 // go for the kill | |
} yield { | |
(candidates(i), candidates(j), sim) | |
} | |
}).flatten.seq |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment