kaja47 · December 30, 2014 04:58
diff --git a/csfdsim.scala b/csfdsim.scala
 import breeze.linalg._
 import breeze.stats
 import breeze.numerics._

 val dataFile = new File(???)
 val userItems: Array[SparseVector[Double]] = loaderUserItemsWithRatings(dataFile, """[ ,:]""".r)
 val itemUsers: Array[SparseVector[Double]] = transpose(userItems) map { vec => normalize(vec, 2) }

 // weights
 val N = DenseVector.fill[Double](itemIndex.size)(userIndex.size) // vector where total numbers of users is repeated
 val ratings = DenseVector[Double](itemUsers.map(_.activeSize.toDouble).toArray) // number of ratings of every item

 // weights of every item, most popular items have lesser weight
 // log(2.75) ~= 1, items with more than N/2.75/const ratings have weight smaller that 1
 val w = max(min(log(N :/ ratings :/ 2.0), 1.0), 0.05) 

 // count only movies with at least this number or ratings
 val minRatings = 50

 // length of sketch in bits
 val sketchLength = 512

 // generate random hyperplanes and create sketches
 val _sketches: Array[java.util.BitSet] = {
 	val randomHyperplanes = (0 until sketchLength).par map { _ => DenseVector.rand[Double](userIndex.size) map (x => if (x < 0.5) -1.0 else 1.0) } seq
 	itemUsers.par map { vec => BitVector(randomHyperplanes map { rhp => (rhp dot vec) > 0.0 }: _*).data } toArray
 }


 // select candidate movies with at least minRatings ratings
 val candidates = 0 until itemUsers.size filter (i => itemUsers(i).activeSize > minRatings)

 // copy sketches into one big array for that sweet sweet cache locality
 val sketches: Array[Long] = {
 	val arr = new Array[Long](candidates.size * sketchLength / 64)
 	candidates.zipWithIndex foreach { case (i, idx) =>
 		val bits = _sketches(i).toLongArray
 		java.lang.System.arraycopy(bits, 0, arr, idx*sketchLength/64, sketchLength/64)
 	}
 	arr
 }

 // Estimate similarity from sketches, it's not estimate of cosine similatity
 // but only estimate of similarity based on angle, but oh well, i don't really
 // care and it still works when one is interested only in thresholds.
 // This can be speed up by precomputing thresholds in bits and compare it to
 // number of different bits in two sketches.
 def estimateSimilarity(arr: Array[Long], sketchLength: Int, a: Int, b: Int): Double = {
 	var diffBits = 0
 	var i = 0
 	val longsLen = sketchLength / 64
 	while (i < longsLen) {
 		diffBits += java.lang.Long.bitCount(arr(a*longsLen+i) ^ arr(b*longsLen+i))
 		i += 1
 	}
 	1.0 - (diffBits / sketchLength.toDouble)*2.0
 }

 // compute similarities
 (for {
 	i <- (0 until candidates.size).par
 } yield {
 	for {
 		j <- i+1 until candidates.size
 		est = estimateSimilarity(sketches, sketchLength, i, j)
 		if est > 0.16 // aim low
 		sim = (itemUsers(candidates(i)) dot itemUsers(candidates(j))) * min(w(candidates(i)), w(candidates(j))) // cosine similarity in this case don't really make sense and Euclidean distance would be better, but it still works
 		if sim > 0.2 // go for the kill
 	} yield {
 		(candidates(i), candidates(j), sim)
 	}
 }).flatten.seq
	import breeze.linalg._
	import breeze.stats
	import breeze.numerics._

	val dataFile = new File(???)
	val userItems: Array[SparseVector[Double]] = loaderUserItemsWithRatings(dataFile, """[ ,:]""".r)
	val itemUsers: Array[SparseVector[Double]] = transpose(userItems) map { vec => normalize(vec, 2) }

	// weights
	val N = DenseVector.fill[Double](itemIndex.size)(userIndex.size) // vector where total numbers of users is repeated
	val ratings = DenseVector[Double](itemUsers.map(_.activeSize.toDouble).toArray) // number of ratings of every item

	// weights of every item, most popular items have lesser weight
	// log(2.75) ~= 1, items with more than N/2.75/const ratings have weight smaller that 1
	val w = max(min(log(N :/ ratings :/ 2.0), 1.0), 0.05)

	// count only movies with at least this number or ratings
	val minRatings = 50

	// length of sketch in bits
	val sketchLength = 512

	// generate random hyperplanes and create sketches
	val _sketches: Array[java.util.BitSet] = {
	val randomHyperplanes = (0 until sketchLength).par map { _ => DenseVector.rand[Double](userIndex.size) map (x => if (x < 0.5) -1.0 else 1.0) } seq
	itemUsers.par map { vec => BitVector(randomHyperplanes map { rhp => (rhp dot vec) > 0.0 }: _*).data } toArray
	}


	// select candidate movies with at least minRatings ratings
	val candidates = 0 until itemUsers.size filter (i => itemUsers(i).activeSize > minRatings)

	// copy sketches into one big array for that sweet sweet cache locality
	val sketches: Array[Long] = {
	val arr = new Array[Long](candidates.size * sketchLength / 64)
	candidates.zipWithIndex foreach { case (i, idx) =>
	val bits = _sketches(i).toLongArray
	java.lang.System.arraycopy(bits, 0, arr, idx*sketchLength/64, sketchLength/64)
	}
	arr
	}

	// Estimate similarity from sketches, it's not estimate of cosine similatity
	// but only estimate of similarity based on angle, but oh well, i don't really
	// care and it still works when one is interested only in thresholds.
	// This can be speed up by precomputing thresholds in bits and compare it to
	// number of different bits in two sketches.
	def estimateSimilarity(arr: Array[Long], sketchLength: Int, a: Int, b: Int): Double = {
	var diffBits = 0
	var i = 0
	val longsLen = sketchLength / 64
	while (i < longsLen) {
	diffBits += java.lang.Long.bitCount(arr(alongsLen+i) ^ arr(blongsLen+i))
	i += 1
	}
	1.0 - (diffBits / sketchLength.toDouble)*2.0
	}

	// compute similarities
	(for {
	i <- (0 until candidates.size).par
	} yield {
	for {
	j <- i+1 until candidates.size
	est = estimateSimilarity(sketches, sketchLength, i, j)
	if est > 0.16 // aim low
	sim = (itemUsers(candidates(i)) dot itemUsers(candidates(j))) * min(w(candidates(i)), w(candidates(j))) // cosine similarity in this case don't really make sense and Euclidean distance would be better, but it still works
	if sim > 0.2 // go for the kill
	} yield {
	(candidates(i), candidates(j), sim)
	}
	}).flatten.seq