Skip to content

Instantly share code, notes, and snippets.

View erikerlandson's full-sized avatar

Erik Erlandson erikerlandson

View GitHub Profile
@erikerlandson
erikerlandson / distance.scala
Created August 21, 2014 22:51
Suggestion for simple distance metric (and measure) design
import breeze.linalg.{Vector => BV, DenseVector => DBV}
import org.apache.spark.annotation.Experimental
import org.apache.spark.mllib.linalg.{Vector, DenseVector}
trait DistanceMeasure extends Function2[BV[Double], BV[Double], Double] with Serializable {
// each measure/metric defines for itself:
override def apply(v1: BV[Double], v2: BV[Double]): Double
// a catch-all overloading of "()" for spark vectors
// can also be overridden on a per-class basis, if it is advantageous
@erikerlandson
erikerlandson / absTest.scala
Last active August 29, 2015 14:05
test performance of breeze abs(v) vs v.map(Math.abs(_))
import breeze.linalg.{Vector => BV, DenseVector => DBV}
import breeze.numerics.abs
def absTest(n: Int = 1000, m: Int = 10) {
var start = System.currentTimeMillis()
val vectors = (1 to n).toArray.map(i => DBV((1 to m).toArray.map(j => -1.0 + (2.0 * Math.random()))))
var end = System.currentTimeMillis()
println(s"Time for Creating: ${end - start}")
start = System.currentTimeMillis()
@erikerlandson
erikerlandson / results.txt
Created August 29, 2014 02:55
Experiment with faster sampling logic
scala> benchmark(p = 0.01, n=1000, m = 100000)
Time using drop: 83
Time using filter: 2849
scala> benchmark(p = 0.01, n=1000, m = 100000)
Time using drop: 90
Time using filter: 2668
scala> benchmark(p = 0.1, n=1000, m = 100000)
Time using drop: 941
@erikerlandson
erikerlandson / results.txt
Last active August 29, 2015 14:05
test drop-sampling benchmarks with different collection types
scala> benchmark(p = 0.001, n=1000, m = 10000)
Array
Time using filter: 411
Time using drop: 9
Seq
Time using filter: 425
Time using drop: 3
@erikerlandson
erikerlandson / benchmark_drop.scala
Created August 29, 2014 20:06
Some benchmarking tests to localize differences between "drop sampling" and "filter sampling"
import scala.util.Random
def benchmark_drop(n: Int = 1000, m: Int = 1000, k: Int = 10) {
val rng = new Random()
/** Consumes `d` by repeatedly replacing it with `drop(k)` of itself until it reports empty.
  *
  * Each `k` comes from the enclosing `rng.nextInt()`, which can yield any `Int`,
  * including negative values.
  * NOTE(review): presumably `drop` with a non-positive count drops nothing, so such
  * draws only add another wrapper iterator before the next draw — confirm against
  * `Iterator.slice`.
  *
  * @param d the iterator to exhaust
  */
def dropIter[T](d: Iterator[T]): Unit = {
  // Each pass rebinds to the iterator returned by drop, so the wrappers nest.
  var remaining = d
  while (!remaining.isEmpty) remaining = remaining.drop(rng.nextInt())
}
@erikerlandson
erikerlandson / drop_test.scala
Created September 3, 2014 23:00
Demonstrate anonymous class nesting from Iterator drop method
import java.io.{StringWriter, PrintWriter}
import scala.reflect.ClassTag
/** Extracts the top of an exception's printed stack trace.
  *
  * The trace is rendered with `printStackTrace`, split into lines, and cut at the
  * first line containing `substr`. The very first line (the exception header) is
  * then discarded and the remaining lines are rejoined with newlines.
  *
  * @param e      the exception whose stack trace is rendered
  * @param substr marker text; the first line containing it, and everything after, is omitted
  * @return the lines between the header and the first line containing `substr`,
  *         or an empty string when there are none
  */
def tracehead(e: Exception, substr: String = "slice"): String = {
  val buffer = new StringWriter()
  val printer = new PrintWriter(buffer)
  e.printStackTrace(printer)

  val traceLines = buffer.toString.split('\n')
  // Index of the first line mentioning the marker; -1 means keep every line.
  val cut = traceLines.indexWhere(line => line.contains(substr))
  val beforeMarker = if (cut < 0) traceLines else traceLines.take(cut)

  // The header is dropped only after cutting, so a marker in the header yields "".
  beforeMarker.drop(1).mkString("\n")
}
class TestIterator[T: ClassTag](val iter: Iterator[T]) extends Iterator[T] {
@erikerlandson
erikerlandson / variation1.scala
Created September 5, 2014 02:21
variations I tried
/** Skips the first ''n'' elements of this iterator (or all of them, if it has fewer).
 *
 * Implemented as a slice starting at `n` with `Int.MaxValue` as the upper bound.
 *
 * @param n how many leading elements to discard
 * @return an iterator yielding the elements of this iterator that follow
 * the first `n`.
 * @note Reuse: $consumesAndProducesIterator
 */
def drop(n: Int): Iterator[A] = {
  val noUpperBound = Int.MaxValue
  slice(n, noUpperBound)
}
/** Creates an iterator returning an interval of the values produced by this iterator.
@erikerlandson
erikerlandson / gap_sampling.scala
Last active July 27, 2017 08:28
Prototype iterators that use efficient gap sampling algorithms
import scala.reflect.ClassTag
import scala.sys.process._
class PoissonDist(p: Double) {
val q = Math.exp(-p)
def next: Int = {
var r = 0
var pp = Math.random()
@erikerlandson
erikerlandson / kstest.py
Created September 9, 2014 21:22
A python widget to do a Kolmogorov Smirnov test against two data sets
import sys
import argparse
from scipy import stats
# Command-line interface: two positional arguments, one per data set.
argparser = argparse.ArgumentParser()
# argparse.FileType('r') makes argparse open each given path for reading,
# so the parsed values are file objects rather than path strings.
argparser.add_argument('data1', type=argparse.FileType('r'), metavar='<data-file-1>')
argparser.add_argument('data2', type=argparse.FileType('r'), metavar='<data-file-2>')
# Parse the command line; the opened files are available as args.data1 and args.data2.
args = argparser.parse_args()
@erikerlandson
erikerlandson / .gitconfig
Created October 23, 2014 23:18
git alias to edit repo files using sed
[alias]
# Use sed to edit, in place, the files listed by 'git ls-files'.
# Usage:
#   git sed [ args-to-git-ls-files ] <sed-expr>
# Examples:
#   git sed s/foo/bar/g
#   git sed --modified s/foo/bar/g
# The last argument is taken as the sed expression; all preceding arguments
# (n = $# - 1, clamped to at least 0) are forwarded to 'git ls-files'.
# awk wraps each listed path in double quotes before the command line is
# eval'ed, which is how file names with whitespace, punctuation, etc. are handled.
# NOTE(review): 'let' and '${@: -1}' look bash-specific - confirm that the shell
# git uses to run aliases supports them.
# NOTE(review): the sed expression is expanded unquoted inside eval, so an
# expression containing whitespace or shell metacharacters may be split - confirm.
sed = "!f() { let n=$#-1; let n=$((n>0?n:0)); eval sed -i ${@: -1} $(git ls-files ${@: 1:$n} | awk '{printf(\"\\\"%s\\\"\\n\", $0)}'); }; f"