Created
February 8, 2013 15:11
-
-
Save jdegoes/4739573 to your computer and use it in GitHub Desktop.
Example code for the Creating a Data Science Platform in Scala talk.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
object BenchmarkCommon { | |
import scala.util.Random | |
val DatasetSize = 10000 | |
val Iterations = 10000 | |
val ArrayPoolSize = 1000 | |
val ArrayPool = { | |
def randomArray(): Array[Int] = { | |
val array = new Array[Int](DatasetSize) | |
(0 until array.length).foreach { index => | |
array(index) = Random.nextInt(DatasetSize) | |
} | |
array | |
} | |
val pool = new Array[Array[Int]](ArrayPoolSize) | |
(0 until pool.length).foreach { index => | |
pool(index) = randomArray() | |
} | |
pool | |
} | |
def benchmark(label: String)(f: => Unit) = { | |
val start = System.nanoTime() | |
var i = 0 | |
while (i < Iterations) { | |
f | |
i += 1 | |
} | |
val end = System.nanoTime() | |
val length = (end - start) / 1000000000.0 | |
println(label + ": " + length + " seconds") | |
} | |
def randomArray() = ArrayPool(Random.nextInt(ArrayPoolSize)) | |
} | |
object LazyNaiveAttempt { | |
import scala.annotation.tailrec | |
type Joined[A, B] = Dataset[(Option[A], Option[B])] | |
trait Dataset[A] extends (Int => A) { self => | |
def join[B, C](that: Dataset[B])(f: A => C, g: B => C) | |
(implicit ord: Ordering[C], cmA: ClassManifest[A], cmB: ClassManifest[B]): Joined[A, B] = { | |
@tailrec def loop(acc: Joined[A, B], d1: Dataset[A], d2: Dataset[B]): Joined[A, B] = { | |
(d1.uncons, d2.uncons) match { | |
case (None, None) => acc | |
case (None, Some((_, _))) => | |
acc.concat(d2.map(x => (None, Some(x)))) | |
case (Some((_, _)), None) => | |
acc.concat(d1.map(x => (Some(x), None))) | |
case (Some((x, xs)), Some((y, ys))) => | |
val cmp = ord.compare(f(x), g(y)) | |
if (cmp < 0) loop(acc.append((Some(x), None)), xs, d2) | |
else if (cmp > 0) loop(acc.append((None, Some(y))), d1, ys) | |
else loop(acc.append((Some(x), Some(y))), xs, ys) | |
} | |
} | |
loop(Dataset.empty[(Option[A], Option[B])], self.sortBy(f), that.sortBy(g)) | |
} | |
def sortBy[B](f: A => B)(implicit cm: ClassManifest[A], ord: Ordering[B]) = | |
Dataset.fromArray(toArray.sortBy(f)) | |
def append(x: A) = new Dataset[A] { | |
def apply(i: Int) = if (i < self.size) self.apply(i) else x | |
def size = self.size + 1 | |
} | |
def concat(that: Dataset[A]) = new Dataset[A] { | |
def apply(i: Int) = if (i < self.size) self(i) else that(i - self.size) | |
def size = self.size + that.size | |
} | |
def uncons: Option[(A, Dataset[A])] = { | |
if (size == 0) None | |
else Some(self(0), new Dataset[A] { | |
def apply(i: Int): A = self.apply(i + 1) | |
def size = self.size - 1 | |
}) | |
} | |
def map[B](f: A => B) = new Dataset[B] { | |
def apply(i: Int): B = f(self(i)) | |
def size = self.size | |
} | |
def fold[B](zero: B)(f: (B, A) => B): B = { | |
uncons match { | |
case None => zero | |
case Some((x, xs)) => xs.fold(f(zero, x))(f) | |
} | |
} | |
def size: Int | |
def toArray(implicit cm: ClassManifest[A]) = (0 until size).map(self).toArray | |
} | |
object Dataset { | |
def empty[A: ClassManifest] = fromArray(new Array[A](0)) | |
def fromArray[A](value: Array[A]): Dataset[A] = new Dataset[A] { | |
def apply(i: Int): A = value(i) | |
def size = value.length | |
} | |
} | |
} | |
object SmarterStrictAttempt { | |
type Joined[A, B] = Dataset[(Option[A], Option[B])] | |
case class Dataset[A: ClassManifest](value: Array[A]) extends (Int => A) { self => | |
def apply(i: Int): A = value(i) | |
def join[B, C, D](that: Dataset[B])(f: A => C, g: B => C)(left: A => D, right: B => D, both: (A, B) => D) | |
(implicit ord: Ordering[C], cmA: ClassManifest[A], cmB: ClassManifest[B], cmD: ClassManifest[D]): Dataset[D] = { | |
val builder = scala.collection.mutable.ArrayBuilder.make[D] | |
val sort1 = self.value.sortBy(f) | |
val sort2 = that.value.sortBy(g) | |
var i = 0 | |
var j = 0 | |
var more = true | |
while (more) { | |
if (i < sort1.length) { | |
if (j < sort2.length) { | |
val x = sort1(i) | |
val y = sort2(j) | |
val cmp = ord.compare(f(x), g(y)) | |
if (cmp < 0) { | |
i += 1 | |
builder += left(x) | |
} else if (cmp > 0) { | |
j += 1 | |
builder += right(y) | |
} else { | |
i += 1 | |
j += i | |
builder += both(x, y) | |
} | |
} else { | |
builder ++= sort1.drop(i).map(left) | |
more = false | |
} | |
} else { | |
builder ++= sort2.drop(j).map(right) | |
more = false | |
} | |
} | |
Dataset(builder.result()) | |
} | |
def map[B: ClassManifest](f: A => B) = Dataset(value = value.map(f)) | |
def fold[B](zero: B)(f: (B, A) => B): B = { | |
var i = 0 | |
var acc = zero | |
while (i < value.length) { | |
acc = f(acc, value(i)) | |
i += 1 | |
} | |
acc | |
} | |
def size: Int = value.length | |
} | |
} | |
import BenchmarkCommon._ | |
/* benchmark("lazy-naive") { | |
import LazyNaiveAttempt._ | |
val arr1 = Dataset.fromArray(randomArray()) | |
val arr2 = Dataset.fromArray(randomArray()) | |
arr1.map(_ * 2).join(arr2.map(_ * 2))(identity[Int], identity[Int]).fold(0) { | |
case (acc, (None, None)) => acc | |
case (acc, (None, Some(x))) => acc + x | |
case (acc, (Some(x), None)) => acc + x | |
case (acc, (Some(x), Some(y))) => acc + x + y | |
} | |
} */ | |
benchmark("smarter-strict") { | |
import SmarterStrictAttempt._ | |
val arr1 = Dataset(randomArray()) | |
val arr2 = Dataset(randomArray()) | |
arr1.map(_ * 2).join(arr2.map(_ * 2))(identity[Int], identity[Int])(identity[Int], identity[Int], _ + _).fold(0)(_ + _) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment