// AggregationBenchmark.scala (gist JoshRosen/e0fee87b0d7e68d24112, May 18, 2015)
package org.apache.spark.sql

import java.io.File

import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._

import scala.util.Random
object AggregationBenchmark {

  val numKeys = 10000
  val numRecords = numKeys * 1
  val numWarmups = 10
  val numRepetitions = 20

  var sc: SparkContext = _
  var sqlContext: SQLContext = _

  case class Record(key: String, c1: Int, c2: Long, c3: Double)
  def setup(): DataFrame = {
    // Assign the var to a val so the implicits can be imported (imports need a stable identifier).
    val sqlContext2 = sqlContext
    import sqlContext2.implicits._
    val rdd: RDD[Row] = sc.parallelize(1 to numRecords, 100).mapPartitions { iter =>
      val rand = new Random(42)
      // Reuse one mutable row per partition to avoid a fresh allocation per record.
      // Note: keys are random 8-character strings, so nearly every record has a distinct key.
      val arr = new Array[Any](4)
      val row = new GenericRow(arr)
      iter.map { _ =>
        arr(0) = rand.nextString(8)
        arr(1) = rand.nextInt()
        arr(2) = rand.nextLong()
        arr(3) = rand.nextDouble()
        row
      }
    }
    // rdd.count()
    val df = sqlContext.createDataFrame(rdd, StructType(Seq(
      StructField("key", StringType),
      StructField("c1", IntegerType),
      StructField("c2", LongType),
      StructField("c3", DoubleType))))
    df.registerTempTable("data")
    df
  }
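
  // A hedged alternative, not in the original gist: the Record case class above is
  // otherwise unused, and it suggests a simpler schema-inference path via toDF().
  // This sketch (the name setupViaCaseClass is ours) trades setup()'s mutable-row
  // reuse for one fresh Record allocation per row.
  def setupViaCaseClass(): DataFrame = {
    val sqlContext2 = sqlContext
    import sqlContext2.implicits._
    val df = sc.parallelize(1 to numRecords, 100).mapPartitions { iter =>
      val rand = new Random(42)
      iter.map { _ =>
        Record(rand.nextString(8), rand.nextInt(), rand.nextLong(), rand.nextDouble())
      }
    }.toDF()
    df.registerTempTable("data")
    df
  }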
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("test")
      .set("spark.sql.useSerializer2", "true")
    new File("eventlogs").mkdirs()
    sc = new SparkContext(conf)
    sqlContext = new SQLContext(sc)
    setup()
    def runQuery(): Unit = {
      sqlContext.sql("SELECT key, sum(c1), sum(c2), sum(c3) FROM data GROUP BY key").count()
    }
    // val controller = new com.yourkit.api.Controller
    (1 to numWarmups).foreach { _ => runQuery() }
    // controller.startCPUSampling(null)
    // controller.startAllocationRecording(true, 10, false, 0, true, false)
    // Start the clock only after the warmup runs, so they don't inflate the average.
    val startTime = System.currentTimeMillis()
    (1 to numRepetitions).foreach { _ => runQuery() }
    val endTime = System.currentTimeMillis()
    // controller.stopCPUProfiling()
    // controller.stopAllocationRecording()
    println("Average time (ms): " + ((endTime - startTime) / (1.0 * numRepetitions)))
    sc.stop()
  }
}
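
One caveat on the measurement above: a single wall-clock span hides variance, so one slow repetition (e.g. a GC pause) is invisible in the mean. A minimal per-repetition timing sketch, assuming the same runQuery body; the PerRunTiming object and timeMillis helper are illustrative and not part of the original gist:

object PerRunTiming {
  // Times a single invocation of `body` and returns the elapsed milliseconds.
  def timeMillis(body: => Unit): Long = {
    val start = System.nanoTime()
    body
    (System.nanoTime() - start) / 1000000L
  }
}

// Usage inside main(), after the warmup loop:
//   val times = (1 to numRepetitions).map(_ => PerRunTiming.timeMillis(runQuery()))
//   println(s"min=${times.min}ms max=${times.max}ms mean=${times.sum.toDouble / times.size}ms")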
// build.sbt
scalaVersion := "2.10.4"

// Path to a local Spark checkout; the sql and core subprojects are referenced directly.
val SPARK_HOME = sys.env.getOrElse("SPARK_HOME", "/Users/joshrosen/Documents/spark")

lazy val sparkSql = ProjectRef(file(SPARK_HOME), "sql")
lazy val sparkCore = ProjectRef(file(SPARK_HOME), "core")

lazy val root = (project in file(".")).
  settings(
    aggregate in update := false,
    fork in run := true,
    // The "version" string is a placeholder: with `from`, sbt fetches the jar
    // directly from the given file URL, so the module coordinates are only names.
    libraryDependencies += "yourkit" % "yourkit-api" % "version" from "file:///Applications/YourKit_Java_Profiler_2013_build_13088.app/lib/yjp-controller-api-redist.jar",
    javaOptions in run += "-agentpath:/Applications/YourKit_Java_Profiler_2013_build_13088.app/bin/mac/libyjpagent.jnilib=onexit=snapshot",
    javaOptions in run ++= Seq(
      "-Xmx2000M",
      "-Xms2000M"
      // "-XX:+UnlockDiagnosticVMOptions",
      // "-XX:+TraceClassLoading",
      // "-XX:+LogCompilation",
      // "-XX:+PrintAssembly",
      // "-XX:+PrintInlining"
    )
  ).dependsOn(sparkSql, sparkCore)
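
One optional tweak, not in the original build file: pinning the main class in the root project's settings(...) block lets sbt run launch the benchmark directly instead of prompting when more than one main class is on the classpath.

    mainClass in (Compile, run) := Some("org.apache.spark.sql.AggregationBenchmark")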