Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/ '_/
   /___/ .__/\_,_/_/ /_/\_\   version 1.1.0
      /_/

Using Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.6.0_65)
Type in expressions to have them evaluated.
Type :help for more information.
2014-12-02 08:40:25.812 java[2479:1607] Unable to load realm mapping info from SCDynamicStore
14/12/02 08:40:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Spark context available as sc.

scala> val babyNamesCSV = sc.parallelize(List(("David", 6), ("Abby", 4), ("David", 5), ("Abby", 5)))
babyNamesCSV: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[0] at parallelize at <console>:12

scala> babyNamesCSV.reduceByKey((n,c) => n + c).collect
res0: Array[(String, Int)] = Array((Abby,9), (David,11))

scala> babyNamesCSV.aggregateByKey(0)((k,v) => v.toInt+k, (v,k) => k+v).collect
res1: Array[(String, Int)] = Array((Abby,9), (David,11))
Hi Sujitpal, can you please guide me on computing an average over multiple columns in the same RDD? For example:
RDD1: Key1, Key2, Avg1, Avg2, Avg3
or
RDD2: Key1, Avg1, Avg2
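A minimal sketch of one way this could be done with aggregateByKey, assuming rows of (key, (a, b, c)) with three numeric columns; the keys and values below are made up for illustration:

val rows = sc.parallelize(Seq(
  ("k1", (1.0, 10.0, 100.0)),
  ("k1", (3.0, 30.0, 300.0)),
  ("k2", (2.0, 20.0, 200.0))
))

val avgs = rows
  .aggregateByKey((0.0, 0.0, 0.0, 0L))(
    // seqOp: fold one row into a (sumA, sumB, sumC, count) accumulator
    (acc, v) => (acc._1 + v._1, acc._2 + v._2, acc._3 + v._3, acc._4 + 1L),
    // combOp: merge accumulators coming from different partitions
    (a, b) => (a._1 + b._1, a._2 + b._2, a._3 + b._3, a._4 + b._4)
  )
  .mapValues { case (sumA, sumB, sumC, n) => (sumA / n, sumB / n, sumC / n) }

avgs.collect.foreach(println)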
NB: This is my $0.02. I've written a fair amount of coursework over the years & I'm just trying to help here.
You might consider using better naming to more clearly illustrate things to newbies. See my fork at https://gist.github.com/matthewadams/b107599a08719b166400; in particular, https://gist.github.com/matthewadams/b107599a08719b166400/revisions. Elaboration follows.
The reduceByKey example,
babyNamesCSV.reduceByKey((n,c) => n + c).collect
uses (n,c) => n + c. I'd suggest something similar to the following:
babyNamesCSV.reduceByKey((v1, v2) => v1 + v2).collect
Reasons:
- It is not clear what n & c stand for, especially when a Spark newbie is trying to associate them with values (as distinct from keys). In the function, they correspond to "first value" & "second value".
- The symmetry of (v1, v2) more closely matches the symmetry of the required signature of the function.
The aggregateByKey example,
babyNamesCSV.aggregateByKey(0)((k,v) => v.toInt+k, (v,k) => k+v).collect
uses (k,v) => v.toInt+k & (v,k) => k+v. Here is how you might improve the aggregateByKey example for clarity:
babyNamesCSV.aggregateByKey(0)((accum, v) => accum + v, (v1, v2) => v1 + v2).collect
Reasons:
- By using k & v, the Spark newbie might easily think they stand for "key" & "value", which these most certainly do not, in either function.
  - In the first function, they correspond to "accumulator" and "value".
  - In the second function, they correspond to "first value" & "second value".
- v.toInt+k uses an unnecessary toInt call, and inverts the order of the use of the variables relative to their declaration.
Lastly, the name babyNamesCSV is a bit verbose, and there is nothing in the example that refers to or uses the comma-separated value format. I might suggest nameCounts or similar.
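To make the accumulator-vs-value point above concrete, here is a small sketch that is not from the gist: the zero value is an empty Set[Int] while the RDD values are Ints, so the first argument of the first function can only be an accumulator, never a key.

val nameCounts = sc.parallelize(List(("David", 6), ("Abby", 4), ("David", 5), ("Abby", 5)))

val valueSets = nameCounts.aggregateByKey(Set.empty[Int])(
  (accum, v) => accum + v,   // seqOp: add one value to the per-partition set
  (s1, s2) => s1 ++ s2       // combOp: merge sets built on different partitions
)

valueSets.collect // e.g. Array((Abby,Set(4, 5)), (David,Set(6, 5)))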
@matthewadams Thanks for the clear explanation. Yes, I am new to Spark, and I misinterpreted and got confused by the above code.
val products=sc.textFile("/user/cloudera/products")
val productmap=products.map(x=>x.split(",")).map(x=>(x(1).toInt,x(4).toFloat))
productmap.take(5).foreach(println)
(2,59.98)
(2,129.99)
(2,89.99)
(2,89.99)
(2,199.99)
val countandtotal=productmap.aggregateByKey((0,0.0))((x,y)=>(x._1+1,x._2+y),(x,y)=>(x._1+y._1,x._2+y._2))
countandtotal: org.apache.spark.rdd.RDD[(Int, (Int, Double))] = ShuffledRDD[38] at aggregateByKey at <console>:31
countandtotal.take(2).foreach(println)
I want to count the number of products under each category id and the total price under each category. When I run countandtotal.take(2).foreach(println), it throws a NumberFormatException. I even changed the initial value 0.0 to 0.0f. Please help.
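The transformations are lazy, so the exception only surfaces when take runs, and a NumberFormatException comes from x(1).toInt or x(4).toFloat hitting a row whose field is empty or not numeric after the split; changing the zero value will not affect it. A sketch of one way to guard the parse, keeping your column positions (this drops malformed rows rather than failing):

import scala.util.Try

val products = sc.textFile("/user/cloudera/products")

// Keep only rows whose category id and price fields actually parse as numbers.
val productmap = products
  .map(_.split(","))
  .filter(f => f.length > 4 && Try(f(1).toInt).isSuccess && Try(f(4).toFloat).isSuccess)
  .map(f => (f(1).toInt, f(4).toFloat))

// (number of products, total price) per category id
val countandtotal = productmap.aggregateByKey((0, 0.0))(
  (acc, price) => (acc._1 + 1, acc._2 + price),
  (a, b) => (a._1 + b._1, a._2 + b._2)
)

countandtotal.take(2).foreach(println)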
@matthewadams Good explanation, I thought the same thing.
Note that you could replace
babyNamesCSV.reduceByKey((n,c) => n + c).collect
with
babyNamesCSV.reduceByKey(_ + _).collect
@MikeC711 - hopefully you already know how to do this. If not, here is the code snippet. I came here looking for the same thing as you and found it in one of the lectures in Spark Fundamentals I from BigDataUniversity.
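The snippet itself did not come through in this copy of the thread. As a guess at what it showed (an average per key via aggregateByKey, matching the earlier question in this thread), here is a sketch; it is not the original BigDataUniversity code:

// Assumed reconstruction: average value per key using a (sum, count) accumulator.
val nameCounts = sc.parallelize(List(("David", 6), ("Abby", 4), ("David", 5), ("Abby", 5)))

val averages = nameCounts
  .aggregateByKey((0, 0))(
    (acc, v) => (acc._1 + v, acc._2 + 1),   // running (sum, count) within a partition
    (a, b) => (a._1 + b._1, a._2 + b._2)    // merge partial (sum, count) pairs
  )
  .mapValues { case (sum, count) => sum.toDouble / count }

averages.collect // e.g. Array((Abby,4.5), (David,5.5))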