@zero323
Last active August 19, 2017 12:08
import java.util.Arrays;
import java.util.List;

import scala.Tuple2;
import scala.Tuple3;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.StatCounter;

// Computes the per-key mean and standard deviation of an RDD of
// (String, Integer) pairs using aggregateByKey with a StatCounter
// accumulator.
public class App {
    public static void main(String[] args) {
        // Master and application name are left unset here; they are
        // expected to be supplied at submit time (e.g. via spark-submit).
        SparkConf conf = new SparkConf()
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Tuple2<String, Integer>> myList = Arrays.asList(
            new Tuple2<>("A", 8),
            new Tuple2<>("B", 3),
            new Tuple2<>("C", 5),
            new Tuple2<>("A", 2),
            new Tuple2<>("B", 8));

        JavaPairRDD<String, Integer> pairs = sc.parallelizePairs(myList);

        System.out.println(
            pairs
                // Fold the values for each key into a StatCounter: the
                // seqOp merges a single value into the accumulator, the
                // combOp merges two partial accumulators together.
                .aggregateByKey(
                    new StatCounter(),
                    (acc, x) -> acc.merge(x),
                    (acc1, acc2) -> acc1.merge(acc2))
                // Extract the statistics of interest for each key.
                .map(x -> new Tuple3<>(x._1, x._2.mean(), x._2.stdev()))
                .collect()
        );

        sc.stop();
    }
}
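
For a quick sanity check outside of Spark (a minimal sketch, not part of the original gist): StatCounter.stdev() returns the population standard deviation (sampleStdev() is the n - 1 variant), so for the sample data above the job should print the equivalent of (A,5.0,3.0), (B,5.5,2.5) and (C,5.0,0.0), with no ordering guarantee from collect(). The snippet below verifies the "A" key locally:

import org.apache.spark.util.StatCounter;

public class StatCounterCheck {
    public static void main(String[] args) {
        // StatCounter tracks count, mean and variance incrementally.
        StatCounter acc = new StatCounter();
        acc.merge(8.0);  // the two "A" values from the data set above
        acc.merge(2.0);
        System.out.println(acc.mean());   // 5.0
        System.out.println(acc.stdev());  // 3.0 (population stdev)
    }
}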