Created February 22, 2017 02:53
Disjoint (additional) objects count with Apache Spark.
// This method uses a window function to eliminate double counting of objects
// that belong to multiple groups. `groups` is a DataFrame with two columns:
// `id` and `group`. The first column identifies the object, the second is the
// group name. One use of this method is customer segmentation.
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{countDistinct, dense_rank}
import spark.implicits._  // for the $"col" syntax (already in scope in spark-shell)

// Keep each object in exactly one group: rank its groups alphabetically and keep the first.
val disjointGroups = groups
  .withColumn("_rank", dense_rank().over(Window.partitionBy("id").orderBy("group")))
  .filter($"_rank" === 1)
  .drop("_rank")

// Show disjoint groups with the additional (non-overlapping) count each group contributes.
disjointGroups
  .groupBy("group")
  .agg(countDistinct("id") as "number_records")
  .orderBy("group")
  .show()
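
For reference, a minimal sketch of how this could be exercised in spark-shell. The `groups` DataFrame here is a hypothetical example, not part of the original snippet; it assumes an object may appear in several groups and is kept only under its alphabetically first group.

// Hypothetical input: object 1 belongs to both "premium" and "retail",
// so it should be counted only once, under "premium".
val groups = Seq(
  (1, "premium"), (1, "retail"),
  (2, "retail"),
  (3, "premium")
).toDF("id", "group")

// Running the snippet above on this input would show:
// +-------+--------------+
// |  group|number_records|
// +-------+--------------+
// |premium|             2|
// | retail|             1|
// +-------+--------------+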