@saswata-dutta · Created February 26, 2021 12:34
Select the latest record per id while merging deltas or snapshots.
// Assumes a spark-shell / SparkSession context where spark.implicits._ is in scope
// (needed for toDF and the $"..." column syntax).
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{broadcast, max, row_number}

// Sample data: several versions of each id, distinguished by time_stamp.
val df = Seq(
  (1, "foo", 123L),
  (2, "foo", 123L),
  (3, "foo", 123L),
  (4, "foo", 123L),
  (3, "foo", 124L),
  (2, "foo", 123L),
  (1, "foo", 122L),
  (1, "foo", 120L)
).toDF("id", "meta", "time_stamp")
  .repartition(5)
// Approach 1: window function. All rows for an id land in one window partition,
// so this is inefficient when the data is significantly skewed towards a few ids.
val w = Window.partitionBy($"id").orderBy($"time_stamp".desc)
val dfTopRows1 = df
  .withColumn("rn", row_number().over(w))
  .where($"rn" === 1)
  .drop("rn")
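For the sample data above this should keep exactly one row per id at its latest time_stamp. The expected output below is worked out by hand from the input rows, not captured from a run:

dfTopRows1.orderBy($"id").show()
// +---+----+----------+
// | id|meta|time_stamp|
// +---+----+----------+
// |  1| foo|       123|
// |  2| foo|       123|
// |  3| foo|       124|
// |  4| foo|       123|
// +---+----+----------+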
// Approach 2: use joins and aggregates. Compute the max time_stamp per id, then
// join it back to pick the matching rows. The broadcast hint avoids shuffling df
// when the aggregated side is small; dropDuplicates keeps one row per id when
// several rows share the same max time_stamp.
val dfMax = df.groupBy($"id").agg(max($"time_stamp").as("max_time_stamp"))
val dfTopRows2 = df
  .join(broadcast(dfMax), Seq("id"), "inner")
  .where($"time_stamp" === $"max_time_stamp")
  .drop("max_time_stamp")
  .dropDuplicates("id")
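A common third variant, not part of the original gist, avoids the join entirely: take the max of a struct whose first field is time_stamp, since Spark orders structs field by field. A minimal sketch, assuming every payload column can be packed into the struct (dfTopRows3 is a hypothetical name):

import org.apache.spark.sql.functions.struct

// Single aggregation: the struct with the largest time_stamp wins,
// carrying its meta value along with it.
val dfTopRows3 = df
  .groupBy($"id")
  .agg(max(struct($"time_stamp", $"meta")).as("latest"))
  .select($"id", $"latest.meta".as("meta"), $"latest.time_stamp".as("time_stamp"))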