
@dmateusp
Created October 10, 2022 20:31
Sorting a DataFrame before writing it out in Spark to compare file sizes. Sorting on a low-cardinality column like `type` groups equal values together, so Parquet's dictionary and run-length encodings can represent them more compactly, even with compression disabled.
lazy val root = (project in file("."))
  .settings(
    name := "Scala",
    scalaVersion := "2.12.17", // Spark 3.3.0 is published for Scala 2.12 and 2.13
    libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.3.0"
  )
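Assuming the standard sbt layout (this build.sbt at the project root, the source file under src/main/scala), the example can be run with `sbt run`.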
package main

import org.apache.spark.sql.{SaveMode, SparkSession}

case class User(id: Int, `type`: String)

object Main {
  val spark = SparkSession
    .builder()
    .appName("Spark SQL basic example")
    .master("local[*]")
    .getOrCreate()

  def main(args: Array[String]): Unit = {
    // Build 10,000 users whose `type` column cycles through three values,
    // giving the column low cardinality.
    val df = spark.createDataFrame(Stream.range(0, 10000).map { i =>
      User(id = i, `type` = i % 3 match {
        case 1 => "EXISTING"
        case 2 => "RETURNING"
        case _ => "NEW"
      })
    })

    // Write the same data twice: once sorted by `type`, once as-is.
    // Compression is disabled so any size difference comes from Parquet's
    // encodings alone.
    df.sort("type").write.mode(SaveMode.Overwrite).option("compression", "none").parquet("/tmp/sorted.parquet")
    df.write.mode(SaveMode.Overwrite).option("compression", "none").parquet("/tmp/unsorted.parquet")

    spark.stop()
  }
}
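To actually compare the two outputs, here is a minimal sketch (not part of the original gist; dirSize is a hypothetical helper) that sums the Parquet part-file sizes under each output directory:

import java.io.File

// Hypothetical helper: total size in bytes of the Parquet part files in a directory.
def dirSize(path: String): Long =
  new File(path)
    .listFiles()
    .filter(f => f.isFile && f.getName.endsWith(".parquet"))
    .map(_.length())
    .sum

println(s"sorted:   ${dirSize("/tmp/sorted.parquet")} bytes")
println(s"unsorted: ${dirSize("/tmp/unsorted.parquet")} bytes")

The sorted output should come out smaller: with all equal `type` values adjacent, Parquet's run-length encoding of the dictionary indices collapses long runs, whereas the unsorted data alternates between the three values on every row.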