Created
October 10, 2022 20:31
-
-
Save dmateusp/9fc7153af39137e7c6809e07efb35bff to your computer and use it in GitHub Desktop.
Sorting a DataFrame before writing it out in Spark to compare file sizes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// sbt build definition: a single-module project with Spark SQL on the classpath.
lazy val root = (project in file("."))
  .settings(
    name := "Scala",
    // Spark 3.3.0; `%%` appends the Scala binary version to the artifact name
    // so the dependency resolves to the matching spark-sql_2.1x artifact.
    libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.3.0"
  )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import org.apache.spark.sql.SparkSession | |
import org.apache.spark.sql.SaveMode | |
// One synthetic row of the example DataFrame. The field `type` is
// backtick-escaped because `type` is a Scala keyword; Spark will use
// "type" as the column name.
case class User(id: Int, `type`: String)
/** Writes the same 10,000-row DataFrame twice — once sorted by `type`, once
  * unsorted — with compression disabled, so the on-disk Parquet file sizes
  * can be compared (sorting groups equal values, which improves Parquet's
  * run-length/dictionary encoding of the `type` column).
  */
object Main {
  val spark = SparkSession
    .builder()
    .appName("Spark SQL basic example")
    .master("local[*]") // run in-process on all local cores; no cluster needed
    .getOrCreate()

  def main(args: Array[String]): Unit = {
    // 10,000 users cycling deterministically through the three customer types.
    val df = spark.createDataFrame(Stream.range(0, 10000).map(i => User(id = i, `type` = i % 3 match {
      case 1 => "EXISTING"
      case 2 => "RETURNING"
      // Was `case default =>`: a lowercase name is a variable pattern that
      // binds anything; `_` is the idiomatic wildcard and states the intent.
      case _ => "NEW"
    })))
    // Compression disabled so the size difference reflects encoding alone.
    df.sort("type").write.mode(SaveMode.Overwrite).option("compression", "none").parquet("/tmp/sorted.parquet")
    df.write.mode(SaveMode.Overwrite).option("compression", "none").parquet("/tmp/unsorted.parquet")
    // Release the local SparkContext before the JVM exits.
    spark.stop()
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment