Created
August 19, 2016 03:30
-
-
Save vgiri2015/edd5952a6b22dc49c4ac91f2b25aa6a3 to your computer and use it in GitHub Desktop.
File Compression in Spark 2.0
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.spark.sql.{DataFrame, SQLContext, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
/**
  * Created by vgiridatabricks on 8/13/16.
  *
  * Small demo that writes the same in-memory DataFrame to `/tmp` several times,
  * once per (file format, compression codec) combination, so the resulting
  * directories can be compared on disk.
  */
object FileCompression {

  /** One row of the sample data set written by [[main]]. */
  case class DataFrameSample(name: String, actor: String, episodeDebut: String)

  /**
    * Builds a local SparkSession, creates the sample DataFrame and writes it out
    * once for every format/codec pair listed in `targets`.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("Spark File Compression Handling")
      .master("local")
      .getOrCreate()

    try {
      // Cached because the same data is written six times below.
      val df = spark.createDataFrame(Seq(
        DataFrameSample("Homer", "Dan Castellaneta", "Good Night"),
        DataFrameSample("Marge", "Julie Kavner", "Good Night"),
        DataFrameSample("Bart", "Nancy Cartwright", "Good Night"),
        DataFrameSample("Lisa", "Yeardley Smith", "Good Night"),
        DataFrameSample("Maggie", "Liz Georges and more", "Good Night"),
        DataFrameSample("Sideshow Bob", "Kelsey Grammer", "The Telltale Head")
      )).cache()

      // (format, compression codec, output path), written in this order.
      // lzo - requires a different method in terms of implementation.
      // NOTE(review): on Spark versions before 2.3 the "orc" source presumably needs a
      // Hive-enabled session (enableHiveSupport) -- confirm against the Spark version in use.
      val targets = Seq(
        ("parquet", "none", "/tmp/file_no_compression_parq"),
        ("parquet", "gzip", "/tmp/file_with_gzip_parq"),
        ("parquet", "snappy", "/tmp/file_with_snappy_parq"),
        ("orc", "none", "/tmp/file_no_compression_orc"),
        ("orc", "snappy", "/tmp/file_with_snappy_orc"),
        ("orc", "zlib", "/tmp/file_with_zlib_orc")
      )

      targets.foreach { case (format, codec, path) =>
        writeCompressed(df, format, codec, path)
      }
    } finally {
      // Always release the session, even when one of the writes fails.
      spark.stop()
    }
  }

  /**
    * Writes `df` to `path` in the given file format with the given compression
    * codec, replacing whatever already exists at `path`.
    *
    * @param df     data to write
    * @param format data source name passed to `DataFrameWriter.format`
    * @param codec  value for the writer's `compression` option
    * @param path   output directory
    */
  private def writeCompressed(df: DataFrame, format: String, codec: String, path: String): Unit =
    df.write
      .mode("overwrite")
      .format(format)
      .option("compression", codec)
      .save(path)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment