Snippet of a Spark job that merges Parquet files, optionally removing duplicate rows.
// Number of output partitions. This value depends on data and volumes and will differ in every case.
val partitions = 5

// Read the source Parquet files into a DataFrame and register it as a temp view.
val df = spark.read.parquet("URI://path/to/parquet/files/")
df.createOrReplaceTempView("df")

val df_output = spark
  .sql("SELECT DISTINCT * FROM df") // DISTINCT removes duplicates; drop this line if deduplication is not needed
  .coalesce(partitions)

df_output.write.parquet("URI://path/to/destination")
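The same merge can be expressed with the DataFrame API instead of SQL, which avoids the temp view. A minimal sketch, assuming a Spark 2.x+ `SparkSession` named `spark` and the same placeholder paths as above:

// Read, deduplicate, compact into fewer partitions, and write back as Parquet.
spark.read.parquet("URI://path/to/parquet/files/")
  .distinct()             // remove this call if duplicates should be kept
  .coalesce(partitions)   // fewer, larger output files; tune per data volume
  .write
  .mode("overwrite")      // assumption: overwriting the destination is acceptable
  .parquet("URI://path/to/destination")

Note that coalesce only reduces the partition count without a full shuffle; if you need evenly sized output files, repartition(partitions) is an alternative at the cost of a shuffle.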