Created
November 20, 2021 06:08
-
-
Save luketn/d29dd425992efff19874d4fa86260343 to your computer and use it in GitHub Desktop.
Convert CSV to Parquet
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.thing; | |
import com.google.common.io.Files; | |
import org.apache.spark.sql.Dataset; | |
import org.apache.spark.sql.Row; | |
import org.apache.spark.sql.SparkSession; | |
import lombok.extern.slf4j.Slf4j; | |
import java.awt.*; | |
import java.io.File; | |
import java.io.IOException; | |
import java.nio.file.Path; | |
import java.nio.file.Paths; | |
@Slf4j | |
public class CsvToParquet { | |
public static void main(String[] args) throws IOException { | |
// Build the Spark Session | |
SparkSession spark = SparkSession.builder() | |
.appName("CSV to Parquet") | |
.master("local") | |
//.config('job.local.dir', '/Users/luketn/WriteParquet/data/) \ | |
.getOrCreate(); | |
// Read the CSV file into Data Frame | |
Dataset<Row> df = spark.read() | |
.format("csv") | |
.option("header", "true") | |
.load("data/weather.csv"); | |
df.show(5); | |
log.info("The dataframe has " + df.count() + " rows."); | |
File tempDir = Files.createTempDir(); | |
Path parquetPath = Paths.get(tempDir.getAbsolutePath(), "weather.parquet"); | |
// Write as Parquet | |
df.write() | |
.parquet(parquetPath.toString()); | |
log.info("Written Parquet File to \n" + parquetPath); | |
// Reads a Parquet back into Data Frame | |
Dataset<Row> pdf = spark.read() | |
.format("parquet") | |
.load(parquetPath.toString()); | |
pdf.show(10); | |
pdf.printSchema(); | |
log.info("The Parquet dataframe has {} rows", pdf.count()); | |
// Now save a JSON | |
Path jsonPath = Paths.get(tempDir.getAbsolutePath(), "weather.json"); | |
pdf.write() | |
.json(jsonPath.toString()); | |
log.info("Written file as JSON to\n" + jsonPath); | |
Desktop.getDesktop().open(tempDir.getAbsoluteFile()); | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
outlook | temperature | humidity | windy | play | |
---|---|---|---|---|---|
sunny | 85 | 85 | FALSE | no | |
sunny | 80 | 90 | TRUE | no | |
overcast | 83 | 86 | FALSE | yes | |
rainy | 70 | 96 | FALSE | yes | |
rainy | 68 | 80 | FALSE | yes | |
rainy | 65 | 70 | TRUE | no | |
overcast | 64 | 65 | TRUE | yes | |
sunny | 72 | 95 | FALSE | no | |
sunny | 69 | 70 | FALSE | yes | |
rainy | 75 | 80 | FALSE | yes | |
sunny | 75 | 70 | TRUE | yes | |
overcast | 72 | 90 | TRUE | yes | |
overcast | 81 | 75 | FALSE | yes | |
rainy | 71 | 91 | TRUE | no |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment