Last active
April 2, 2023 10:10
-
-
Save dacr/e1f25594268f3883eb255738582eecde to your computer and use it in GitHub Desktop.
Load and process CSV content using spark. / published by https://github.com/dacr/code-examples-manager #a77196dd-c148-4f86-a7b7-e7a1b3805bc2/8e707313d3fe0ce549436b8b6d7e9d3436bb6cf7
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// summary : Load and process CSV content using spark. | |
// keywords : scala, feed, csv, bigdata, spark | |
// publish : gist | |
// authors : David Crosson | |
// license : Apache NON-AI License Version 2.0 (https://raw.githubusercontent.com/non-ai-licenses/non-ai-licenses/main/NON-AI-APACHE2) | |
// id : a77196dd-c148-4f86-a7b7-e7a1b3805bc2 | |
// created-on : 2020-05-31T19:54:52Z | |
// managed-by : https://github.com/dacr/code-examples-manager | |
// execution : scala 2.12 ammonite script (http://ammonite.io/) - run as follows 'amm scriptname.sc'
/* | |
spark 3.1.x (the version imported below) is only published for scala 2.12; scala 2.13 support arrives with spark 3.2
--------------- | |
In REPL mode use AmmoniteSparkSession instead of SparkSession | |
import $ivy.`sh.almond::ammonite-spark:0.7.2` | |
val spark = | |
AmmoniteSparkSession.builder() | |
.master("local[*]") | |
.getOrCreate() | |
*/ | |
import $ivy.`org.apache.spark::spark-sql:3.1.1` | |
import $ivy.`com.github.pathikrit::better-files:3.9.1` | |
import org.apache.spark.sql._ | |
{
  // Create the sample CSV fixture (data.csv) that the Spark reader below loads:
  // one header line followed by three data rows.
  import better.files._
  import better.files.Dsl._
  val csvFile = file"data.csv"
  val csvLines = Seq(
    "id,name,age,gender",
    "1,joe,32,1",
    "2,sarah,42,2",
    "3,john,4,1"
  )
  // Write everything with a single overwrite so each record ends with exactly
  // one "\n". Note: better-files `<<` (appendLine) adds its own line separator,
  // so appending rows that already end with "\n" would leave a blank line
  // after every record.
  csvFile.overwrite(csvLines.mkString("", "\n", "\n"))
}
// Spark session running in local mode ("local[*]" master); an already
// existing session is reused if there is one.
val spark = SparkSession.builder().master("local[*]").getOrCreate()

// Convenience accessor for the SparkContext backing the session.
def sc: org.apache.spark.SparkContext = spark.sparkContext
// Load data.csv as a DataFrame: comma separated, first line used as the
// header, column types inferred from the data.
val csvDataFrame =
  spark.read
    .options(Map("sep" -> ",", "inferSchema" -> "true", "header" -> "true"))
    .csv("data.csv")

// Sanity checks on the loaded content: three records and four columns.
assert(csvDataFrame.count() == 3L, "Didnt get the right row numbers")
assert(csvDataFrame.columns.length == 4, "Didnt get the right column numbers")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment