Last active
February 3, 2026 20:25
-
-
Save dacr/e1f25594268f3883eb255738582eecde to your computer and use it in GitHub Desktop.
Load and process CSV content using spark. / published by https://github.com/dacr/code-examples-manager #a77196dd-c148-4f86-a7b7-e7a1b3805bc2/3835d318b5e9ec7eb7470486c38685627cf85e3f
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // summary : Load and process CSV content using spark. | |
| // keywords : scala, feed, csv, bigdata, spark | |
| // publish : gist | |
| // authors : David Crosson | |
| // license : Apache License Version 2.0 (https://www.apache.org/licenses/LICENSE-2.0.txt) | |
| // id : a77196dd-c148-4f86-a7b7-e7a1b3805bc2 | |
| // created-on : 2020-05-31T19:54:52Z | |
| // managed-by : https://github.com/dacr/code-examples-manager | |
| // execution : scala 2.12 ammonite script (http://ammonite.io/) - run as follows 'amm scriptname.sc' | |
| /* | |
| spark 3.1.1 (imported below) is only published for scala 2.12; scala 2.13 support arrives with spark 3.2.x | |
| --------------- | |
| In REPL mode use AmmoniteSparkSession instead of SparkSession | |
| import $ivy.`sh.almond::ammonite-spark:0.7.2` | |
| val spark = | |
| AmmoniteSparkSession.builder() | |
| .master("local[*]") | |
| .getOrCreate() | |
| */ | |
| import $ivy.`org.apache.spark::spark-sql:3.1.1` | |
| import $ivy.`com.github.pathikrit::better-files:3.9.1` | |
| import org.apache.spark.sql._ | |
// Create the sample CSV file "data.csv" in the current working directory.
// The braces keep the better-files imports and `csvFile` out of the rest of the script.
{
  import better.files._
  import better.files.Dsl._
  val csvFile = file"data.csv"
  // `<` overwrites the file with the given text verbatim, so the header carries its own newline.
  csvFile < "id,name,age,gender\n"
  // `<<` appends a *line*: better-files writes the line separator itself, so the rows
  // must not end with "\n" (that would leave an empty line after every record).
  csvFile << "1,joe,32,1"
  csvFile << "2,sarah,42,2"
  csvFile << "3,john,4,1"
}
// Spark session used by the rest of the script: configured with master "local[*]"
// and obtained through getOrCreate().
val spark = {
  val sessionBuilder = SparkSession.builder()
  sessionBuilder.master("local[*]").getOrCreate()
}

// Shortcut to the session's SparkContext (a `def`, so it is looked up on each call).
def sc = spark.sparkContext
// Load "data.csv" as a DataFrame: first line is the header, fields are comma-separated,
// and schema inference is switched on.
val csvDataFrame = {
  val csvReader = spark.read.format("csv")
  csvReader
    .option("header", "true")
    .option("inferSchema", "true")
    .option("sep", ",")
    .load("data.csv")
}
// Sanity checks on the loaded data: 3 rows and 4 columns are expected.
// The session is stopped in `finally` so it is released even when an assertion fails.
try {
  assert(csvDataFrame.count() == 3, "Didnt get the right row numbers")
  assert(csvDataFrame.columns.length == 4, "Didnt get the right column numbers")
} finally spark.stop()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment