Created
June 21, 2017 20:42
-
-
Save girisandeep/fddf49ef97fde429a0d3256160b257c1 to your computer and use it in GitHub Desktop.
Instead of using map, this version uses mapPartitions, so a single CSVParser is created per partition and reused for every line in it. It is better than csv-parsing-ex2.scala.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// CSV parsing program using the opencsv library.
// Run with: spark-shell --packages net.sf.opencsv:opencsv:2.3
// Or
// add this to sbt: libraryDependencies += "net.sf.opencsv" % "opencsv" % "2.3"
import au.com.bytecode.opencsv.CSVParser | |
// Load the input file as an RDD of text lines.
// Bound with `val` because the reference is never reassigned in this script.
val linesRdd = sc.textFile("/data/spark/temps.csv")
/** Splits each incoming line into its comma-separated fields.
  *
  * One `CSVParser` is constructed per call and reused for every line, so when
  * this function is handed to `mapPartitions` the parser is built once per
  * partition instead of once per record.
  *
  * The result is lazy: a line is only parsed when the returned iterator is
  * advanced, and each element is parsed independently of the others.
  *
  * @param itr the lines to parse
  * @return an iterator yielding the parsed fields of each line, in input order
  */
def parseCSV(itr: Iterator[String]): Iterator[Array[String]] = {
  val parser = new CSVParser(',')
  // Explicit lambda (rather than a method reference) keeps the call bound to
  // the single-argument parseLine.
  itr.map(line => parser.parseLine(line))
}
// Sanity check on a small in-memory example; the iterator is lazy, so the
// lines are parsed only when `x` is consumed.
val x = parseCSV(Array("1,2,3", "a,b,c").iterator)

// Apply the parser partition by partition. mapPartitions is a transformation,
// so this only describes the resulting RDD; nothing is read or parsed until an
// action is invoked on it.
linesRdd.mapPartitions(parseCSV)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment