Skip to content

Instantly share code, notes, and snippets.

@josep2
Created January 9, 2017 03:28
Show Gist options
  • Save josep2/c8b520d5b3d87e59f598f45e5fc2832d to your computer and use it in GitHub Desktop.
Save josep2/c8b520d5b3d87e59f598f45e5fc2832d to your computer and use it in GitHub Desktop.
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
/**
 * One parsed row of the tab-separated panel data that `Demo` loads.
 *
 * Every column is kept as the raw string taken from the input line except
 * `syncable`, which is converted to an `Int`. `Demo` fills the fields, in
 * declaration order, from columns 0, 1 and 3 through 9 of each split line.
 *
 * NOTE(review): the meaning and format of the individual columns (date
 * formats, what `prop_30d_syncable` measures, the value range of `syncable`)
 * are not documented in this file -- confirm against the data source.
 */
case class Panel(
  user_id: String,
  date_joined: String,
  zip_shipping: String,
  date_newest_receipt: String,
  date_oldest_receipt: String,
  prop_30d_syncable: String,
  date_last_sync: String,
  isp: String,
  syncable: Int
)
/**
 * Spark job that loads the same tab-separated panel data from two storage
 * back-ends -- Amazon S3 (through the `s3n` connector) and Alluxio -- parses
 * every line into a [[Panel]] and prints the number of rows read from each.
 *
 * The S3 credentials are taken from the `AWS_ACCESS_KEY_ID` and
 * `AWS_SECRET_ACCESS_KEY` environment variables so that no secret lives in
 * the source. The two input paths are placeholders and must be replaced with
 * real locations before the job is run.
 *
 * An explicit `main` method is used rather than `scala.App`: Spark's
 * documentation warns that subclasses of `scala.App` may not work correctly.
 */
object Demo {

  /** Names of the environment variables that supply the S3 credentials. */
  private val AccessKeyEnv = "AWS_ACCESS_KEY_ID"
  private val SecretKeyEnv = "AWS_SECRET_ACCESS_KEY"

  /**
   * Entry point: configures S3 access, counts both inputs, then stops Spark.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // No master is set here; it is expected to come from spark-submit or the
    // Spark configuration of the environment the job runs in.
    val sparkSession = SparkSession.builder().appName("Demo").getOrCreate()

    try {
      val hadoopConf = sparkSession.sparkContext.hadoopConfiguration
      hadoopConf.set("fs.s3n.awsAccessKeyId", requiredEnv(AccessKeyEnv))
      hadoopConf.set("fs.s3n.awsSecretAccessKey", requiredEnv(SecretKeyEnv))

      println(countPanels(sparkSession, "s3n://PATH_TO_AMAZON_S3_ZIP_FILE"))
      println(countPanels(sparkSession, "alluxio://PATH_TO_ALLUXIO_ZIP_FILE"))
    } finally {
      // Release the Spark resources even when one of the jobs above fails.
      sparkSession.stop()
    }
  }

  /**
   * Reads the text file at `path`, parses each line into a [[Panel]] and
   * returns how many rows were parsed.
   *
   * Parsing is strict: a line with fewer than ten tab-separated columns, or
   * whose tenth column is not an integer, makes the Spark job fail with the
   * underlying `ArrayIndexOutOfBoundsException` / `NumberFormatException`.
   *
   * @param sparkSession session whose context performs the read
   * @param path         URI of the tab-separated input
   * @return number of rows parsed from `path`
   */
  private def countPanels(sparkSession: SparkSession, path: String): Long =
    sparkSession.sparkContext
      .textFile(path)
      .map(_.split("\\t"))
      // Column 2 is not mapped to any Panel field.
      // NOTE(review): presumably intentional -- confirm against the input schema.
      .map { fields =>
        Panel(fields(0), fields(1), fields(3), fields(4), fields(5), fields(6), fields(7),
          fields(8), fields(9).toInt)
      }
      .count()

  /**
   * Returns the value of the environment variable `name`, aborting with a
   * descriptive error when it is not set.
   */
  private def requiredEnv(name: String): String =
    sys.env.getOrElse(name, sys.error(s"Environment variable $name must be set"))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment