Created
March 9, 2017 23:47
-
-
Save josep2/a1360f7e67c6e24bd10bbc32ed0a26d4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// From https://github.com/amplab/succinct
//
// Example: write a small DataFrame in the Succinct format, read it back,
// and run filter + projection queries against it. Intended to be pasted into
// a Spark shell where `sc` (SparkContext) and `sqlContext` (SQLContext) are
// already defined.
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import edu.berkeley.cs.succinct.sql._

// Create a schema. The third StructField argument is the column's nullability.
val citySchema = StructType(Seq(
  StructField("Name", StringType, nullable = false),
  StructField("Length", IntegerType, nullable = true),
  StructField("Area", DoubleType, nullable = false),
  StructField("Airport", BooleanType, nullable = true)))

// Create an RDD of Rows with some data; sc is the SparkContext.
// Each Row lists its values in the same order as the schema fields above.
val cityRDD = sc.parallelize(Seq(
  Row("San Francisco", 12, 44.52, true),
  Row("Palo Alto", 12, 22.33, false),
  Row("Munich", 8, 3.14, true)))

// Create a data frame from the RDD and the schema
val cityDataFrame = sqlContext.createDataFrame(cityRDD, citySchema)

// Save the DataFrame in the "Succinct" format
cityDataFrame.write.format("edu.berkeley.cs.succinct.sql").save("/path/to/data")

// Read the Succinct DataFrame from the saved path.
// NOTE(review): succinctTable is not a stock SQLContext method; presumably it
// is added by the implicits imported from edu.berkeley.cs.succinct.sql._ -- confirm.
val succinctCities = sqlContext.succinctTable("/path/to/data")

// Filter and prune
val bigCities = succinctCities.filter("Area >= 22.0").select("Name").collect()

// Alternately, use the DataFrameReader API.
// The data was already written to "/path/to/data" above, so it is not saved a
// second time here: writing again to an existing path is redundant and, with
// Spark's default save mode (ErrorIfExists), is expected to fail.
val succinctCities2 = sqlContext.read.format("edu.berkeley.cs.succinct.sql").load("/path/to/data")
val smallCities = succinctCities2.filter("Area <= 10.0").select("Name").collect()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment