Created
October 26, 2015 11:00
-
-
Save hrbrmstr/788a6fd4e2dc80d5d3db to your computer and use it in GitHub Desktop.
Sample code for working with Apache Spark (v1.4), SparkR and Parquet files from RStudio
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sample workflow: load a Parquet file into Spark via SparkR, aggregate it on
# the Spark side, then pull the (small) result into R and plot it.
#
# See the GitHub repo & package documentation:
# - http://github.com/apache/spark/tree/master/R
# - http://spark.apache.org/docs/latest/api/R/

# Install the SparkR package, but only when it is missing, so that re-running
# the script does not rebuild the package from GitHub every time.
# NOTE(review): the calls below (sparkR.init, sparkRSQL.init, parquetFile) are
# the Spark 1.4-era API; `ref = "master"` may no longer match it. Consider
# pinning `ref` to a 1.4 release tag/branch -- TODO confirm.
if (!requireNamespace("SparkR", quietly = TRUE)) {
  devtools::install_github("apache/spark", ref = "master", subdir = "R/pkg")
}

# Load the SparkR & ggplot2 packages.
library("SparkR")
library("ggplot2")

# Initialize the sparkContext, which starts a new local Spark session.
sc <- sparkR.init(master = "local")

# Initialize the sqlContext on top of the sparkContext.
sq <- sparkRSQL.init(sc)

# Load a parquet file into a Spark DataFrame. The data is available at
# https://dl.dropboxusercontent.com/u/101047187/gp-reg-clean.zip
# The path is relative, so it is resolved against the current working
# directory.
df1 <- parquetFile(sq, "Downloads/opendata/hscic-gpreg/gp-reg-clean")

# Print the schema.
printSchema(df1)

# Print the first 20 rows (the default), then the first 50 rows.
showDF(df1)
showDF(df1, numRows = 50)

# Run an aggregate query on the Spark DataFrame: total patients per ccg_code.
df2 <- agg(groupBy(df1, "ccg_code"), patients = sum(df1$patients))

# Collect all the elements of the Spark DataFrame into a local R data.frame.
df3 <- collect(df2)

# Visualise the R data.frame.
qplot(ccg_code, patients, data = df3)

# Terminate the Spark session.
sparkR.stop()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment