Skip to content

Instantly share code, notes, and snippets.

@shivaram
Created July 15, 2015 18:17
Show Gist options
  • Save shivaram/fe621d1ba480c77723f9 to your computer and use it in GitHub Desktop.
Save shivaram/fe621d1ba480c77723f9 to your computer and use it in GitHub Desktop.
SparkR 1.4.1 Demo
# Start SparkR with the spark-csv package and load the flights data set.
#
# The `sparkPackages=` argument of sparkR.init() was only added in Spark 1.4.1.
# On Spark 1.4.0, launch the SparkR shell with the package on the command line
# instead:
#
#   ./bin/sparkR --packages com.databricks:spark-csv_2.10:1.0.3
#
# On Spark 1.4.1 the sparkR.init() call below works as written.

# `spark_link` (the Spark master URL handed to sparkR.init) is not defined
# anywhere in this script. Keep a value that already exists in the session;
# otherwise fall back to the MASTER environment variable, and finally to local
# mode, so the demo does not stop with "object 'spark_link' not found".
if (!exists("spark_link")) {
  spark_link <- Sys.getenv("MASTER", unset = "local[*]")
}

sc <- sparkR.init(
  spark_link,
  sparkPackages = "com.databricks:spark-csv_2.10:1.0.3"
)
sqlContext <- sparkRSQL.init(sc)

# Read the NYC flights CSV from S3 through the spark-csv data source;
# `header = "true"` is forwarded to that data source as an option.
flights <- read.df(
  sqlContext,
  "s3n://sparkr-data/nycflights13.csv",
  "com.databricks.spark.csv",
  header = "true"
)

# Print the first few rows
head(flights)
# Most frequent destinations for flights leaving JFK.
# Step 1: keep only the rows whose origin is "JFK".
jfk_flights <- filter(flights, flights$origin == "JFK")
# Step 2: group those flights by destination and count the rows per group.
dest_flights <- agg(
  group_by(jfk_flights, jfk_flights$dest),
  count = n(jfk_flights$dest)
)
# Step 3: order by the `count` column, largest first, and print the first rows.
head(arrange(dest_flights, desc(dest_flights$count)))
# Sample output as recorded in the original demo:
## dest count
##1 LAX 11262
##2 SFO 8204
##3 BOS 5898
# SQL queries: register `flights` under a table name, then query it by name.
registerTempTable(flights, "flightsTable")
delayDF <- sql(
  sqlContext,
  "SELECT dest, arr_delay FROM flightsTable"
)
# Adding and removing columns: `air_time_hr` is `air_time` divided by 60;
# assigning NULL to the same column afterwards drops it again.
flights$air_time_hr <- flights$air_time / 60
flights$air_time_hr <- NULL
# The same JFK-destinations query expressed as a magrittr pipeline, followed by
# a bar chart of the leading destinations.
library(magrittr)
dest_flights <-
  filter(flights, flights$origin == "JFK") %>%
  group_by(flights$dest) %>%
  summarize(count = n(flights$dest))
# Sort by flight count (descending) and pull the first rows back with head().
top_dests <- dest_flights %>%
  arrange(desc(dest_flights$count)) %>%
  head()
# One bar per destination, labelled with the destination code.
barplot(top_dests$count, names.arg = top_dests$dest)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment