| title | Long Running Calculations |
|---|---|
| output | html_notebook |
The following examples execute long-running jobs in R.
This sequence will output integers from 1 to 120 once per second for 2 minutes.
| library(sparklyr) | |
| library(ggplot2) | |
| library(dplyr) | |
| # Copy data | |
| sc <- spark_connect("local", version = "1.6.1") | |
| mtcars_tbl1 <- copy_to(sc, mtcars) | |
| ### No Pipes ### |
| library(sparklyr) | |
| library(dplyr) | |
| # Setup | |
| Sys.setenv(SPARK_HOME="/usr/lib/spark") | |
| config <- spark_config() | |
| config[["spark.executor.memory"]] <- "4G" | |
| # Cache entire trips table into Spark DataFrame | |
| system.time(sc <- spark_connect(master = "yarn-client", config = config, version = '1.6.2')) # 22 seconds |
| --- | |
| title: "Analysis of flights data using Apache Spark" | |
| output: html_notebook | |
| --- | |
| ```{r setup, include=FALSE} | |
| knitr::opts_chunk$set(echo = TRUE, eval=TRUE) | |
| ``` | |
| ## Motivation |
| ## Load data from CSV into Hive | |
| CREATE EXTERNAL TABLE IF NOT EXISTS trips( | |
| id int, | |
| cab_type_id bigint, | |
| vendor_id string, | |
| pickup_datetime timestamp, | |
| dropoff_datetime timestamp, | |
| store_and_fwd_flag string, | |
| rate_code_id string, |
| library(sparklyr) | |
| library(dplyr) | |
| # Install spark and hadoop dependencies | |
| spark_install(version="2.0.0-preview", hadoop_version="2.7") | |
| # Download file | |
| fileIn <- "https://nycopendata.socrata.com/api/views/h9gi-nx95/rows.csv?accessType=DOWNLOAD" | |
| fileOut <- "NYPD_Motor_Vehicle_Collisions_RAW.csv" | |
| download.file(fileIn, fileOut) |
| --- | |
| title: "Analysis of flights data using Apache Spark" | |
| output: html_notebook | |
| --- | |
| ```{r setup, include=FALSE} | |
| knitr::opts_chunk$set(echo = TRUE, eval=TRUE) | |
| ``` | |
| ## Motivation |
| --- | |
| title: "Analysis of flights data using Apache Spark" | |
| output: html_notebook | |
| --- | |
| ```{r setup, include=FALSE} | |
| knitr::opts_chunk$set(echo = TRUE, eval=TRUE) | |
| ``` | |
| ## Motivation |
| # List files | |
| declare -a arr=( | |
| "nyc_taxi_trips_2009-01.csv" | |
| "nyc_taxi_trips_2009-02.csv" | |
| "nyc_taxi_trips_2009-03.csv" | |
| "nyc_taxi_trips_2009-04.csv" | |
| "nyc_taxi_trips_2009-05.csv" | |
| "nyc_taxi_trips_2009-06.csv" | |
| "nyc_taxi_trips_2009-07.csv" |
| ================ | |
| RSP build on AWS | |
| ================ | |
| 1. Build a new instance | |
| a. Sign in to the AWS console | |
| b. Select Ubuntu 64 bit | |
| c. Select free tier | |
| d. Add 12 GB storage |