## Code presented at the R workshop of the CRUNCH 2017 conference: http://crunchconf.com
## intro slides: http://bit.ly/CRUNCH-R-2017

## basic operations
1 + 3
3 * 2
3^2

## constants
pi
"pi"
'pi'
letters
LETTERS
month.name
month.abb

## help
?pi
apropos('month')
apropos('cor')

## variables
x = 4
x <- 4
x
x / 2
x^2
x^0.5

## functions
sqrt(x)
sqrt(x = x)
sin(pi / 2)
round(pi)
ceiling(pi)
floor(pi)

## custom functions
f <- function(x) 2 * x + 1
f(5)
f(pi)
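## extra note (not in the original workshop code): arithmetic is vectorized
## in R, so the same function also works element-wise on a whole vector
f(1:3)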
## vectors
c(1, 2, 3, 4, 5)
1:5
seq(1, 5)
?seq
seq(1, 5, 0.1)
x <- seq(1, 5, 0.1)
plot(x, f(x))
plot(x, f(x), type = 'l')
plot(x, f(x), type = 'l', xlab = '', main = '2x+1')
grid()

## TODO plot 1 period of sine curve
?sin
f <- function(x) sin(x)
x <- seq(0, pi * 2, 0.1)
plot(x, sin(x), type = 'l')

## a simpler way for plotting functions
curve(f)
?curve
curve(2*x + 1, from = 0, to = 50)
curve(x + 1, add = TRUE, col = 'red')

## TODO plot 1 period of sine curve
curve(sin, from = 0, to = 2*pi)

## TODO Brownian motion in 1D
runif(1)
runif(5)

## random numbers from uniform distribution
round(runif(5))
round(runif(5)) * 2 - 1
cumsum(round(runif(25)) * 2 - 1)
plot(cumsum(round(runif(25)) * 2 - 1), type = 's')
set.seed(42)
plot(cumsum(round(runif(25)) * 2 - 1), type = 's')
for (i in 2:6) {
  lines(cumsum(round(runif(25)) * 2 - 1), type = 's', col = i)
}
## pro tip #1
plot(c(1, 25), c(-25, 25), type = 'n')
plot(NULL, NULL, xlim = c(1, 25), ylim = c(-25, 25))

## pro tip #2
sample(c(-1, 1), 25, replace = TRUE)
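## extra sketch (not part of the original demo): putting the two tips together,
## drawing the +/-1 steps directly with sample() and overlaying a few walks
plot(NULL, NULL, xlim = c(1, 25), ylim = c(-25, 25),
     xlab = 'step', ylab = 'position')
for (i in 1:6) {
  lines(cumsum(sample(c(-1, 1), 25, replace = TRUE)), type = 's', col = i)
}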

################################################################################

## custom vectors -> combine values into vector
h <- c(174, 170, 160)
h
(w <- c(90, 80, 70))
plot(h, w)
plot(h, w, main = "Demo", xlab = 'Height', ylab = 'Weight')
cor(w, h)
lm(w ~ h) # formula notation
fit <- lm(w ~ h)
summary(fit)
plot(h, w, main = "Demo", xlab = 'Height', ylab = 'Weight')
abline(fit, col = 'red')

## some basic modeling
predict(fit, list(h = c(56, 104)))
56 * fit$coefficients[2] + fit$coefficients[1]
56 * 1.3462 + -146.1538

## why?
plot(h, w, main = "Demo", xlab = 'Height', ylab = 'Weight',
     xlim = c(0, max(h)),
     ylim = c(0, max(w)))
abline(fit, col = 'red')

## polynomial model
fit <- lm(w ~ poly(h, 2, raw = TRUE))
fit <- lm(w ~ h + I(h^2))
fit
2824 + 174 * -34.3571 + (174^2) * 0.1071
plot(h, w)
lines(h, predict(fit), col = 'red')
predict(fit, data.frame(h = c(56, 104)))
plot(0:200, predict(fit, list(h = 0:200)), type = 'l')
abline(v = c(160, 174), col = 'red')

## data.frame
df <- data.frame(weight = w, height = h)
df
str(df)
df$weight
?'$'
plot(df)
cor(df)
lm(weight ~ height, data = df)
str(df)
## indexing: df[i, j] selects row i and column j
df[1, ]
df[, 1]
df[3, 2]
df$weight
df$weight[2]

## Body Mass Index (BMI) is a person's weight in kilograms divided by the square of their height in meters
df$height <- df$height / 100
df
df$bmi <- df$weight / df$height^2
df
## the same in a single step, had height been left in cm:
df$bmi <- df$weight / (df$height / 100)^2
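## extra sketch (not in the original workshop code): label BMI categories with
## cut() using the standard WHO cutoffs; height is already in meters here,
## and bmi_cat is just an illustrative column name
df$bmi <- df$weight / df$height^2
df$bmi_cat <- cut(df$bmi, breaks = c(0, 18.5, 25, 30, Inf),
                  labels = c('underweight', 'normal', 'overweight', 'obese'))
df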

## import data
df <- read.csv('http://bit.ly/heightweight')
str(df)

## TODO convert the imperial units to metric
df$height <- df$heightIn * 2.54
df$weight <- df$weightLb * 0.45
str(df)
df$heightIn <- df$weightLb <- NULL
str(df)

## descriptive stats
min(df$ageYear)
max(df$ageYear)
range(df$ageYear)
sum(df$weight) # weightLb was dropped above, so sum the converted column
length(df$weight)
nrow(df)
ncol(df)
dim(df)

## TODO plot and analyze association between height and weight
plot(df$height, df$weight)
abline(lm(weight ~ height, data = df), col = 'red')

## TODO compute and plot BMI index
df$bmi <- df$weight / (df$height / 100)^2
hist(df$bmi)

## exploratory data analysis
pairs(df)
library(GGally)
ggpairs(df)
library(pairsD3)
pairsD3(df)

## statistical test: t-test, ANOVA
t.test(height ~ sex, data = df)
aov(height ~ sex, data = df)
summary(aov(height ~ sex, data = df))
summary(aov(weight ~ sex, data = df))

## Post hoc tests => Tukey Honest Significant Differences
TukeyHSD(aov(height ~ sex, data = df))
TukeyHSD(aov(weight ~ sex, data = df))
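## extra sketch (not from the original workshop code): the post hoc
## comparisons also have a built-in plot method
plot(TukeyHSD(aov(height ~ sex, data = df)))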

## #############################################################################
## Wilkinson: The Grammar of Graphics
## Wickham: ggplot2 -> layer by layer:
##   - coordinate system
##   - aesthetics (labels)
##   - scales
##   - geometries
##   - shapes
##   - statistics
##   - grid, color theme etc. + layers
## http://www.cookbook-r.com/Graphs/
## #############################################################################
library(ggplot2)
?diamonds

## barplot
ggplot(diamonds, aes(x = cut)) + geom_bar()
ggplot(diamonds, aes(x = cut)) + geom_bar() + theme_bw()
ggplot(diamonds, aes(x = cut)) +
  geom_bar(colour = "darkgreen", fill = "white") +
  theme_bw()
p <- ggplot(diamonds, aes(x = cut)) +
  geom_bar(colour = "darkgreen", fill = "white") + theme_bw()
p

## coord transformations
library(scales)
p + scale_y_log10()
p + scale_y_sqrt()
p + scale_y_reverse()
p + coord_flip()

## facet
p + facet_wrap(~ clarity)
p + facet_grid(color ~ clarity)

## stacked, clustered bar charts
p <- ggplot(diamonds, aes(x = color, fill = cut))
p + geom_bar()
p + geom_bar(position = "fill")
p + geom_bar(position = "dodge")

## histogram
ggplot(diamonds, aes(x = price)) + geom_histogram(binwidth = 100)
ggplot(diamonds, aes(x = price)) + geom_histogram(binwidth = 1000)
ggplot(diamonds, aes(x = price)) + geom_histogram(binwidth = 10000)
ggplot(diamonds, aes(price, fill = cut)) + geom_density(alpha = 0.2) + theme_bw()

## scatterplot
p <- ggplot(diamonds, aes(x = carat, y = price)) + geom_point()

## adding layers
p
p <- p + geom_smooth()
p
p + geom_smooth(method = "lm", se = FALSE, color = 'red')
ggplot(diamonds, aes(x = carat, y = price, colour = cut)) + geom_point() + geom_smooth()

## themes
library(ggthemes)
p + theme_economist() + scale_colour_economist()
p + theme_stata() + scale_colour_stata()
p + theme_excel() + scale_colour_excel()
p + theme_wsj() + scale_colour_wsj("colors6", "")
p + theme_gdocs() + scale_colour_gdocs()
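## side note (not in the original workshop code): ggsave() writes the most
## recently displayed plot to disk, inferring the format from the extension;
## the file name below is just an example
ggsave('diamonds-themed.png', width = 8, height = 6)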
## Further reading at http://www.cookbook-r.com/Graphs/

## TODO on mtcars
## * number of carburetors
ggplot(mtcars, aes(x = carb)) + geom_bar()
## * hp
ggplot(mtcars, aes(x = hp)) + geom_histogram()
## * barplot of carburetors per am
ggplot(mtcars, aes(x = carb)) + geom_bar() + facet_grid(. ~ am)
## stacked bar chart
ggplot(mtcars, aes(x = carb, fill = factor(am))) + geom_bar()
## grouped/dodged
ggplot(mtcars, aes(x = carb, fill = factor(am))) + geom_bar(position = 'dodge')
## * boxplot of hp by carb
ggplot(mtcars, aes(x = factor(carb), y = hp)) + geom_boxplot()
## * hp and wt by carb
ggplot(mtcars, aes(x = hp, y = wt)) + geom_point() + facet_grid(. ~ carb)
## * hp and wt by carb regression
ggplot(mtcars, aes(x = hp, y = wt)) + geom_point() + facet_grid(. ~ carb) + geom_smooth()

## #############################################################################
## PCA demo on image processing
## #############################################################################
## http://www.nasa.gov/images/content/694811main_pia16225-43_full.jpg
## http://bit.ly/BudapestBI-R-img
download.file('http://bit.ly/BudapestBI-R-img', 'image.jpg')
library(jpeg)
img <- readJPEG('image.jpg')
str(img)
dim(img)
h <- dim(img)[1]
w <- dim(img)[2]
h; w
img1d <- matrix(img, h * w)
str(img1d)
pca <- prcomp(img1d)
pca
summary(pca)
image(matrix(pca$x[, 1], h))
image(matrix(pca$x[, 2], h))
image(matrix(pca$x[, 2], h), col = gray.colors(100))
image(matrix(pca$x[, 3], h))
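## extra sketch (not in the original workshop code): reconstruct the image
## from the first principal component only, then look at the red channel;
## `recon` is just an illustrative name
recon <- pca$x[, 1, drop = FALSE] %*% t(pca$rotation[, 1, drop = FALSE])
recon <- sweep(recon, 2, pca$center, '+') # add back the column means
image(matrix(recon[, 1], h), col = gray.colors(100))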

## #############################################################################
## MDS
## #############################################################################
## distance between 40 Hungarian cities -> 2D scatterplot
download.file('http://bit.ly/CEU-R-distances', 'cities.xls')
library(readxl)
cities <- read_excel('cities.xls')
str(cities)
## get rid of the 1st column and the last row (metadata)
cities <- cities[, -1]
cities <- cities[-nrow(cities), ]
mds <- cmdscale(as.dist(cities))
mds
plot(mds)
text(mds[, 1], mds[, 2], names(cities))
## flip both the x and y axes
mds <- -mds
plot(mds)
text(mds[, 1], mds[, 2], names(cities))

## distance between European cities -> 2D scatterplot with ggplot
mds <- cmdscale(eurodist)
mds
mds <- as.data.frame(mds)
library(ggplot2)
ggplot(mds, aes(V1, -V2, label = rownames(mds))) + geom_text()

## non-geo stuff
str(mtcars)
?mtcars
mds <- cmdscale(dist(mtcars))
plot(mds)
text(mds[, 1], mds[, 2], rownames(mtcars))
mds <- as.data.frame(mds)
library(ggplot2)
ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) +
  geom_text() + theme_bw()
library(ggrepel)
ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) +
  geom_text_repel() + theme_bw()

## #############################################################################
## feature selection - same in 2D
## #############################################################################
str(iris)
plot(iris$Sepal.Length, iris$Sepal.Width)
fit <- lm(Sepal.Width ~ Sepal.Length, data = iris)
fit
summary(fit)
plot(iris$Sepal.Length, iris$Sepal.Width)
abline(fit, col = 'red', lwd = 5)

## TODO improve the model
summary(lm(Sepal.Width ~ Sepal.Length + Species,
           data = iris))
library(ggplot2)
ggplot(iris, aes(Sepal.Length, Sepal.Width,
                 col = Species)) +
  geom_point() + geom_smooth(method = 'lm')
ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
  geom_point(aes(col = Species)) +
  geom_smooth(aes(col = Species), method = 'lm') +
  geom_smooth(method = 'lm', col = 'black') + theme_bw()
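## extra sketch (not from the original workshop code): compare the two models
## formally with an F-test via anova(); fit1/fit2 are illustrative names
fit1 <- lm(Sepal.Width ~ Sepal.Length, data = iris)
fit2 <- lm(Sepal.Width ~ Sepal.Length + Species, data = iris)
anova(fit1, fit2)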

## #############################################################################
## PCA demo to show the steps in hierarchical clustering
## #############################################################################
PC <- prcomp(iris[, 1:4])$x
dm <- dist(iris[, 1:4])
hc <- hclust(dm)
plot(hc)
for (i in 2:8) {
  plot(hc)
  rect.hclust(hc, k = i, border = 'red')
  Sys.sleep(1)
}
library(animation)
ani.options(interval = 1)
saveGIF({
  for (i in 2:8) {
    plot(hc)
    rect.hclust(hc, k = i, border = 'red')
  }
})
library(animation)
ani.options(interval = 1)
saveGIF({
  for (i in 1:8) {
    plot(-PC[, 1:2],
         col = cutree(hc, i),
         pch = as.numeric(factor(iris$Species)) + 5)
  }
})
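## extra sketch (not from the original workshop code): how well do 3 clusters
## line up with the actual species?
table(cluster = cutree(hc, 3), species = iris$Species)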

## #############################################################################
## basic decision trees
## #############################################################################
rm(iris)
set.seed(3)
iris$rnd <- runif(150)
iris <- iris[order(iris$rnd), ]
iris
iris$rnd <- NULL
train <- iris[1:100, ]
test <- iris[101:150, ]
library(rpart)
ct <- rpart(Species ~ ., data = train)
summary(ct)
plot(ct); text(ct)
library(partykit)
plot(as.party(ct))
predict(ct, newdata = test)
predict(ct, newdata = test, type = 'class')

## confusion matrix
table(test$Species,
      predict(ct, newdata = test,
              type = 'class'))
ct <- rpart(Species ~ ., data = train,
            minsplit = 3)
## ?rpart.control
plot(ct); text(ct)
table(test$Species,
      predict(ct, newdata = test,
              type = 'class'))
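## extra sketch (not from the original workshop code): test-set accuracy
## computed from the confusion matrix; `cm` is just an illustrative name
cm <- table(test$Species, predict(ct, newdata = test, type = 'class'))
sum(diag(cm)) / sum(cm)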

## #############################################################################
## K-Nearest Neighbors algorithm
## #############################################################################
library(data.table)
setDT(iris)
iris[, rnd := runif(150)]
setorder(iris, rnd)
iris
iris[, rnd := NULL]
train <- iris[1:100]
test <- iris[101:150]
library(class)
knn(train[, 1:4, with = FALSE], test[, 1:4, with = FALSE], train$Species)
res <- knn(train[, 1:4, with = FALSE], test[, 1:4, with = FALSE], train$Species)
table(test$Species, res)
res <- knn(train[, 1:4, with = FALSE], test[, 1:4, with = FALSE], train$Species, k = 2)
table(test$Species, res)
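## extra sketch (not from the original workshop code): try a few values of k
## and compare the resulting test-set accuracy
for (k in 1:10) {
  res <- knn(train[, 1:4, with = FALSE], test[, 1:4, with = FALSE],
             train$Species, k = k)
  print(c(k = k, accuracy = mean(res == test$Species)))
}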

## #############################################################################
## EXERCISE: basic decision trees
## #############################################################################
## => model gender from the height/weight dataset
df <- read.csv('http://bit.ly/BudapestBI-R-csv')
library(rpart)
ct <- rpart(sex ~ heightIn + weightLb, data = df)
plot(ct); text(ct)
ct <- rpart(sex ~ heightIn + weightLb, data = df, minsplit = 1)
table(df$sex, predict(ct, type = 'class'))

## #############################################################################
## did you use test data? beware of overfitting
## #############################################################################
## random order
set.seed(7)
df <- df[order(runif(nrow(df))), ]
## create training and test datasets
train <- df[1:100, ]
test <- df[101:237, ]
## pretty impressive results!
ct <- rpart(sex ~ heightIn + weightLb, data = train, minsplit = 1)
table(train$sex, predict(ct, type = 'class'))
## but oops
table(test$sex, predict(ct, newdata = test, type = 'class'))
## with a more generalized model
ct <- rpart(sex ~ heightIn + weightLb, data = train)
table(train$sex, predict(ct, type = 'class'))
table(test$sex, predict(ct, newdata = test, type = 'class'))
## back to the slides for a high-level overview of bagging, random forests, GBM etc.

## #############################################################################
## H2O
## #############################################################################
## start and connect to H2O
library(h2o)
h2o.init()
## in a browser, check http://localhost:54321

## demo data
library(hflights)
rm(hflights) # drop any local copy so we use the dataset from the package
str(hflights)

## copy to H2O
hflights.hex <- as.h2o(hflights, 'hflights')
str(hflights.hex)
str(head(hflights.hex))
head(hflights.hex, 3)
summary(hflights.hex)

## check flight number: numeric?
## check on the H2O web interface as well (enum vs factor)
## convert numeric to factor/enum
hflights.hex[, 'FlightNum'] <- as.factor(hflights.hex[, 'FlightNum'])
summary(hflights.hex)

## boring to do one by one, so loop over the columns
for (v in c('Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'ArrTime', 'Cancelled')) {
  hflights.hex[, v] <- as.factor(hflights.hex[, v])
}
summary(hflights.hex)

## feature engineering: departure time? is it OK? hour of the day?
## redo everything... just use the R script
library(data.table)
hflights <- data.table(hflights)
hflights[, hour := substr(DepTime, 1, 2)]
hflights[, .N, by = hour]
hflights[, hour := substr(DepTime, 1, nchar(DepTime) - 2)]
hflights[, hour := cut(as.numeric(hour), seq(0, 24, 4))]
hflights[, .N, by = hour]
hflights[is.na(hour)]
hflights[, .N, by = .(is.na(hour), Cancelled == 1)]

## drop columns
hflights <- hflights[, .(Month, DayofMonth, DayOfWeek, Dest, Origin,
                         UniqueCarrier, FlightNum, TailNum, Distance,
                         Cancelled = factor(Cancelled))]

## re-upload to H2O
h2o.ls()
hflights.hex <- as.h2o(as.data.frame(hflights), 'hflights')

## split the data
h2o.splitFrame(data = hflights.hex, ratios = 0.75, destination_frames = paste0('hflights', 0:1))
h2o.ls()

## build the first model
hflights.rf <- h2o.randomForest(
  model_id = 'hflights_rf',
  x = setdiff(names(hflights), 'Cancelled'),
  y = 'Cancelled',
  training_frame = 'hflights0',
  validation_frame = 'hflights1')
hflights.rf

## more trees
hflights.rf <- h2o.randomForest(
  model_id = 'hflights_rf500',
  x = setdiff(names(hflights), 'Cancelled'),
  y = 'Cancelled',
  training_frame = 'hflights0',
  validation_frame = 'hflights1', ntrees = 500)

## switch to the UI and check the ROC curve & AUC
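## extra sketch (not from the original workshop code): the same metric is also
## available from R, read off the model's validation metrics
h2o.auc(hflights.rf, valid = TRUE)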

## GBM
hflights.gbm <- h2o.gbm(
  x = setdiff(names(hflights), 'Cancelled'),
  y = 'Cancelled',
  training_frame = 'hflights0',
  validation_frame = 'hflights1',
  model_id = 'hflights_gbm')

## more trees should help again!
hflights.gbm <- h2o.gbm(
  x = setdiff(names(hflights), 'Cancelled'),
  y = 'Cancelled',
  training_frame = 'hflights0',
  validation_frame = 'hflights1',
  model_id = 'hflights_gbm2', ntrees = 550)

## but no: higher training AUC, yet lower validation AUC => overfitting

## now try a lower learning rate instead
hflights.gbm <- h2o.gbm(
  x = setdiff(names(hflights), 'Cancelled'),
  y = 'Cancelled',
  training_frame = 'hflights0',
  validation_frame = 'hflights1',
  model_id = 'hflights_gbm3', ntrees = 250, learn_rate = 0.01)
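## extra sketch (not from the original workshop code): inspect the validation
## performance and the variable importances from R as well
h2o.performance(hflights.gbm, valid = TRUE)
h2o.varimp(hflights.gbm)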

## bye
h2o.shutdown()

## #############################################################################
## intro to data.table
## #############################################################################
install.packages('data.table')
install.packages('hflights')
library(hflights)
?hflights
str(hflights)
library(data.table)
dt <- data.table(hflights)

## dt[i]
dt[DayOfWeek == 5]
dt[DayOfWeek == 5 & Month == 3]
dt[DayOfWeek == 5 & Month == 3, ActualElapsedTime]

## dt[i, j]
dt[DayOfWeek == 5 & Month == 3, mean(ActualElapsedTime)]
dt[DayOfWeek == 5 & Month == 3,
   mean(ActualElapsedTime, na.rm = TRUE)]
dt[is.na(ActualElapsedTime) & Cancelled == 0 & Diverted == 0]

## the avg flight time to LAX
dt[Dest == 'LAX', mean(AirTime, na.rm = TRUE)]

## avg flight time per dest
## dt[i, j, by = ...]
dta <- dt[, mean(AirTime, na.rm = TRUE), by = Dest]
dta
names(dta)[2] <- 'avgAirTime'

## multiple "j" expressions
dt[, .(
  minAirTime = min(AirTime, na.rm = TRUE),
  avgAirTime = mean(AirTime, na.rm = TRUE),
  maxAirTime = max(AirTime, na.rm = TRUE)),
  by = .(Dest, Origin)]
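## extra sketch (not from the original workshop code): the same pattern for
## several columns at once via .SD and .SDcols
dt[, lapply(.SD, mean, na.rm = TRUE),
   by = Dest, .SDcols = c('AirTime', 'ArrDelay', 'DepDelay')]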

## TODO number of cancelled flights to LAX
dt[Dest == 'LAX', sum(Cancelled)]
nrow(dt[Dest == 'LAX' & Cancelled == 1])
dt[Dest == 'LAX' & Cancelled == 1, .N]
dt[Cancelled == 1, .N, by = Dest]

## TODO sum of miles travelled to LAX
dt[Dest == 'LAX', sum(Distance)]
dt$distanceKm <- dt$Distance * 1.6
dt[, distanceKm := Distance * 1.6]
dt[Dest == 'LAX', sum(Distance) * 1.6]
dt[Dest == 'LAX', sum(distanceKm)]

## TODO percentage of flights cancelled per dest
dt[, sum(Cancelled == 1) / .N, by = Dest]
dt[, sum(Cancelled) / .N, by = Dest]
dt[, mean(Cancelled) * 100, by = Dest]

## TODO plot the departure and arrival delays
library(ggplot2)
ggplot(dt, aes(ArrDelay, DepDelay)) + geom_point()
ggplot(dt, aes(ArrDelay, DepDelay)) + geom_hex()

## TODO plot the avg departure and arrival delay by dest:
## aggregate, then visualize
dta <- dt[, list(
  arrival = mean(ArrDelay, na.rm = TRUE),
  departure = mean(DepDelay, na.rm = TRUE)
), by = Dest]
ggplot(dta, aes(x = departure, y = arrival)) +
  geom_text(aes(label = Dest))

## TODO visualize the avg flight time per dest
dta <- dt[, list(airtime = mean(AirTime, na.rm = TRUE)),
          by = Dest]
ggplot(dta, aes(x = Dest, y = airtime)) +
  geom_bar(stat = 'identity')

## heatmap => day of week + hour => N
install.packages("nycflights13")
library(nycflights13)
str(flights)
dt <- data.table(flights)
dt[, dayOfWeek := weekdays(as.Date(
  paste(year, month, day, sep = '-')
))]
dt[, date := paste(year, month, day, sep = '-')]
dt[, date := as.Date(date)]
dt[, dayOfWeek := weekdays(date)]
dta <- dt[, list(count = .N), by = list(dayOfWeek, hour)]
ggplot(dta, aes(x = hour, y = dayOfWeek, fill = count)) +
  geom_tile()

## long and wide tables
library(reshape2)
?dcast
?melt
dcast(dta, dayOfWeek ~ hour) # long -> wide
dta[dayOfWeek == 'Friday' & hour == 5]
dtw <- dcast(dta, hour ~ dayOfWeek)
melt(dtw, id.vars = 'hour') # wide -> long
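## side note (not from the original workshop code): data.table ships its own
## dcast()/melt() methods, so reshape2 is not strictly needed here
data.table::dcast(dta, dayOfWeek ~ hour, value.var = 'count')
data.table::melt(data.table::dcast(dta, hour ~ dayOfWeek, value.var = 'count'),
                 id.vars = 'hour')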