Created
April 11, 2019 11:40
-
-
Save daroczig/303e78265beca1c60697506541cf1111 to your computer and use it in GitHub Desktop.
Visual Methods to Teach Multivariate Statistics with R (at) Challenges and Innovations in Statistics Education (Szeged, Hungary)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## intro slides: http://bit.ly/r-intro-slide | |
## basic operations | |
1 + 3 | |
3*2 | |
3^2 | |
## constants | |
pi | |
"pi" | |
'pi' | |
letters | |
LETTERS | |
month.name | |
month.abb | |
## help | |
?pi | |
apropos('month') | |
apropos('cor') | |
## variables | |
x = 4 | |
x <- 4 | |
x | |
x / 2 | |
x^2 | |
x^0.5 | |
## functions | |
sqrt(x) | |
sqrt(x = x) | |
sin(pi / 2) | |
round(pi) | |
ceiling(pi) | |
floor(pi) | |
## custom functions | |
f <- function(x) 2 * x + 1 | |
f(5) | |
f(pi) | |
## vectors | |
c(1, 2, 3, 4, 5) | |
1:5 | |
seq(1, 5) | |
?seq | |
seq(1, 5, 0.1) | |
x <- seq(1, 5, 0.1) | |
plot(x, f(x)) | |
plot(x, f(x), type = 'l') | |
plot(x, f(x), type = 'l', xlab = '', main = '2x+1') | |
grid() | |
## TODO plot 1 period of sine curve | |
?sin | |
f <- function(x) sin(x) | |
{ | |
x <- seq(0, pi * 2, 0.1) | |
plot(x, sin(x), type = 'l') | |
} | |
## a simpler way for plotting functions | |
curve(f) | |
?curve | |
curve(2*x + 1, from = 0, to = 50) | |
curve(x + 1, add = TRUE, col = 'red') | |
## TODO plot 1 period of sine curve | |
{ | |
curve(sin, from = 0, to = 2*pi) | |
} | |
## TODO Brownian motion in 1D | |
runif(1) | |
runif(5) | |
{ | |
## random numbers from uniform distribution | |
round(runif(5)) | |
round(runif(5))*2 - 1 | |
cumsum(round(runif(25))*2 - 1) | |
plot(cumsum(round(runif(25))*2 - 1), type = 's') | |
set.seed(42) | |
plot(cumsum(round(runif(25))*2 - 1), type = 's') | |
for (i in 2:6) { | |
lines(cumsum(round(runif(25))*2 - 1), type = 's', col = i) | |
} | |
## pro tipp #1 | |
plot(c(1, 25), c(-25, 25), type = 'n') | |
plot(NULL, NULL, xlim = c(1, 25), ylim = c(-25, 25)) | |
## pro tipp #2 | |
sample(c(-1, 1), 25, replace = TRUE) | |
} | |
################################################################################ | |
## custom vectors -> combine values into vector | |
h <- c(174, 170, 160) | |
h | |
(w <- c(90, 80, 70)) | |
plot(h, w) | |
plot(h, w, main = "Demo", xlab = 'Height', ylab = 'Weight') | |
cor(w, h) | |
lm(w ~ h) # formula notation | |
fit <- lm(w ~ h) | |
summary(fit) | |
plot(h, w, main = "Demo", xlab = 'Height', ylab = 'Weight') | |
abline(fit, col = 'red') | |
## some basic modeling | |
predict(fit, list(h = c(56, 104))) | |
56 * fit$coefficients[2] + fit$coefficients[1] | |
56 * 1.3462 + -146.1538 | |
## why? | |
{ | |
plot(h, w, main = "Demo", xlab = 'Height', ylab = 'Weight', | |
xlim = c(0, max(h)), | |
ylim = c(0, max(w))) | |
abline(fit, col = 'red') | |
} | |
## polynomial model | |
fit <- lm(w ~ poly(h, 2, raw = TRUE)) | |
fit <- lm(w ~ h + I(h^2)) | |
fit | |
2824 + 174 * -34.3571 + (174^2)*0.1071 | |
plot(h, w) | |
lines(h, predict(fit), col = 'red') | |
predict(fit, data.frame(h = c(56, 104))) | |
plot(0:200, predict(fit, list(h = 0:200)), type = 'l') | |
abline(v = c(160, 174), col = 'red') | |
## data.frame | |
df <- data.frame(weight = w, height = h) | |
df | |
str(df) | |
df$weight | |
?'$' | |
plot(df) | |
cor(df) | |
lm(weight ~ height, data = df) | |
str(df) | |
df[i, j] | |
df[1, ] | |
df[, 1] | |
df[3, 2] | |
df$weight | |
df$weight[2] | |
## Body Mass Index (BMI) is a person's weight in kilograms divided by the square of height in meters. | |
df$height <- df$height/100 | |
df | |
df$bmi <- df$weight / df$height^2 | |
df | |
df$bmi <- df$weight / (df$height/100)^2 | |
## import data | |
df <- read.csv('http://bit.ly/heightweight') | |
str(df) | |
## TODO | |
df$height <- df$heightIn * 2.54 | |
df$weight <- df$weightLb * 0.45 | |
str(df) | |
df$heightIn <- df$weightLb <- NULL | |
str(df) | |
## descriptive stats | |
min(df$ageYear) | |
max(df$ageYear) | |
range(df$ageYear) | |
sum(df$weightLb) | |
length(df$weight) | |
nrow(df) | |
ncol(df) | |
dim(df) | |
## TODO plot and analyze association between height and weight | |
{ | |
plot(df$height, df$weight) | |
abline(lm(weight ~ height, data = df), col = 'red') | |
} | |
## TODO compute and plot BMI index | |
{ | |
df$bmi <- df$weight / (df$height/100)^2 | |
hist(df$bmi) | |
} | |
## exploratory data analysis | |
pairs(df) | |
library(GGally) | |
ggpairs(df) | |
library(pairsD3) | |
pairsD3(df) | |
## statistical test: t-test, ANOVA | |
t.test(height ~ sex, data = df) | |
aov(height ~ sex, data = df) | |
summary(aov(height ~ sex, data = df)) | |
summary(aov(weight ~ sex, data = df)) | |
## Post hoc tests => Tukey Honest Significant Differences | |
TukeyHSD(aov(height ~ sex, data = df)) | |
TukeyHSD(aov(weight ~ sex, data = df)) | |
## ############################################################################# | |
## confounding variables | |
## ############################################################################# | |
library(foreign) | |
download.file('http://bit.ly/math_and_shoes', 'math.csv', method = 'curl', extra = '-L') | |
df <- read.csv('math.csv') | |
df$id <- NULL | |
plot(df$size, df$math) | |
summary(lm(math ~ size, df)) | |
plot(df$x, df$math) | |
plot(df$x, df$size) | |
## 3D scatterplot with plane | |
library(scatterplot3d) | |
p <- scatterplot3d(df[, c('size', 'x', 'math')]) | |
fit <- lm(math ~ size + x, df) | |
p$plane3d(fit) | |
## interactive 3D scatterplot with plane | |
library(rgl) | |
plot3d(df$x, df$size, df$math, col = 'red', size = 3) | |
planes3d(5.561, 4.563, -1, -178.496, col = 'orange') | |
cor(df) | |
residuals(lm(math ~ x, df)) | |
residuals(lm(size ~ x, df)) | |
cor(residuals(lm(math ~ x, df)), residuals(lm(size ~ x, df))) | |
library(psych) | |
partial.r(df, 1:2, 3) | |
## ############################################################################# | |
## feature selection - same in 2D | |
## ############################################################################# | |
str(iris) | |
plot(iris$Sepal.Length, iris$Sepal.Width) | |
fit <- lm(Sepal.Width ~ Sepal.Length, data = iris) | |
fit | |
summary(fit) | |
plot(iris$Sepal.Length, iris$Sepal.Width) | |
abline(fit, col = 'red', lwd = 5) | |
## TODO improve the model | |
{ | |
summary(lm(Sepal.Width ~ Sepal.Length + Species, | |
data = iris)) | |
library(ggplot2) | |
ggplot(iris, aes(Sepal.Length, Sepal.Width, | |
col = Species)) + | |
geom_point() + geom_smooth(method = 'lm') | |
ggplot(iris, aes(Sepal.Length, Sepal.Width)) + | |
geom_point(aes(col = Species)) + | |
geom_smooth(aes(col = Species), method = 'lm') + | |
geom_smooth(method = 'lm', col = 'black') + theme_bw() | |
} | |
## ############################################################################# | |
## Wilkinson: The Grammar of Graphics | |
## Wickham: ggplot2 -> layer by layer: coordinate system, aesthetics (labels), scales, geometries, shapes, statistics, grid, color theme etc. + layers | |
## http://www.cookbook-r.com/Graphs/ | |
## ############################################################################# | |
library(ggplot2) | |
?diamonds | |
{ | |
## barplot | |
ggplot(diamonds, aes(x = cut)) + geom_bar() | |
ggplot(diamonds, aes(x = cut)) + geom_bar() + theme_bw() | |
ggplot(diamonds, aes(x = cut)) + geom_bar(colour = "darkgreen", fill = "white") + theme_bw() | |
p <- ggplot(diamonds, aes(x = cut)) + | |
geom_bar(colour = "darkgreen", fill = "white") + theme_bw() | |
p | |
## coord transformations | |
library(scales) | |
p + scale_y_log10() | |
p + scale_y_sqrt() | |
p + scale_y_reverse() | |
p + coord_flip() | |
## facet | |
p + facet_wrap( ~ clarity) | |
p + facet_grid(color ~ clarity) | |
## stacked, clustered bar charts | |
p <- ggplot(diamonds, aes(x = color, fill = cut)) | |
p + geom_bar() | |
p + geom_bar(position = "fill") | |
p + geom_bar(position = "dodge") | |
## histogram | |
ggplot(diamonds, aes(x = price)) + geom_bar(binwidth = 100) | |
ggplot(diamonds, aes(x = price)) + geom_bar(binwidth = 1000) | |
ggplot(diamonds, aes(x = price)) + geom_bar(binwidth = 10000) | |
ggplot(diamonds, aes(price, fill = cut)) + geom_density(alpha = 0.2) + theme_bw() | |
## scatterplot | |
p <- ggplot(diamonds, aes(x = carat, y = price)) + geom_point() | |
## adding layers | |
p | |
p <- p + geom_smooth() | |
p | |
p + geom_smooth(method = "lm", se = FALSE, color = 'red') | |
ggplot(diamonds, aes(x = carat, y = price, colour = cut)) + geom_point() + geom_smooth() | |
## themes | |
library(ggthemes) | |
p + theme_economist() + scale_colour_economist() | |
p + theme_stata() + scale_colour_stata() | |
p + theme_excel() + scale_colour_excel() | |
p + theme_wsj() + scale_colour_wsj("colors6", "") | |
p + theme_gdocs() + scale_colour_gdocs() | |
## Further reading at http://www.cookbook-r.com/Graphs/ | |
## TODO on mtcars | |
## * number of carburators | |
ggplot(mtcars, aes(x = carb)) + geom_bar() | |
## * hp | |
ggplot(mtcars, aes(x = hp)) + geom_histogram() | |
## * barplot of carburators per am | |
ggplot(mtcars, aes(x = carb)) + geom_bar() + facet_grid(. ~ am) | |
## stacked bar chart | |
ggplot(mtcars, aes(x = carb, fill = factor(am))) + geom_bar() | |
## grouped/dodged | |
ggplot(mtcars, aes(x = carb, fill = factor(am))) + geom_bar(position = 'dodge') | |
## * boxplot of hp by carb | |
ggplot(mtcars, aes(x = factor(am), y = hp)) + geom_boxplot() | |
## * hp and wt by carb | |
ggplot(mtcars, aes(x = hp, y = wt)) + geom_point() + facet_grid(. ~ carb) | |
## * hp and wt by carb regression | |
ggplot(mtcars, aes(x = hp, y = wt)) + geom_point() + facet_grid(. ~ carb) + geom_smooth() | |
} | |
## ############################################################################# | |
## PCA demo on image processing | |
## ############################################################################# | |
setwd('/home/daroczig/projects/teachR/bce/') | |
## http://www.nasa.gov/images/content/694811main_pia16225-43_full.jpg | |
## http://bit.ly/BudapestBI-R-img | |
## download.file('http://bit.ly/BudapestBI-R-img', 'image.jpg', method = 'curl', extra = '-L') | |
library(jpeg) | |
img <- readJPEG('image.jpg') | |
str(img) | |
dim(img) | |
h <- dim(img)[1] | |
w <- dim(img)[2] | |
h;w | |
img1d <- matrix(img, h * w) | |
str(img1d) | |
pca <- prcomp(img1d) | |
pca | |
summary(pca) | |
image(matrix(pca$x[, 1], h)) | |
image(matrix(pca$x[, 2], h)) | |
image(matrix(pca$x[, 2], h), col = gray.colors(100)) | |
image(matrix(pca$x[, 3], h)) | |
## ############################################################################# | |
## MDS | |
## ############################################################################# | |
library(readxl) | |
cities <- read_excel('/home/daroczig/projects/teachR/bce/psoft-telepules-matrix-30000.xls') | |
str(cities) | |
mds <- cmdscale(as.dist(cities[-nrow(cities), -1])) | |
mds | |
plot(mds) | |
text(mds[, 1], mds[, 2], tail(names(cities), -1)) | |
mds <- -mds | |
plot(mds) | |
text(mds[, 1], mds[, 2], tail(names(cities), -1)) | |
## non-geo stuff | |
str(mtcars) | |
?mtcars | |
mds <- cmdscale(dist(mtcars)) | |
plot(mds) | |
text(mds[, 1], mds[, 2], rownames(mtcars)) | |
mds <- as.data.frame(mds) | |
library(ggplot2) | |
ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) + | |
geom_text() + theme_bw() | |
library(ggrepel) | |
ggplot(mds, aes(V1, -V2, label = rownames(mtcars))) + | |
geom_text_repel() + theme_bw() | |
## ############################################################################# | |
## PCA demo to show the steps in hierarchical clustering | |
## ############################################################################# | |
PC <- prcomp(iris[, 1:4])$x | |
dm <- dist(iris[, 1:4]) | |
hc <- hclust(dm) | |
plot(hc) | |
for (i in 2:8) { | |
plot(hc) | |
rect.hclust(hc, k = i, border = 'red') | |
Sys.sleep(1) | |
} | |
library(animation) | |
ani.options(interval = 1) | |
saveGIF({ | |
for (i in 2:8) { | |
plot(hc) | |
rect.hclust(hc, k = i, border = 'red') | |
} | |
}) | |
library(animation) | |
ani.options(interval = 1) | |
saveGIF({ | |
for (i in 1:8) { | |
plot(-PC[, 1:2], | |
col = cutree(hc, i), | |
pch = as.numeric(factor(iris$Species)) + 5) | |
} | |
}) | |
## ############################################################################# | |
## basic decision trees | |
## ############################################################################# | |
rm(iris) | |
set.seed(3) | |
iris$rnd <- runif(150) | |
iris <- iris[order(iris$rnd),] | |
iris | |
iris$rnd <- NULL | |
train <- iris[1:100, ] | |
test <- iris[101:150, ] | |
library(rpart) | |
ct <- rpart(Species ~ ., data = train) | |
summary(ct) | |
plot(ct); text(ct) | |
library(partykit) | |
plot(as.party(ct)) | |
predict(ct, newdata = test) | |
predict(ct, newdata = test, type = 'class') | |
## confusion matrix | |
table(test$Species, | |
predict(ct, newdata = test, | |
type = 'class')) | |
ct <- rpart(Species ~ ., data = train, | |
minsplit = 3) | |
## ?rpart.control | |
plot(ct); text(ct) | |
table(test$Species, | |
predict(ct, newdata = test, | |
type = 'class')) | |
## ############################################################################# | |
## K-Nearest Neighbors algorithm | |
## ############################################################################# | |
setDT(iris) | |
iris[, rnd := runif(150)] | |
setorder(iris, rnd) | |
iris | |
iris[, rnd := NULL] | |
train <- iris[1:100] | |
test <- iris[101:150] | |
library(class) | |
knn(train[, 1:4, with = FALSE], test[, 1:4, with = FALSE], train$Species) | |
res <- knn(train[, 1:4, with = FALSE], test[, 1:4, with = FALSE], train$Species) | |
table(test$Species, res) | |
res <- knn(train[, 1:4, with = FALSE], test[, 1:4, with = FALSE], train$Species, k = 2) | |
table(test$Species, res) | |
## ############################################################################# | |
## EXERCISE: basic decision trees | |
## ############################################################################# | |
## => model gender from the height/weight dataset | |
df <- read.csv('http://bit.ly/BudapestBI-R-csv') | |
library(rpart) | |
ct <- rpart(sex ~ heightIn + weightLb, data = df) | |
plot(ct); text(ct) | |
ct <- rpart(sex ~ heightIn + weightLb, data = df, minsplit = 1) | |
table(df$sex, predict(ct, type = 'class')) | |
## ############################################################################# | |
## did you use test data? beware of overfitting | |
## ############################################################################# | |
## random order | |
set.seed(7) | |
df <- df[order(runif(nrow(df))), ] | |
## create training and test datasets | |
train <- df[1:100, ] | |
test <- df[101:237, ] | |
## pretty impressing results! | |
ct <- rpart(sex ~ heightIn + weightLb, data = train, minsplit = 1) | |
table(train$sex, predict(ct, type = 'class')) | |
## but oops | |
table(test$sex, predict(ct, newdata = test, type = 'class')) | |
## with a more generalized model | |
ct <- rpart(sex ~ heightIn + weightLb, data = train) | |
table(train$sex, predict(ct, type = 'class')) | |
table(test$sex, predict(ct, newdata = test, type = 'class')) | |
## back to slides on high level overview on bagging, random forest, gbm etc | |
## ############################################################################# | |
## H2O => install H2O and while we wait: http://bit.ly/CEU-R-boosting | |
## ############################################################################# | |
## first real analysis -> write a script | |
## start and connect to H2O | |
library(h2o) | |
h2o.init() | |
## http://localhost:54321 | |
## demo data | |
library(hflights) | |
rm(hflights) | |
str(hflights) | |
## copy to H2O | |
hflights.hex <- as.h2o(hflights, 'hflights') | |
str(hflights.hex) | |
str(hhead(hflights.hex) | |
head(hflights.hex, 3) | |
summary(hflights.hex) | |
## flight number???? | |
## go to H2O web interface | |
## convert numeric to factor/enum | |
hflights.hex[, 'FlightNum'] <- as.factor(hflights.hex[, 'FlightNum']) | |
summary(hflights.hex) | |
## boring | |
## LEAVE OUT CANCELLED FOR EXERCISE | |
for (v in c('Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'ArrTime', 'Cancelled')) { | |
hflights.hex[, v] <- as.factor(hflights.hex[, v]) | |
} | |
summary(hflights.hex) | |
## feature engineering: departure time? is it OK? hour of the day? | |
## redo everything... just use the R script | |
library(data.table) | |
hflights <- data.table(hflights) | |
hflights[, hour := substr(DepTime, 1, 2)] | |
hflights[, .N, by = hour] | |
hflights[, hour := substr(DepTime, 1, nchar(DepTime) - 2)] | |
hflights[, hour := cut(as.numeric(hour), seq(0, 24, 4))] | |
hflights[, .N, by = hour] | |
hflights[is.na(hour)] | |
hflights[, .N, by = .(is.na(hour), Cancelled == 1)] | |
## drop columns | |
hflights <- hflights[, .(Month, DayofMonth, DayOfWeek, Dest, Origin, | |
UniqueCarrier, FlightNum, TailNum, Distance, | |
Cancelled = factor(Cancelled))] | |
## re-upload to H2O | |
h2o.ls() | |
hflights.hex <- as.h2o(as.data.frame(hflights), 'hflights') | |
## split the data | |
h2o.splitFrame(data = hflights.hex , ratios = 0.75, destination_frames = paste0('hflights', 0:1)) | |
h2o.ls() | |
## build the first model | |
hflights.rf <- h2o.randomForest( | |
model_id = 'hflights_rf', | |
x = setdiff(names(hflights), 'Cancelled'), | |
y = 'Cancelled', | |
training_frame = 'hflights0', | |
validation_frame = 'hflights1') | |
hflights.rf | |
## more trees | |
hflights.rf <- h2o.randomForest( | |
model_id = 'hflights_rf500', | |
x = setdiff(names(hflights), 'Cancelled'), | |
y = 'Cancelled', | |
training_frame = 'hflights0', | |
validation_frame = 'hflights1', ntrees = 500) | |
## change to UI and check ROC curve & AUC | |
## GBM | |
hflights.gbm <- h2o.gbm( | |
x = setdiff(names(hflights), 'Cancelled'), | |
y = 'Cancelled', | |
training_frame = 'hflights0', | |
validation_frame = 'hflights1', | |
model_id = 'hflights_gbm') | |
## more trees should help, again !!! | |
hflights.gbm <- h2o.gbm( | |
x = setdiff(names(hflights), 'Cancelled'), | |
y = 'Cancelled', | |
training_frame = 'hflights0', | |
validation_frame = 'hflights1', | |
model_id = 'hflights_gbm2', ntrees = 550) | |
## but no: although higher training AUC, lower validation AUC => overfit | |
## more trees should help, again !!! | |
hflights.gbm <- h2o.gbm( | |
x = setdiff(names(hflights), 'Cancelled'), | |
y = 'Cancelled', | |
training_frame = 'hflights_part0', | |
validation_frame = 'hflights_part1', | |
model_id = 'hflights_gbm2', ntrees = 250, learn_rate = 0.01) | |
## bye | |
h2o.shutdown() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment