-
-
Save josephlei/5dabfc7e9b412b5eb76d003ebc1a97cb to your computer and use it in GitHub Desktop.
Adventures in R Training Code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
######################################### | |
# Lesson 1 - R Basics | |
# | |
# Learning Objectives | |
# 1. R Data Types | |
# 2. Indexing | |
# 3. Boolean Logic and Filtering | |
# 4. Importing/Exporting | |
# 5. The R Environment | |
######################################### | |
# The Vector ---------------------------- | |
c(1, 2, 3) # Combines elements into a Vector | |
seq(1, 3) # Sequence function | |
seq(3) # 1 parameter version | |
1:3 # sequence shortcut | |
1-2:3 # Unexpected consequences | |
(1-2):3 # Order of Operations | |
seq(1-2, 3) # May be easier to use function | |
sum(1:10000) # Vectorized functions | |
s # Error "object 's' not found | |
sum(s) # Ditto | |
s <- "sentence" # A string | |
s | |
s <- strsplit(s, "")[[1]] # Split it apart | |
s | |
class(s) # Type of Object | |
mode(s) # Type of Storage Mode | |
length(s) # How big is this Vector | |
sum(s) # Errors when incompatible types (class) | |
########################################## | |
# Knowledge Check 1 | |
# 1. Make a vector. | |
# Yes, that's it. Make a vector. | |
# Any vector. Assign it to a variable.
# Look at its class, mode, and length | |
########################################## | |
# Named Vectors and Indexing --------------------------------------------- | |
c("Al" = 30, "Beth" = 24, "Charlie" = 40) | |
names(s) # No Name Attribute | |
NULL | |
NA | |
c(1, NA, 3) # NA is Missing | |
c(1, NULL, 3) # NULL is VOID | |
1:length(s) # Numeric Integer Sequence | |
1:(length(s) - 2) # Data-driven design | |
LETTERS # Constant - Uppercase Letters | |
letters # Constant - Lowercase Letters | |
TRUE # Constant - Boolean True | |
F # Constant - Abbreviated FALSE | |
pi # Constant - PI | |
month.name # Constant - Month Names | |
month.abb # Constant - Month Abbreviations | |
month.abb[3] # Numeric Point Indexing | |
month.abb[1:3] # Range Index | |
month.abb[c(1, 6, 12)] # Discontinuous Range Index | |
month.abb[-1] # Negative Index | |
names(s) <- month.abb[1:length(s)] | |
s | |
names(s) | |
s["Jan"] # Named Indexing | |
s[c("Jan", "Not Here")] | |
s[100] | |
######################################################## | |
# Knowledge Check 2 | |
# 1. Create a vector representing your name | |
# [1] "b" "r" "y" "a" "n" | |
# 2. Index this vector to return the *last* 3 elements | |
# [1] "y" "a" "n" | |
######################################################## | |
# Dimensional Vectors - The Matrix ---------------------------------------- | |
s <- sample(1:100, size = 8) | |
s | |
dim(s) | |
dim(s) <- c(4, 2) | |
s | |
class(s) | |
mode(s) | |
length(s) | |
nrow(s) | |
ncol(s) | |
names(s) <- c("X", "Y") | |
s # WHAT HAPPENED?! | |
attributes(s) | |
names(s) | |
attr(s, "names") | |
names(s) <- NULL | |
s | |
dimnames(s) | |
colnames(s) <- c("X", "Y") | |
s | |
dimnames(s) | |
rownames(s) <- 1:nrow(s) | |
s | |
dimnames(s) | |
names(dimnames(s)) | |
names(dimnames(s)) <- c("A", "B") | |
dimnames(s) | |
s | |
###################################################### | |
# Knowledge Check 3 (Homework) | |
# 1. Create a random 20-element numeric vector | |
# 2. Coerce it into a 5x4 matrix | |
# 3. Label the rows Y2001 through Y2005 | |
# 4. Label the columns Q1 through Q4 | |
# 5. Name the dimnames Year and Quarter, respectively | |
###################################################### | |
# Dimensional Indexing ---------------------------------------------------- | |
s[1, 1] # Matrix Point index | |
s[, 1] # Column Index - Returns Vector (lower dim) | |
s[, 1, drop = FALSE] # Maintain Object Structure | |
s[2:3, ] # Row Ranged Index | |
s[, "Y"] # Named Column Index | |
s["4", ] # Named Row Index | |
s[4, ,drop = FALSE] # Row Index with Structure | |
s[, -2, drop = FALSE] # Negative Index | |
s[1, 1] <- 999 | |
s | |
s[length(s)] <- 222 | |
s | |
####################################### | |
# Knowledge Check 4 | |
# 1. Use negative indexing to print the | |
# matrix without the first 2 rows. | |
# 2. Replace the bottom row (using nrow) | |
# to assign new values to each column | |
####################################### | |
# Too-Many-Dimensions Vectors - Arrays ----------------------------------- | |
s = 1:30 | |
class(s) | |
mode(s) | |
dim(s) <- c(5, 2, 3) | |
s | |
class(s) | |
mode(s) | |
length(s) | |
s[20] <- 100 # Vector Point Index Assignment | |
s | |
s[5, 2, 2] # Array Point Index | |
s[5, 2, 1:2] # Mixed Indexing | |
s[5, 2, 1:2, drop = FALSE] # Keep Structure | |
nrow(s) # Same as dim(s)[1] | |
ncol(s) # Same as dim(s)[2] | |
dim(s)[3] # No more helpers | |
attr(s, "dim")[3] # For the hardcore programmer | |
dimnames(s) | |
dimnames(s) <- list( | |
"Rows" = 1:5, | |
"Fields" = sample(LETTERS, 2), | |
"Group" = c("Ones", "Tens", "Twenties")) | |
s | |
s[, , "Ones"] # Named Index | |
s[2:3, 2, c(1, 3)] # Mixed Index | |
class(s[2:3, 2, c(1, 3)]) | |
s[2:3, 2, c(1, 3), drop = FALSE] | |
############################################## | |
# Knowledge Check 5 (Homework)
# 1. Read the ?matrix help documentation | |
# (Recommend also ?vector and ?array) | |
# 2. Explore creating matrices from a vector | |
# setting the byrow parameter both to | |
# TRUE and then to FALSE. | |
# 3. Execute x <- rnorm(20)^2 * 100 to | |
# represent a random time series data set | |
# of quarterly product earnings over 2001 | |
# through 2005. Use matrix(x, ...) to | |
# create a 5x4 matrix representing the | |
# years per row and quarter per column | |
# 4. Manipulate the dimnames attribute | |
# appropriately to give context | |
############################################## | |
# Lists and Data Frames ------------------------------------ | |
d = dimnames(s) | |
d | |
class(d) | |
mode(d) | |
length(d) | |
d[1] # List Point Index (Returns List) | |
d["Rows"] # Named List Point Index | |
d$Rows # Named Accessor (Access List Data) | |
d[[1]] # Index Accessor | |
d[c(1, 3)] # Returns List | |
d[[c(1, 3)]] # DON'T DO THIS (unless you know what you're doing) | |
x <- list("A" = 1:6, "B" = rnorm(6), "C" = gl(2, 3)) | |
y <- data.frame(A = 1:6, B = rnorm(6), C = gl(2, 3)) | |
x | |
y | |
x$C | |
y$C | |
class(x) # List | |
class(y) # Data Frame | |
mode(x) # List | |
mode(y) # List!! | |
as.data.frame(x) # DF = "Named List with Equal Length Elements" | |
print.data.frame # Class Dispatching (method).(class) | |
attributes(x) # Simple Object | |
attributes(y) # Complex Object - No Dimensions! | |
dim(y) # But it has dimensions | |
dim.data.frame # Special dim function for data frames | |
.row_names_info # Hidden function | |
.row_names_info(y, 2L) # data frame row count | |
nrow(y) # Uses dim function; gets dispatched! | |
length(y) | |
dim(x) <- dim(y) # Can you? ... | |
x[1] # List Index | |
y[1] # Column Index | |
x[[1]] # List Accessor | |
y[[1]] # Column Accessor | |
y[, 1] # Dimensional Index IS Accessor | |
x[, 1] # Nonsense! | |
y[1:3, 1] | |
y[1:3, c("A", "B")] | |
y[1:3, 1, drop = FALSE] | |
y[, 1][1:3] | |
############################################ | |
# Knowledge Check 6 | |
# Make a Data Frame. Make a List. Go crazy. | |
# Any Questions? | |
############################################ | |
# Conditional Indexing and Filtering (Subsetting) ------------------------- | |
x <- ChickWeight | |
head(ChickWeight) | |
!TRUE | |
TRUE & F | |
any(c(T, T, F)) | |
all(c(T, T, F)) | |
subset(x, Chick == 1) | |
x$Chick == 1 | |
which(x$Chick == 1) | |
x[x$Chick == 1, 'weight', drop = FALSE] | |
subset(x, Chick == 1, select = weight) | |
subset(x, Chick == 1, select = weight, drop = TRUE) | |
# Import and Exporting Data ----------------------------------------------- | |
library(help = "datasets") | |
ls() | |
rm(list = ls()) | |
ls() | |
data(mtcars) # Bring Package data sets to environment | |
ls() | |
class(mtcars) | |
dimnames(mtcars) | |
str(mtcars) # Structure of object | |
mtcars | |
write.table(mtcars, file = "mtcars.tsv", | |
sep = "\t", row.names = TRUE) | |
list.files() | |
getwd() | |
(infile <- file.choose()) | |
x <- read.delim(infile, header = TRUE) | |
str(x) | |
head(x) | |
tail(x) | |
x <- read.delim(infile, row.names = NULL) | |
x | |
idx <- grep("merc", x$row.names, ignore.case = TRUE) | |
grepl("Merc", x$row.names) | |
x[idx, ] | |
(idx <- grep("Merc", x$row.names, value = TRUE)) | |
x$row.names %in% idx # This in That | |
subset(x, !row.names %in% idx) # Everything BUT those ... | |
############################################################ | |
# Knowledge Check 7 (Homework) | |
# 1. Import spreadsheet table using read_excel (readxl) | |
# 2. Import spreadsheet table using read.delim("clipboard") | |
# 3. (Advanced) Import/Export using xlsx package | |
# - Requires some setup. See | |
# http://www.r-statistics.com/2012/08/how-to-load-the-rjava-package-after-the-error-java_home-cannot-be-determined-from-the-registry/ | |
############################################################ | |
# The R Environment ------------------------------------------------------- | |
ls() # The Workspace (Environment) | |
search() # The R "Path" (How Expressions are resolved) | |
help(package = "utils") # Package documentation | |
help("read.table") # Function documentation | |
?plot # Generic Function | |
??plot # Search documentation | |
library(splines) # Load Another Package | |
search() # Changed Search Path | |
detach(package:splines) # Why would they do this to us?! | |
search() | |
# Consider install.packages("pacman")
# pacman::p_load(MASS, splines, dplyr)
# pacman::p_unload(MASS, splines, dplyr)
# Open another R Session | |
install.packages(c("dplyr", "ggplot2", "reshape2")) | |
# RECAP ------------------------------------------------------------------- | |
# Objectives | |
# 1. R Data Types | |
# 2. Indexing | |
# 3. Boolean Logic and Filtering | |
# 4. Importing/Exporting | |
# 5. The R Environment | |
# | |
# Functions Used | |
# | |
# Constructors: c, list, data.frame | |
# Coercion: as.data.frame | |
# sequences: seq, : | |
# vectorized: sum | |
# Assignment: <-, = | |
# Object: class, mode, length, str, attributes, attr | |
# Dimensions: dim, nrow, ncol | |
# Names: names, dimnames, rownames, colnames | |
# Random: sample | |
# Logical: any, all, &, |, %in% | |
# Filtering: subset, which, | |
# Package: library, install.packages | |
# Environment: ls, rm, data, getwd, search, detach | |
# File: file.choose, read.delim, write.table, list.files | |
# Summary: head, tail | |
# Patterns: grep, grepl | |
# Help: help, ?, ?? |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
########################### | |
# Lesson Three - Data Viz | |
# | |
# 1. R (Studio) Projects | |
# 2. Base Graphics | |
# 3. Grammar of Graphics (ggplot) | |
########################### | |
# RStudio Projects -------------------------------------------------------- | |
# Demo Only | |
# Base Graphics ----------------------------------------------------------- | |
# A must read: http://www.statmethods.net/advgraphs/parameters.html | |
x <- airquality | |
hist(x$Ozone, main = "Ozone Distribution", xlab = "Ozone") | |
boxplot(x$Solar.R, ylab = "Solar Radiation", sub = "Subtitle") | |
summary(x$Solar.R) | |
boxplot(x[1:4]) | |
title("Air Quality Boxplot") | |
plot(rnorm(100), pch = "+", col = "steelblue") | |
abline(h = 0, col = 'indianred') | |
points( | |
jitter(sample(20:60, 100, TRUE)), | |
jitter(sample(-2:2, 100, TRUE)), | |
pch = 20) | |
plot(Ozone ~ jitter(Temp), x, col = "gray40") | |
lm1 <- lm(Ozone ~ Temp, x) # Linear Regression Model | |
lo1 <- loess(Ozone ~ Temp, x, span = 0.5) # Loess Model | |
abline(lm1, col = 'steelblue', lwd=2, lty=2) | |
s <- do.call(seq, as.list(range(x$Temp))) | |
print(s) | |
predict(lm1, data.frame(Temp = s)) | |
predict(lo1, s) | |
lines(s, predict(lo1, s), col = 'indianred', lwd=2) | |
hist(x$Ozone, freq = FALSE, main = "Ozone Density", xlab = "") | |
lines(density(na.omit(x$Ozone)), col = 'indianred', lwd=2) | |
# Base Graphics Hard ------------------------------------------------------ | |
# `economics` is a data set shipped with ggplot2, which this script does not
# attach until the "Grammar of Graphics" section, so reference it through the
# package namespace instead of relying on the search path.
x <- ggplot2::economics

# Two stand-alone, full-size time-series plots.
plot(unemploy ~ date, x, type = "l", las = 1,
     xlab = "Time", ylab = "Count", main = "Unemployment")
plot(psavert ~ date, x, type = "l", las = 1,
     xlab = "Time", ylab = "Rate (%)", main = "Personal Savings")

# Switch to a 1 x 2 layout with a wider left margin. par() returns the
# previous values of the settings it changes, so keep them for restoring.
old_par <- par(mfrow = c(1, 2), mar = c(5, 5, 4, 1) + 0.1)

# Left panel: suppress the default y axis (yaxt = "n") so it can be redrawn
# with comma-separated tick labels. mgp moves the axis titles out to line 4,
# which is why xlab is blanked here and "Time" is added with mtext() below.
plot(unemploy ~ date, x, type = "l", las = 1,
     ylab = "Count", main = "Unemployment",
     yaxt = "n", xlab = "", mgp = c(4, 1, 0))
axis(side = 2, at = axTicks(2), las = 1,
     labels = format(axTicks(2), big.mark = ","))
mtext("Time", side = 1, line = 3)

# Right panel: default axes are fine for a percentage scale.
plot(psavert ~ date, x, type = "l", las = 1,
     xlab = "Time", ylab = "Rate (%)", main = "Personal Savings")

# Put the graphics parameters back so later base plots are not drawn
# in the two-panel layout.
par(old_par)
# Grammar of Graphics ----------------------------------------------------- | |
library(ggplot2) | |
library(reshape2) | |
ggplot(x) + aes(date, unemploy) + geom_line() + theme_bw() | |
p <- ggplot(x) + aes(date) + theme_bw() | |
p + geom_line(aes(y=unemploy)) + ylab("Count") | |
p + geom_line(aes(y=psavert)) + ylab("Rate (%)") | |
x <- melt(x, id.vars = "date") | |
head(x) | |
head(dcast(x, date ~ variable, value.var = "value")) | |
ggplot(x) + aes(date, value) + geom_line() + | |
facet_wrap(~ variable) + theme_bw() | |
ggplot(subset(x, variable %in% c("psavert", "unemploy"))) + | |
aes(date, value) + geom_line() + theme_bw() + | |
facet_wrap(~ variable, scales = "free_y") | |
# See Also | |
# dplyr, tidyr | |
# RECAP | |
# | |
# Objectives | |
# 1. R (Studio) Projects | |
# 2. Base Graphics | |
# 3. Grammar of Graphics (ggplot) | |
# | |
# Functions used | |
# | |
# plot, hist, boxplot | |
# points, lines, abline | |
# par, title, mtext, axis, axTicks | |
# lm, loess, density | |
# ggplot, aes, geom_line, theme_bw, facet_wrap, ylab | |
# melt, dcast |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##################################### | |
# Lesson 2 - Data Wrangling | |
# | |
# Learning Objectives | |
# 1. Iterations | |
# 2. User-Defined Functions (UDFs) | |
# 3. Data Profiling | |
# 4. Multiple Assignments | |
# 5. Missing Values | |
##################################### | |
# Iterative Processing ---------------------------------------------------- | |
# Draw ten values from 1..10 with replacement; the outer parentheses
# make the assignment print its result.
(v <- sample(1:10, size = 10, replace = TRUE))

# Pre-allocate the result, then fill it one element at a time:
# even values are halved, odd values are squared.
x <- numeric(length(v))
for (n in seq_along(v)) {
  x[n] <- if (v[n] %% 2 == 0) v[n] / 2 else v[n]^2
}
# Raise every element of `x` to the second power.
square <- function(x) {
  x^2
}

# Divide every element of `x` by two.
halve <- function(x) {
  x / 2
}
ifelse(v %% 2 == 0, halve(v), square(v)) # Vectorized | |
x | |
# Build a numeric vector of `n` standard-normal draws, filling a
# pre-allocated vector one element per loop iteration.
#
# n: number of draws to generate (default 10).
# Returns a numeric vector of length `n`.
f <- function(n = 10) {
  draws <- numeric(n)
  for (k in seq_along(draws)) {
    draws[k] <- rnorm(1)
  }
  draws
}
f() | |
rnorm(10) | |
################################################# | |
# Knowledge Check 1 | |
# 1. Define 2 functions that manipulate integers | |
# 2. Use ifelse logic to apply 1 function to | |
# each even *position* in the vector x and | |
# use the other function to each odd position | |
################################################# | |
# Data Profiling ---------------------------------------------------------- | |
x <- mtcars | |
y <- CO2 | |
head(x) | |
str(x) | |
head(y) | |
str(y) | |
summary(x) | |
summary(y) | |
table(x$cyl) | |
table(y$Plant) | |
table(x$cyl) / nrow(x) | |
prop.table(table(x$cyl)) | |
# More vectorization | |
apply(x, 2, summary) | |
apply(x, 1, mean) | |
is.na(x) | |
any(is.na(x)) | |
lapply(airquality, function(x) any(is.na(x))) | |
# TRUE when at least one element of `x` is missing, FALSE otherwise.
any_na <- function(x) {
  is_missing <- is.na(x)
  any(is_missing)
}
sapply(airquality, any_na) | |
x <- scale(airquality) | |
head(x) | |
str(x) | |
x <- airquality | |
x[] <- scale(airquality) | |
head(x) | |
str(x) | |
x <- mtcars | |
x[] <- lapply(x, function(x) x - mean(x, na.rm = TRUE)) | |
head(x) | |
head(scale(mtcars, TRUE, FALSE)) | |
################################################ | |
# Knowledge Check 2 (Homework) | |
# 1. Create a function to count distinct values of a
# vector and apply it to all columns it is relevant to
# 2. Explore tapply and by for doing group-wise *apply | |
# operations | |
# 3. (Challenge) Create a summary function and apply it | |
# to a column or columns by some group. Feel free to | |
# use what you do in 1 and 2 directly | |
################################################ | |
# Factors and Relabeling -------------------------------------------------- | |
# For more, see the recode or Recode (car package) | |
# More user-friendly string functions see stringr package | |
paste("Q", 1:5, sep = ".") | |
x <- data.frame( | |
Question = rep(paste0("Q", 1:20), each = 10), | |
Response = sample(1:7, 20*10, replace = TRUE) | |
) | |
head(x) | |
x$Subject <- rep(1:10, length.out = 20*10) | |
head(x) | |
with(x, table(Subject, Question)) | |
str(x) | |
x <- transform(x, Response_f = factor(Response)) | |
str(x) | |
levels(x$Response_f) | |
levels(x$Response_f) <- c(rep("Low", 3), "Neutral", rep("High", 3)) | |
levels(x$Response_f) | |
with(x, table(Response, Response_f)) | |
x$Response <- as.character(x$Response_f) | |
table(x$Response) | |
x <- mtcars | |
x$row.names <- rownames(x) | |
x[grepl("Merc", x$row.names), 'cyl'] <- 99 | |
x[grep("\\d", x$row.names), 'row.names'] <- 'NUMBER' | |
# Also grep("[0-9]", ...) works, too | |
View(x) | |
# Missing Values ---------------------------------------------------------- | |
x <- trees | |
N <- nrow(x) | |
x_full <- x | |
x$Girth[sample(1:N, 6)] <- NA | |
x$Height[sample(1:N, 6)] <- NA | |
x$Volume[sample(1:N, 6)] <- NA | |
x | |
# Solution 1 - Averaging | |
means <- lapply(x, mean, na.rm= TRUE) # colMeans | |
x$Girth[is.na(x$Girth)] | |
x$Girth[is.na(x$Girth)] <- means$Girth | |
print(x) | |
cat("Error: ", sum((x$Girth - x_full$Girth)^2)) | |
############################################ | |
# Knowledge Check 3 (Challenging) | |
# 1. Define a function that | |
# a. Computes the avg of a vector | |
# b. Indexes the missing values of a vector | |
# c. Replaces the missing values with the avg | |
# 2. *Apply* your function to each column of x | |
# 3. (Extra) Compute the error for each column and overall | |
############################################ | |
# Solution 2 - Imputation
# Start again from the complete data, blank out six random Girth values,
# then predict them from Height and Volume with a linear model.
x <- x_full
x$Girth[sample(seq_len(N), 6)] <- NA

# Rows with a missing Girth are excluded from the fit by lm()'s default
# NA handling (na.action = na.omit unless options() was changed).
fit <- lm(Girth ~ Height + Volume, x)
summary(fit)
coef(fit)

# Select rows and columns by name rather than by position so the code does
# not depend on Girth being the first column, and avoid calling the helper
# variable `missing`, which would shadow base::missing().
girth_is_na <- is.na(x$Girth)
predictors <- x[girth_is_na, c("Height", "Volume")]

# Predict once, show the values, then write them into the gaps.
imputed <- predict(fit, predictors)
imputed
x$Girth[girth_is_na] <- imputed

# Sum of squared differences between imputed and true Girth values.
cat("Error: ", sum((x$Girth - x_full$Girth)^2))
########################################## | |
# Knowledge Check 4 (Homework) | |
# Create a function that takes in a data frame | |
# and a formula, imputes the missing values | |
# of the dependent (LHS) variable using the | |
# indicated predictors (RHS) variables | |
########################################## | |
# RECAP | |
# Objectives | |
# Learning Objectives | |
# 1. Iterations | |
# 2. User-Defined Functions (UDFs) | |
# 3. Data Profiling | |
# 4. Multiple Assignments | |
# 5. Missing Values | |
# | |
# Functions Used | |
# | |
# Construction: vector, rep, seq_along | |
# Mathematical: %%, /, ^ | |
# Control Flow: function, for, if else | |
# Vectorized: summary, scale, paste, is.na , ifelse | |
# Tabulation: table, prop.table | |
# Iterators: apply, lapply, sapply | |
# Manipulation: transform, as.character, with | |
# Factors: factor, levels | |
# Models: lm, summary.lm, coef, predict | |
# Other: cat, rnorm |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment