Created
June 8, 2017 09:34
-
-
Save dantalus/cfbfeb35e4c25b292b41ec401e063937 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Setup ####

# Install the tidyverse only when it is missing, so that re-running the script
# does not reinstall the packages (or need network access) every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}

# Attach the tidyverse: provides %>%, tibble(), the dplyr verbs and ggplot2,
# all of which are used further down.
library(tidyverse)
# Objects ####

# Most of R, from an applied point of view anyway, is the process of creating
# objects and feeding them into functions to make amazing, new objects.

# amazing_new_object <- f(object)

x <- c(3, 4, 5)
y <- mean(x)

# This is true in the big picture sense as well

# information
# dataframe <- f(information)
# plot      <- f(dataframe)
# model     <- f(dataframe)
# table     <- f(model)
# report    <- f(plot, table)

# But before we get to that point...

# The first objects we typically work with can be described as data structures,
# and these can hold different types of data:

typeof(1)     # double
typeof("Yes") # character
typeof(FALSE) # logical

# Missing values are represented by NA
c(1, 2, NA, 4)

# Types of data structures:

# Vectors - a one dimensional set of values that must all be of the same type.
c(1, 2, 3, 3.277465) # All numbers
c("Yes", "No", "1")  # All characters
c(FALSE, TRUE)       # All logical
# T and F are ordinary variables that can be reassigned, unlike TRUE and FALSE.
c(F, T)              # All logical (Warning - spell them out)

# Matrices/Arrays - two or more dimensions
m <- matrix(1:9, 3, 3)
dim(m)
m

# Vectors, matrices, and arrays can only contain 1 kind of data. This is
# important to understand.

# What happens when we create a vector with multiple types of data?
v <- c(1, TRUE, "Yes")
typeof(v)
v
# You can see that the 1 and TRUE were converted to "1" and "TRUE"

# Lists are a special type of vector that allow us to combine different types of
# data.
v <- list(1, TRUE, "Yes")
typeof(v[[1]]) # double
typeof(v[[2]]) # logical
typeof(v[[3]]) # character
typeof(v)      # list

# typeof() reports the storage type, class() the (S3) class used for dispatch.
class(v)           # list
class(1)           # numeric
typeof(c(1, 2.2))  # double
class(c(1, 2.2))   # numeric
typeof(c(1, 2))    # double
class(c(1, 2))     # numeric
typeof(1:3)        # integer
typeof(c(1:3))     # integer
typeof(c(1, 2, 3)) # double

# Dataframes are in turn special types of lists that correspond to the concept
# of a dataset (a rectangular matrix of values, with observations in rows and
# variables in columns, perhaps with some labels or other metadata attached)
data <- data.frame(
  number = rnorm(50),
  char   = sample(letters, 50, replace = TRUE),
  logic  = sample(c(TRUE, FALSE), 50, replace = TRUE)
)

# View() opens a data viewer window, so only call it in an interactive session.
if (interactive()) View(data)

save(data, v, m, x, y, file = "data.RData") # Save an object or set of objects
# Demonstration only: this clears every object in the current environment
# (attached packages stay attached).
rm(list = ls())    # Remove all the objects in the environment
load("data.RData") # Bring those objects back

# There are many functions to help us better understand objects
mode(data)
typeof(data)
class(data)
class(data) <- c(class(data), "bob") # We can assign classes
str(data) # Structure

reg <- lm(number ~ char, data = data) # A nonsense regression model
reg
summary(reg)
str(reg)
typeof(reg)
class(reg)

if (interactive()) {
  View(data)
  utils::View(data) # When you want an un-constricted view
}

names(data)
attributes(data)
dim(data)
length(data) # For a data frame this is the number of columns
nrow(data)
ncol(data)

is.character("x")
is.numeric(1)
x <- factor(c(1, 2))
is.factor(x)
is.logical(FALSE)
is.na(c(1, 2, NA, 4))

# ! will reverse logical values
!is.na(c(1, 2, NA, 4))
# Subsetting ####

# Part of working with R is being able to take apart objects and rearrange the
# parts.

# Indexing

# One-dimension
vec <- sample(c(0:9), 100, replace = TRUE)
vec[2]

# Two dimensions
mat <- matrix(c(1, 2, 3, 3, 2, 1), ncol = 2)
mat[1, 2] # Row 1, column 2 of the matrix we just created

# Lists
x <- list(a = c(1, 2), b = c(4, 4), c = c(6, 8), d = c(9, 11))

x[[1]]    # The first element itself (a numeric vector)
x[[1]][1] # The first value of that element
x[1]      # A list of length one that contains the first element

# These give different results
class(x[[1]])
str(x[[1]])

class(x[1])
str(x[1])
attributes(x[1])

# $ for named elements in a list
x$a
class(x$a)
x$a[1]

# Selecting multiple elements
x <- letters
x[c(1, 2, 6)]

# Logical subsetting: keep the values for which the condition is TRUE
x <- sample(c(0:9), 100, replace = TRUE)
x[x < 5]
x < 5

d <- tibble(
  number    = sample(0:9, 100, replace = TRUE),
  character = rep(c("a", "b"), 50)
)

lapply(d, class)

# Two ways of keeping only the numeric columns. vapply() is the type-safe
# version of sapply(): it always returns a logical vector here.
d[unlist(lapply(d, is.numeric))] %>% head()
d[vapply(d, is.numeric, logical(1))] %>% head()

# You can name elements in data structures besides lists.
x <- c(1, 4, 6, 9)
str(x)

x <- c(a = 1, b = 4, c = 6, d = 9)
str(x)
names(x)
attributes(x)

attr(x, "description") <- "This is a named vector"
attributes(x)

# But $ only works with lists
# This is a deliberate error ("$ operator is invalid for atomic vectors");
# try() reports it without stopping the rest of the script.
try(x$a)

x <- list(a = 1, b = 4, c = 6, d = 9)
x$a
# Making and combining objects ####

?c
?matrix
?array
?list

seq_along(c(1:20))
seq_along(c(100:120))

seq(from = 0, to = 100, by = 10)
seq(0, 100, 10)

rep(c(1, 2), times = 100) # 1 2 1 2 ...
rep(c(1, 2), each = 100)  # 1 1 1 ... 2 2 2 ...

# Combining different data types can be tricky

# Different data types will typically reduce to the type with the lowest level
# of information
x <- c(1, "character")
x
class(x)

x <- c(1, TRUE, "character")
x
class(x)

x <- c(1, TRUE, FALSE)
x
class(x)

# No problem with a list
x <- list(1, TRUE, "character")
x
class(x)

# Vectors can be combined to make matrices, but be careful

# R will extend a shorter vector to match a longer one, thus creating data
# you might not expect.
?rbind

length(rbind(sample(0:9, 100, replace = TRUE)))
length(c("a", "b"))

m <- rbind(sample(0:9, 100, replace = TRUE),
           c("a", "b"))
class(m)
# View() opens a data viewer window, so only call it in an interactive session.
if (interactive()) View(m)

# Dataframes will prevent you from doing this
m <- rbind(sample(0:9, 100, replace = TRUE),
           c("a", "b")) %>%
  as.data.frame() # Not this way

# This is a deliberate error: tibble() only recycles length-one vectors.
# try() reports it without stopping the rest of the script.
try(
  m <- tibble(sample(0:9, 100, replace = TRUE),
              c("a", "b")) # Error, which is correct
)

# We can also combine by columns
?cbind

m <- cbind(sample(0:9, 100, replace = TRUE),
           sample(letters, 100, replace = TRUE))
class(m)
if (interactive()) View(m)

# There are other functions to help switch between information types
as.character(1)
as.numeric("1")
as.numeric("dog") # NA, with a warning
as.factor(1)

# We can put strings together with paste.
paste(letters, "hello", sep = "_")
paste(letters, letters, sep = "_")
paste(letters, c("yes", "no"), sep = "_") # The shorter vector is recycled
paste0(letters, "hello")

# We can sample and simulate data
sample(letters, size = 100, replace = TRUE)
sample(letters, size = 10, replace = FALSE)

rnorm(10, mean = 0, sd = 1) %>% qplot()
rnorm(10000, 0, 1) %>% qplot()

d <- tibble(
  A = sample(c(0:9), size = 100, replace = TRUE),
  B = sample(c(0:9), size = 100, replace = TRUE),
  C = sample(c(0:9), size = 100, replace = TRUE),
  D = sample(c(0:9), size = 100, replace = TRUE)
)

# Two equivalent ways of getting row totals and row means
d$total  <- d$A + d$B + d$C + d$D
d$total2 <- rowSums(d[c(1:4)])

d$mean  <- d$total / 4
d$mean2 <- rowMeans(d[c(1:4)])

ggplot(d, aes(x = mean, y = mean2)) + geom_point()

# Values that have already appeared earlier in the vector
x <- c(c(1:10), rep(c(1, 2, 3), each = 2))
x[duplicated(x)]

# The distinct values (the same as x[!duplicated(x)]). Note that unique()
# returns values, not positions, so it should not be used as an index.
x <- c(c(1:10), rep(c(1, 2, 3), each = 2))
unique(x)

# How many values are repeats
length(x) - length(unique(x))
# Factors ####

# Factors are a special kind of numeric variable with labels attached to each
# value, signifying categorical (nominal, ordered) data.

f <- sample(c("Yes", "No", "Maybe"), size = 100, replace = TRUE,
            prob = c(0.3, 0.6, 0.1))

f.1 <- factor(f)

# The "levels" are the labels
levels(f.1)
table(f.1)

# Confirm the structure
str(f.1)

# The underlying numbers:
table(as.numeric(f.1))

# The order of the levels matters. By default, they will be in alphabetical
# order
sample(letters[c(1, 5, 8)], size = 100, replace = TRUE) %>%
  factor() %>%
  levels()

sample(letters[c(5, 8, 1)], size = 100, replace = TRUE) %>%
  factor() %>%
  class()

sample(letters[c(5, 8, 1)], size = 100, replace = TRUE) %>%
  factor(levels = c("e", "h", "a"))

sample(letters[c(5, 8, 1)], size = 100, replace = TRUE) %>%
  factor(levels = c("e", "h", "a"), ordered = TRUE)

# Numeric values are sorted numerically when they become levels...
sample(c(1, 2, 10, 20, 100), size = 100, replace = TRUE) %>%
  factor()

# ...but the same values stored as text are sorted as text.
sample(as.character(c(1, 2, 10, 20, 100)), size = 100, replace = TRUE) %>%
  factor()

# Reordering levels
table(f.1)
# Assigning to levels() renames the levels in place; it does not reorder them,
# so the counts end up attached to the wrong labels.
levels(f.1) <- c("Yes", "No", "Maybe")
table(f.1) # Bad!

levels(f.1) <- rev(levels(f.1)) # Switch it back
table(f.1)

# Do it with factor()
f.1 <- factor(f, levels = rev(levels(f.1)))
table(f.1) # Correct

f.1 <- relevel(f.1, ref = "Maybe")
table(f.1)
table(as.numeric(f.1)) # Convert to the underlying number

# Do it manually
f.1 <- factor(f, levels = c("Maybe", "Yes", "No"))
table(f.1) # Correct

# You need to use the existing levels
f.1 <- factor(f, levels = c("A", "B", "C")) # Bad: every value becomes NA

f.1 <- factor(f)
f.2 <- factor(f, labels = c("A", "B", "C")) # Use the labels option
table(f.1, f.2)
levels(f.2) # The labels become the levels forevermore

# Numbers as factors
f <- sample(c(10, 20, 50, 60, 65, 90), size = 100, replace = TRUE)

f.1 <- factor(f)
levels(f.1)
str(f.1)

f.1 %>% as.numeric() %>% table()         # No  (gives the level codes 1, 2, ...)
as.numeric(levels(f.1)[f.1]) %>% table() # Yes (gives the original values)

f.1 <- cut(f, 4) # Equally spaced levels
table(f.1)
str(f.1)

f.1 <- cut(f, 4, labels = c("Low", "Med", "High", "Very High"))
table(f.1)
levels(f.1)
as.character(f.1)

# ~ equally sized levels
# include.lowest = TRUE keeps the minimum value; without it the smallest
# observations fall outside the first (open on the left) interval and become NA.
f.1 <- cut(f, breaks = quantile(f, 0:4 / 4), include.lowest = TRUE)
table(f.1)
levels(f.1)

# User defined cuts
bmi <- rnorm(100, 26, 4)
qplot(bmi)

# Inf as the last break puts everything above 30 in "OB" regardless of the
# largest simulated value.
bmi <- cut(bmi, c(0, 18.5, 25, 30, Inf),
           labels = c("UW", "NW", "OW", "OB"))
table(bmi)

# Reordering levels based on other values
data <- tibble(
  number = rnorm(100, 0, 1),
  factor = factor(sample(letters[1:5], 100, replace = TRUE))
)

levels(data$factor)

# Attach each level's mean of `number` to every row of that level
data <- group_by(data, factor) %>%
  summarise(mean = mean(number)) %>%
  full_join(data, by = "factor")

table(data$factor, data$mean)

ggplot(data, aes(x = factor, fill = mean)) +
  geom_bar()

# Order the levels by their mean
data$factor <- reorder(data$factor, data$mean)

levels(data$factor)

ggplot(data, aes(x = factor, fill = mean)) +
  geom_bar()

# Attach each level's row count, then order the levels by that count
data <- group_by(data, factor) %>%
  summarise(count = n()) %>%
  full_join(data, by = "factor")

data$factor <- reorder(data$factor, data$count)

ggplot(data, aes(x = factor, fill = mean)) +
  geom_bar()
# Describing data ####

# Simulate 100 BMI-like values (mean 26, standard deviation 4)
bmi <- rnorm(100, 26, 4)

mean(bmi)
min(bmi)
max(bmi)

quantile(bmi, 0.50, na.rm = TRUE) # The median
quantile(bmi, seq(0.05, 0.95, by = 0.05), na.rm = TRUE)

# Set the values between 30 and 32 to missing
bmi[bmi > 30 & bmi < 32 & !is.na(bmi)] <- NA

!is.na(bmi)

# Count the missing values
sum(is.na(bmi))

mean(bmi)               # NA as soon as any value is missing
mean(bmi, na.rm = TRUE) # Drop the missing values first
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment