Last active
August 29, 2015 14:23
-
-
Save jaehyeon-kim/fcd5287e3ea3f5638e02 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Subsetting applications
## lookup tables (character subsetting)
# A named character vector works as a lookup table: indexing it with a
# vector of keys returns the matching value for every key.
x <- c("m", "f", "u", "f", "f", "m", "m")
lookup <- c(m = "Male", f = "Female", u = NA)
lookup[x]
# unname() drops the keys that are carried along as element names
unname(lookup[x])
## matching and merging by hand (integer subsetting)
grades <- c(1, 2, 2, 3, 1)
info <- data.frame(
  grade = 3:1,
  desc = c("Excellent", "Good", "Poor"),
  fail = c(FALSE, FALSE, TRUE)
)
# using match: id[i] is the row of info whose grade equals grades[i]
id <- match(grades, info$grade)
info[id, ]
# using rownames: make the key the row name, then index rows by character
rownames(info) <- info$grade
info[as.character(grades), ]
# using merge (all.x = TRUE keeps every row of the left-hand table)
merge(data.frame(grade = grades), info, by = "grade", all.x = TRUE)
# by multiple columns: first collapse the key columns into a single key,
# e.g. with interaction() for factors or paste() for other types
## random samples/bootstrap (integer subsetting)
df <- data.frame(x = rep(1:3, each = 2), y = 6:1, z = letters[1:6])
set.seed(10)
# randomly reorder all rows
df[sample(nrow(df)), ]
# select 3 random rows (without replacement)
df[sample(nrow(df), 3), ]
# select 6 bootstrap replicates (rows drawn with replacement)
df[sample(nrow(df), 6, replace = TRUE), ]
## ordering (integer subsetting)
# order() returns the permutation of indices that sorts its argument
x <- c("b", "c", "a")
order(x)
x[order(x)]
set.seed(10)
# shuffle the rows and reverse the column order (z, y, x)
df2 <- df[sample(nrow(df)), 3:1]
# order the rows by the values in column x
df2[order(df2$x), ]
# order the columns by name; the list-style and the matrix-style
# subsetting forms select the same columns here
df2[order(names(df2))]
df2[, order(names(df2))]
## expanding aggregated counts (integer subsetting)
# n holds how many times each row should appear in the expanded data
df <- data.frame(x = c(2, 4, 1), y = c(9, 11, 6), n = c(3, 5, 1))
# repeat row index i exactly n[i] times (seq_len() is safe for zero rows)
rep(seq_len(nrow(df)), df$n)
df[rep(seq_len(nrow(df)), df$n), ]
## removing columns from data frames (character subsetting)
df <- data.frame(x = 1:3, y = 3:1, z = letters[1:3])
# assigning NULL to a column removes it; $ and [[ behave the same way
df$z <- NULL
# put the column back so the next variant can be shown
df[["z"]] <- letters[1:3]
df$z <- NULL
df["z"] <- list(letters[1:3])
df[["z"]] <- NULL
df[["z"]] <- letters[1:3]
# alternatively keep only the wanted columns; df itself is not modified
df[setdiff(names(df), "z")]
## selecting rows based on a condition (logical subsetting)
mtcars[mtcars$gear == 5, ]
# subset() evaluates the condition inside the data frame
subset(mtcars, gear == 5)
# don't use the short-circuit operators (&& or ||) here: they are scalar,
# while a row filter needs the element-wise & and |
subset(mtcars, gear == 5 & cyl == 4)
## boolean algebra vs sets (logical & integer subsetting)
# integer subsetting (via which()) can be more effective when
# 1. you want to find the first (or last) TRUE
# 2. there are very few TRUEs: the integer form is faster and needs
#    less storage
set.seed(10)
x <- sample(10) < 4
# boolean to integer
which(x)
# integer back to boolean: the inverse of which()
#   x  integer positions that should be TRUE
#   n  length of the logical vector to build
# Returns a logical vector of length n that is TRUE at the positions in x.
unwhich <- function(x, n) {
  # an index beyond n would silently grow the result past length n
  if (any(x > n, na.rm = TRUE)) {
    stop("all indices in 'x' must be <= n", call. = FALSE)
  }
  out <- rep_len(FALSE, n)
  out[x] <- TRUE
  out
}
unwhich(which(x), 10)
# The same selections written as logical vectors (x1, y1) and as integer
# positions (x2, y2); each logical operation is followed by its set
# counterpart.
x1 <- 1:10 %% 2 == 0
x2 <- which(x1)
y1 <- 1:10 %% 5 == 0
y2 <- which(y1)
# and <-> intersection
x1 & y1
intersect(x2, y2)
# or <-> union
x1 | y1
union(x2, y2)
# and-not <-> set difference
x1 & !y1
setdiff(x2, y2)
# exclusive or <-> union minus intersection
xor(x1, y1)
setdiff(union(x2, y2), intersect(x2, y2))
# How would you randomly permute the columns of a data frame?
# (This is an important technique in random forests.)
# Can you simultaneously permute the rows and columns in one step?
set.seed(10)
var0 <- sample(1:10, 10)
var1 <- sample(10:1, 10)
var2 <- sample(20:11, 10)
var3 <- sample(30:21, 10)
var4 <- sample(40:31, 10)
df <- data.frame(
  var0 = var0, var1 = var1, var2 = var2, var3 = var3, var4 = var4
)
# permute the columns only: sample() without a size returns a permutation
df[, sample(ncol(df))]
# permute the rows and the columns in one step
df[sample(nrow(df)), sample(ncol(df))]
# random subset instead of a permutation: half the rows, three columns
df[sample(seq_len(nrow(df)), as.integer(nrow(df) / 2)), sample(names(df), 3)]
# How would you select a random sample of m rows from a data frame?
# What if the sample had to be contiguous
# (i.e., with an initial row, a final row, and every row in between)?
m <- 5
df[sample(nrow(df), m), ]
# contiguous: draw a random first row that still leaves room for m rows,
# then take that row and the m - 1 rows that follow it
start <- sample.int(nrow(df) - m + 1, 1)
rows <- seq(start, length.out = m)
df[rows, ]
# How could you put the columns in a data frame in alphabetical order?
df <- as.data.frame(outer(1:2, 1:10))
# order() on the column names gives the column permutation; the names are
# compared as strings, not by their numeric suffix
df[order(names(df))]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment