Created
June 9, 2017 14:36
-
-
Save dantalus/3aa72448c0678be9adb277330ccc1e94 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Apply functions ####
# There are cases where you want to "do something" to each element in a given
# data structure. For example, we might want to calculate the
# mean for each variable (column) in a dataframe.
# Looping is a common way to do this.
# Data
# library(tidyverse)
data <- iris
View(data)

# Print the mean of each of the first 4 columns of data
for (i in seq_len(4)) {
  column <- data[[i]]
  print(mean(column, na.rm = TRUE))
}
# Or something slightly more complicated:
# create a new dataframe made up of the standardized values (z-scores) for the
# first 4 columns of data.
results <- vector("list", 4) # Preallocate one slot per column
for (i in seq_len(4)) {
  col_mean <- mean(data[[i]], na.rm = TRUE)
  # Named col_sd rather than sd so the sd() function is not shadowed.
  col_sd <- sd(data[[i]], na.rm = TRUE)
  results[[i]] <- (data[[i]] - col_mean) / col_sd # Put results in the list
}
# Carry the column names over. An unnamed list passed through do.call() would
# otherwise get column names deparsed from the data values themselves.
names(results) <- names(data)[seq_len(4)]
results <- do.call(cbind.data.frame, results) # Convert the list to dataframe
library(dplyr)
library(ggplot2)
library(tidyr)
# Plot the original data: reshape the first 4 columns to long form (one row
# per variable/value pair) and draw one density panel per variable.
ggplot(
  gather(data[, 1:4], var, value),
  aes(x = value, color = var, fill = var)
) +
  geom_density() +
  facet_wrap(~var)
# Plot the standardized values: same long-form reshape and per-variable
# density panels, applied to the results dataframe.
ggplot(
  gather(results, var, value),
  aes(x = value, color = var, fill = var)
) +
  geom_density() +
  facet_wrap(~var)
# The argument against loops - Just google "Why shouldn't I use for loops r"
# for a deluge of reasons. I use for loops all the time, and you probably
# will/should too, but the basic arguments against them are speed and clarity of
# code.
# One of the strengths of R is vectorization.
# For example, if I want to divide each value of a numeric vector by 2, I don't
# need a for loop that goes through each element of the vector, doing the
# calculation as I go.
# The explicit loop, shown only for contrast with the vectorized form below.
# The output vector is preallocated to its final length so it is not grown
# one element at a time inside the loop.
v <- numeric(length(data$Sepal.Length))
for (i in seq_along(data$Sepal.Length)) {
  v[i] <- data$Sepal.Length[i] / 2
}
v

# I just do this:
data$Sepal.Length / 2
# Apply functions are basically tools that take advantage of this vectorization
# to (sometimes) produce faster calculations.
# They are also usually more concise to write. As the name suggests, they apply
# a function to each element of an object.
# The trick to apply functions is to know which type of object goes in and what
# comes out.
help("apply")

# A small 3 x 3 matrix to experiment with
m <- matrix(1:9, nrow = 3, ncol = 3)
m
apply(m, MARGIN = 2, FUN = mean) # MARGIN = 2: one mean per column
apply(m, MARGIN = 1, FUN = mean) # MARGIN = 1: one mean per row
apply(data[1:4], MARGIN = 2, FUN = mean) # Ok this works
# The result here doesn't make sense. What happened?
apply(data, MARGIN = 2, FUN = class)
as.matrix(data)
# Apply works on arrays/matrices, which must all contain the same kind of data.
# The dataframe however includes a mixture of numeric (double) columns and a
# non-numeric column (the Species factor).
# So apply converted your dataframe into a character matrix, so the class for
# each column is also character.
# For dataframes, which are a special kind of list, we use lapply
# ("l"apply = list apply).
# lapply: one result per column, returned as a list
x <- lapply(X = data, FUN = class)
x
class(x)
table(x)
table(unlist(x))
# The output of lapply is also a list, so using a function like table on it
# won't give good results.
# So instead you can use sapply. It's just like lapply, but it "S"implifies the
# output.
x <- sapply(X = data, FUN = class)
x
class(x)
table(x)
# I often use sapply to identify columns in a dataframe based on some
# characteristics of the data.
# One way to subset a dataframe by columns is to index with a logical vector,
# where you keep the columns i that correspond to i = TRUE in a logical vector.
# For example, if I want the first 4 columns, but not the fifth, I could do this:
# Keep columns 1-4 and drop column 5 with a hand-written logical vector
View(data[c(rep(TRUE, 4), FALSE)])
# That's obviously very tedious. This is better:
numeric_cols <- sapply(data, is.numeric)
numeric_cols
View(data[numeric_cols])
# You can also set up conditional statements resulting in a logical vector
# like this:
col_classes <- sapply(data, class)
data[col_classes == "numeric"]
# Exercise: Take the following dataframe, and concisely convert the c(1, 2)
# variables to a factor with the labels 1 = "No" and 2 = "Yes".
# It has 1000 variables, and those with data = c(1, 2) are randomly scattered
# throughout.
# Generate the data: 1000 columns of 50 values each, where every column is
# randomly one of four kinds (normal draws, 1/2 codes, letters, logicals).
fake <- vector("list", 1000)
for (i in seq_along(fake)) {
  flip <- sample(1:4, 1)
  # switch() on a number evaluates only the alternative at that position
  fake[[i]] <- switch(flip,
    rnorm(50),
    sample(c(1, 2), 50, replace = TRUE),
    sample(letters, 50, replace = TRUE),
    sample(c(TRUE, FALSE), 50, replace = TRUE)
  )
}
data <- do.call(cbind.data.frame, fake)
names(data) <- paste0("V", seq_along(data))
# Here are the columns with min = 1, max = 2 and number of responses = 2
lowest <- sapply(data, function(col) min(as.numeric(col)))
highest <- sapply(data, function(col) max(as.numeric(col)))
n_values <- sapply(data, function(col) length(table(as.numeric(col))))
View(data[lowest == 1 & highest == 2 & n_values == 2])
# How did this work? Break it down: print each per-column summary, then the
# logical vector you get by comparing it with the target value.
col_min <- sapply(data, function(col) min(as.numeric(col)))
col_min
col_min == 1

col_max <- sapply(data, function(col) max(as.numeric(col)))
col_max
col_max == 2

col_n_distinct <- sapply(data, function(col) length(table(as.numeric(col))))
col_n_distinct
col_n_distinct == 2
# Now replace those data with the factors

# TRUE when a column is numeric and its non-missing values are exactly 1 and 2.
# Testing is.numeric() first means non-numeric columns are never pushed
# through as.numeric(), which warns about NAs introduced by coercion for text
# and returns the integer codes for factors.
is_one_two <- function(x) {
  is.numeric(x) && setequal(x[!is.na(x)], c(1, 2))
}

# Create the logical vector (vapply guarantees one TRUE/FALSE per column)
these <- vapply(data, is_one_two, logical(1))

# Use lapply to turn those into factors
data[these] <- lapply(
  data[these],
  factor,
  levels = c(1, 2),
  labels = c("No", "Yes")
)
View(data)
y <- rnorm(100, mean = 3, sd = 4)

# Standardize a numeric vector: subtract its mean and divide by its standard
# deviation. Missing values are ignored when computing both statistics.
#   x   - the vector to standardize
#   ... - accepted but not used
# Returns a vector the same length as x.
normalize <- function(x, ...) {
  centered <- x - mean(x, na.rm = TRUE)
  centered / sd(x, na.rm = TRUE)
}

normalize(y)
# Regular expressions ####
x <- sample(c(" test ", "test"), size = 1000, replace = TRUE)
table(x)
# Strip leading/trailing whitespace
x <- gsub(pattern = "^\\s+|\\s+$", replacement = "", x = x)

x <- sample(c("??test", "!>test", "test&*"), size = 1000, replace = TRUE)
x <- gsub(pattern = "[[:punct:]]", replacement = "", x = x) # drop punctuation
x <- gsub(pattern = " ", replacement = ".", x = x) # spaces -> dots
x <- gsub(pattern = "^[[:digit:]]", replacement = "", x = x) # drop a leading digit
x <- gsub(pattern = " ", replacement = ".", x = x) # spaces -> dots
x <- gsub(pattern = "\\/", replacement = ".", x = x) # slashes -> dots
x <- gsub(pattern = "\\,", replacement = "", x = x) # drop commas
x <- gsub(pattern = "\\?", replacement = "", x = x) # drop question marks
# Leading/trailing whitespace again; the result is printed, not stored
as.character(gsub(pattern = "^\\s+|\\s+$", replacement = "", x = x))
# Make the values syntactically valid, unique names
x <- make.names(x, unique = TRUE)
x <- c("9as", "0bn")
# Move digit from front to end: group 1 captures the leading digit, group 2
# the rest, and the replacement writes them back in swapped order.
sub(pattern = "^([0-9])(.+)", replacement = "\\2\\1", x = x)
x <- c("bob45", "bob56", "sarah67")
# Overwrite every element that contains "bob"
x <- replace(x, grepl("bob", x), "John")
x
# Capture the first run of digits (\\d+, any length) as group 1 and put a .
# in front of what was captured.
x <- sub(pattern = "(\\d+)", replacement = "\\.\\1", x = x)
# Remove . at end of string
x <- gsub(pattern = "\\.$", replacement = "", x = x)
# NOTE(review): these snippets assume data has a character column x, which
# nothing above creates - confirm before running.

# Keep only the digits from a string: every run of digits, as numbers
digit_hits <- gregexpr("[[:digit:]]+", data$x)
as.numeric(unlist(regmatches(data$x, digit_hits)))

# Overwrite the values in the column that contain "x" with "X"
data$x[grepl("x", data$x)] <- "X"

# Drop everything after the first digit (the pattern is not anchored, so any
# text before that digit is kept)
data$x <- sub("([0-9]{1}).*", "\\1", data$x)

# Keep a match
# Keep everything after a given character
# NOTE(review): [^XXX] is a character class, so it behaves like [^X]; XXX looks
# like a placeholder for a single character - confirm.
tail_hit <- regexpr("([^XXX]*$)", data$x)
data$x <- regmatches(data$x, tail_hit)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment