Last active
April 3, 2017 12:52
-
-
Save leeper/c669fd4a6dcb9ac23612 to your computer and use it in GitHub Desktop.
Some notes on recoding
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # recoding is basically a function | |
| # x is a variable | |
| # f() is a function | |
| # f(x) is a recoding operation | |
| # | |
| # implementation | |
| # if x is discrete, then f() is simple to implement as a map between input values and output values: | |
| # 1 = 0 | |
| # 2 = 1 | |
| # | |
| # if x is continuous and f() is a simple transformation, it can also be simple, e.g.: | |
| # f(x) = x^2 | |
| # f(x) = (x - 1)/2 | |
| # | |
| # a major complication, however, is that R vectors have missing data, requiring all functions to either: | |
| # a) have a discrete argument to specify the translation of NAs | |
| # b) strictly pass missing values | |
| # | |
| # | |
| # we have lots of specialized language for specific coding operations | |
| # - top-coding: a function that plateaus above a given point in its domain | |
| # - bottom-coding: a function that plateaus below a given point in its domain | |
| # - reverse coding: a function replaces each value in a vector with the corresponding value in the reversed vector | |
| # - binning: a step function (that converts a continuous vector into a categorical vector | |
| # - relabeling: a function that takes discrete categories as input and returns other discrete categories | |
| # - centering: aligning a vector so that its central tendency is (typically) 0 | |
| # - scaling: a function of the form f(x) = a*x | |
| # - imputation: a function f(x) = x, except for the handling of missing values according to a rule | |
| # | |
| # | |
| # useful distinction? | |
| # - dimension reducing recode (`unbranch()`): collapse or recode multiple variables into one variable | |
| # - dimension expanding recode (`branch()`): expand one or more variables into more variables | |
| # | |
| # | |
| # An example: reverse coding in R | |
| x <- 1:5 | |
| ifelse(x == 1, 5, ifelse(x == 2, 4, ifelse(x == 3, 3, ifelse(x == 4, 2, ifelse(x == 5, 1, NA))))) | |
| -x + (max(x)+min(x)) | |
| x2 <- x | |
| x2[x == 1] <- 5 | |
| x2[x == 2] <- 4 | |
| x2[x == 3] <- 3 | |
| x2[x == 4] <- 2 | |
| x2[x == 5] <- 1 | |
| s <- scale(x, center = TRUE, scale = FALSE) | |
| -s[,1] + attributes(s)[["scaled:center"]] | |
| car::recode(x, "1=5;2=4;3=3;4=2;5=1") | |
| # better ways to do this? | |
| # the question from an R context is how to express a recoding function compactly | |
| # | |
| # car::recode() is quite compact, but it is optimized for discrete values not continuous variables | |
| # it also has terrible memory usage: | |
| x <- 1:5 | |
| tracemem(x) | |
| car::recode(x, "1=5;2=4;3=3;4=2;5=1") | |
| ## tracemem[0x000000000a3cf640 -> 0x000000000a3cb500]: <Anonymous> | |
| ## tracemem[0x000000000a3cf640 -> 0x000000000a034458]: <Anonymous> | |
| ## tracemem[0x000000000a3cb500 -> 0x000000000a017948]: <Anonymous> | |
| ## tracemem[0x000000000a3cf640 -> 0x0000000009fdd0e0]: <Anonymous> | |
| ## tracemem[0x000000000a3cf640 -> 0x0000000009fa21f8]: <Anonymous> | |
| ## tracemem[0x000000000a3cf640 -> 0x0000000009cca190]: <Anonymous> | |
| ## tracemem[0x000000000a3cf640 -> 0x0000000009bc4da8]: <Anonymous> | |
| # | |
| # | |
| # how do other languages do this? | |
| # Stata has two flavors of recoding | |
| # the first is `generate` and `replace` | |
| # gen x = . | |
| # replace x = 1 if (conditions) | |
| # | |
| # that is analogous to `x[logical] <- value` notation in R | |
| # The other flavor is `generate` and `recode` | |
| # gen x2 = x | |
| # recode x2 (min/5=1) (6=2) (7/10=3) (11/max=4) | |
| # | |
| # SPSS has the same two options: | |
| # recode x (lo thru 5=1) (6=2) (7 thru 10=3) (11 thru hi=4) | |
| # execute. | |
| # | |
| # compute mpg3 = 1. | |
| # if (mpg >= 19) & (mpg <= 23) mpg3 = 2. | |
| # if (mpg >= 24) & (mpg <= 100) mpg3 = 3. | |
| # execute. | |
| # | |
| # pandas copies R implementation | |
| # | |
| # | |
| # | |
| # there is also the complication of R vector classes | |
| # the domain of each major class (logical, integer, numeric, character, factor) differs | |
| # this means that recoding functions should be S3 generics | |
| # | |
| # | |
| # how should character class vectors be treated? | |
| # there's no finite domain for the recoding function | |
| # how then should any checking be performed? | |
| # how should the recodes be performed? Lookup in a hash table? regex? | |
| # | |
| # | |
| # factors require special care because they're treated as integer but display as character | |
| # thus why they have recoding-like functions (e.g., `relevel()`) already | |
| # | |
| # another problem is that the domain of a recoding function might not be fully represented in | |
| # the actual data vector that is being recoded. For example, the domain of a recode function | |
| # might be 1,2,3,4,5, but the data vector being recoded might only contain 1,3,4 | |
| # this means that using empirical values in the data vector to create the recoding function | |
| # (e.g., max(x)) will be wrong. We need to be able to create a recoding function independent of | |
| # the actual data vector being recoded. | |
| # | |
| # there are various ways to define a function: | |
| # representations that work better for continuous than discrete functions | |
| # as an actual R expression: f <- function(x) { x^2 }; f <- function(x) { -x + (max(x)+min(x)) } | |
| # as a string representation: f <- "x^2"; f <- "1=5;2=4;3=3;4=2;5=1" | |
| # representations that work for discrete but not continuous functions: | |
| # as a list of mapped values: list(`1` = 5, `2` = 4, `3` = 3, `4` = 2, `5` = 1) | |
| # as a named vector: c(`1` = 5, `2` = 4, `3` = 3, `4` = 2, `5` = 1) | |
| # ifelse() expressions: f(x == 1, x == 2, x == 3, x == 4, x == 5, .recodes = c(1,2,3,4,5)) | |
| # | |
| # | |
| library("magrittr") | |
| NAfill <- function(x, value) `[<-`(x, is.na(x), value) | |
| below <- function(x, threshold, value) `[<-`(x, x < threshold, value) | |
| above <- function(x, threshold, value) `[<-`(x, x > threshold, value) | |
| at <- function(x, point, value) `[<-`(x, x == point, value) | |
| a %>% NAfill(3) | |
| # [1] 1 2 3 3 2 3 1 4 3 | |
| a %>% below(3, 10) | |
| # [1] 10 10 NA 3 10 NA 10 4 NA | |
| a %>% below(3, 10) %>% NAfill(0) | |
| # [1] 10 10 0 3 10 0 10 4 0 | |
| # the above approach is fine, but doesn't solve the many-to-many aspect of recoding | |
| # by modifying the intervening values, you have to create intermediate stages to avoid overwriting. | |
| a %>% at(3, 10) %>% at(2, 3) %>% at(10, 2) | |
| # could do something like a set of rules that are sequentially evaluated | |
| recode <- function(variable, ...) { | |
| rules <- list(...) | |
| out <- vector(length(variable)) | |
| # check whether rules are a function or if, for example, the domains of the rules overlap | |
| # execute the rules in order, creating a temporary variable | |
| } | |
| recode( | |
| variable = x, | |
| below(2, becomes = 1), | |
| between(2, 4, becomes = 0), | |
| between(4, 8, becomes = 2), | |
| above(8, becomes = NA) | |
| ) | |
| # the `findInterval()` function might be quite useful | |
| # it is implemented in C, so should be quite fast | |
| # it bins a variable (like cut): | |
| cut(1:5, c(0,1,5)) | |
| # except it returns which bin a value falls in: | |
| findInterval(1:5, c(0,1,5)) | |
| # we could use this to specify continuous ranges of input values via cutpoints and | |
| # then define the function for each of those input values | |
| fun <- function(x, cutpoints, recodes) { | |
| n <- length(cutpoints) | |
| out <- x | |
| i <- findInterval(x, cutpoints, rightmost.closed = TRUE, all.inside = FALSE) | |
| out[i != 0 & i < n] <- recodes[i != 0 & i < n] | |
| out | |
| } | |
| fun(-2:6, c(0,2,2.5,5), c("a", "c", "b")) | |
| # [1] "a" "c" "b" "b" NA | |
| stop_for_value <- function(value) { | |
| if (length(value) != 1) { | |
| stop("'value' must be length 1. Only first value used.") | |
| } | |
| } | |
| at <- function(x, value) { | |
| # discrete recode | |
| warn_for_value(value) | |
| structure(list(c(x,x), value), class = "recode_at") | |
| } | |
| from <- function(x, value) { | |
| # lower bound, including `x` | |
| stop_for_value(value) | |
| structure(list(c(x, Inf), value), class = "recode_from") | |
| } | |
| above <- function(x, value) { | |
| # lower bound, excluding `x` | |
| stop_for_value(value) | |
| structure(list(c(x, Inf), value), class = "recode_above") | |
| } | |
| to <- function(x, value) { | |
| # upper bound, including `x` | |
| stop_for_value(value) | |
| structure(list(c(-Inf, x), value), class = "recode_below") | |
| } | |
| below <- function(x, value) { | |
| # upper bound, excluding `x` | |
| stop_for_value(value) | |
| structure(list(c(-Inf, x), value), class = "recode_below") | |
| } | |
| na <- function(value) { | |
| # missing value recode | |
| stop_for_value(value) | |
| structure(list(NA, value[1]), class = "recode_na") | |
| } | |
| inside <- function(x, value) { | |
| # lower and upper bound, including `x` limits | |
| stop_for_value(value) | |
| stopifnot(length(x) == 2) | |
| structure(list(c(x[1], x[2]), value), class = "recode_between") | |
| } | |
| between <- function(x, value) { | |
| # lower and upper bound, excluding `x` limits | |
| stop_for_value(value) | |
| stopifnot(length(x) == 2) | |
| structure(list(c(x[1], x[2]), value), class = "recode_between") | |
| } | |
| of <- function(set, value) { | |
| # discrete recode | |
| stop_for_value(value) | |
| structure(list(set, value), class = "recode_set") | |
| } | |
| recode <- function(x, ...) { | |
| UseMethod("recode") | |
| } | |
| recode.default <- function(x, ...) { | |
| out <- x2 <- x | |
| NAs <- is.na(x) | |
| out[NAs] <- x[NAs] <- -Inf | |
| NAs_handled <- FALSE | |
| a <- list(...) | |
| for (i in seq_along(a)) { | |
| if (inherits(a[[i]], "recode_na")) { | |
| NAs_handled <- TRUE | |
| if (is.function(a[[i]][[2]])) { | |
| out[is.na(x2)] <- a[[i]][[2]](x[is.na(x2)]) | |
| } else { | |
| out[is.na(x2)] <- a[[i]][[2]] | |
| } | |
| } else if (inherits(a[[i]], "recode_set")) { | |
| if (is.function(a[[i]][[2]])) { | |
| out[x %in% a[[i]][[1]]] <- a[[i]][[2]](x %in% a[[i]][[1]]) | |
| } else { | |
| out[x %in% a[[i]][[1]]] <- a[[i]][[2]] | |
| } | |
| } else { | |
| where <- findInterval(x, a[[i]][[1]], rightmost.closed = TRUE, all.inside = FALSE) == 1 | |
| if (is.function(a[[i]][[2]])) { | |
| out[where] <- a[[i]][[2]](x[where]) | |
| } else { | |
| out[where] <- a[[i]][[2]] | |
| } | |
| } | |
| } | |
| if (!NAs_handled) { | |
| out[NAs] <- NA | |
| } | |
| out | |
| } | |
| x <- c(-2:5, NA, 7:10) | |
| square <- function(x) x ^ 2 | |
| cbind(x, recode(x, | |
| at(1, 4), | |
| at(2, 3), | |
| below(0, abs), | |
| na(Inf), | |
| between(c(5,100), -5), | |
| at(9, 6), | |
| of(c(8,10), 20) | |
| )) | |
| recode.character <- function(x, ...) { | |
| out <- x2 <- x | |
| NAs <- is.na(x) | |
| out[NAs] <- "MISSING" | |
| NAs_handled <- FALSE | |
| a <- list(...) | |
| for (i in seq_along(a)) { | |
| if (inherits(a[[i]], "recode_na")) { | |
| NAs_handled <- TRUE | |
| if (is.function(a[[i]][[2]])) { | |
| out[is.na(x2)] <- a[[i]][[2]](x2[is.na(x2)]) | |
| } else { | |
| out[is.na(x2)] <- a[[i]][[2]] | |
| } | |
| } else { | |
| if (is.function(a[[i]][[2]])) { | |
| out[x %in% a[[i]][[1]]] <- a[[i]][[2]](x %in% a[[i]][[1]]) | |
| } else { | |
| out[x %in% a[[i]][[1]]] <- a[[i]][[2]] | |
| } | |
| } | |
| } | |
| if (!NAs_handled) { | |
| out[NAs] <- NA_character_ | |
| } | |
| out | |
| } | |
| cbind(c(letters, NA), recode(c(letters, NA), | |
| na(2), | |
| between(c("g", "m"), "xx"), | |
| of(letters[1:5], "2") | |
| )) | |
| recode.factor <- function(x, ...) { | |
| as.factor(recode(as.character(x))) | |
| } | |
| # visualization | |
| # whatever is decided, I suspect it will be useful to have a tool to visualize the recode | |
| set.seed(1) | |
| x <- rnorm(1e4L) | |
| ## this will need to deal with interval versus discrete | |
| ## discrete values are best plotted using a matrix/image, while interval are better as a scatterplot | |
| plot(x, as.numeric(cut(x, 10)), | |
| main = "Recoding Pattern", | |
| xlab = "Original Variable", ylab = "Recoded Variable", | |
| xaxs = "i", yaxs = "i", | |
| pch = 19, las = 1, col = gray(.7, alpha = .1)) | |
| rug(x, side = 1, col = "red", lwd = 1, line = 0.7) | |
| rug(as.numeric(cut(x, 10)), side = 2, col = "blue", line = 0.7, lwd = 1) | |
| # an alternative visualization scheme would be a river plot: | |
| # http://dantalus.github.io/2014/11/25/river-plots/ | |
| # A recoding DSL could use a function as its first-class object. | |
| # This would allow interesting behavior, like previewing recoding consequences, via `curve()` | |
| # and other other similar behavior | |
| make_recode <- function(env = parent.frame()) { | |
| f <- function(x) { | |
| ifelse(x < 2, x + 1, x - 1) | |
| } | |
| return(f) | |
| } | |
| x <- 1:5 | |
| make_recode()(x) | |
| f <- make_recode() | |
| curve(f, from = 0, to = 4, n = 100) | |
| library("magrittr") | |
| make_recode() %>% curve(from = 0, to = 4, n = 100) | |
| # possibly relevant: https://github.com/larmarange/labelled |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment