leeper · April 3, 2017 12:52
diff --git a/recoding.R b/recoding.R
 # recoding is basically a function
 # x is a variable
 # f() is a function
 # f(x) is a recoding operation
 # 
 # implementation
 # if x is discrete, then f() is simple to implement as a map between input values and output values:
  # 1 = 0
  # 2 = 1
 # 
 # if x is continuous and f() is a simple transformation, it can also be simple, e.g.:
  # f(x) = x^2
  # f(x) = (x - 1)/2
 #
 # a major complication, however, is that R vectors have missing data, requiring all functions to either:
  # a) have a discrete argument to specify the translation of NAs
  # b) strictly pass missing values
 # 
 #
 # we have lots of specialized language for specific coding operations
 # - top-coding: a function that plateaus above a given point in its domain
 # - bottom-coding: a function that plateaus below a given point in its domain
 # - reverse coding: a function replaces each value in a vector with the corresponding value in the reversed vector
 # - binning: a step function (that converts a continuous vector into a categorical vector
 # - relabeling: a function that takes discrete categories as input and returns other discrete categories
 # - centering: aligning a vector so that its central tendency is (typically) 0
 # - scaling: a function of the form f(x) = a*x
 # - imputation: a function f(x) = x, except for the handling of missing values according to a rule
 #
 #
 # useful distinction?
 # - dimension reducing recode (`unbranch()`): collapse or recode multiple variables into one variable
 # - dimension expanding recode (`branch()`): expand one or more variables into more variables
 #
 #
 # An example: reverse coding in R
 x <- 1:5

 ifelse(x == 1, 5, ifelse(x == 2, 4, ifelse(x == 3, 3, ifelse(x == 4, 2, ifelse(x == 5, 1, NA)))))

 -x + (max(x)+min(x))

 x2 <- x
 x2[x == 1] <- 5
 x2[x == 2] <- 4
 x2[x == 3] <- 3
 x2[x == 4] <- 2
 x2[x == 5] <- 1
 
 s <- scale(x, center = TRUE, scale = FALSE)
 -s[,1] + attributes(s)[["scaled:center"]]

 car::recode(x, "1=5;2=4;3=3;4=2;5=1")



 # better ways to do this?
 # the question from an R context is how to express a recoding function compactly
 #
 # car::recode() is quite compact, but it is optimized for discrete values not continuous variables
 # it also has terrible memory usage:
 x <- 1:5
 tracemem(x)
 car::recode(x, "1=5;2=4;3=3;4=2;5=1")
 ## tracemem[0x000000000a3cf640 -> 0x000000000a3cb500]: <Anonymous> 
 ## tracemem[0x000000000a3cf640 -> 0x000000000a034458]: <Anonymous> 
 ## tracemem[0x000000000a3cb500 -> 0x000000000a017948]: <Anonymous> 
 ## tracemem[0x000000000a3cf640 -> 0x0000000009fdd0e0]: <Anonymous> 
 ## tracemem[0x000000000a3cf640 -> 0x0000000009fa21f8]: <Anonymous> 
 ## tracemem[0x000000000a3cf640 -> 0x0000000009cca190]: <Anonymous> 
 ## tracemem[0x000000000a3cf640 -> 0x0000000009bc4da8]: <Anonymous>
 #
 #
 # how do other languages do this?
 # Stata has two flavors of recoding
 # the first is `generate` and `replace`
 # gen x = .
 # replace x = 1 if (conditions)
 #
 # that is analogous to `x[logical] <- value` notation in R
 # The other flavor is `generate` and `recode`
 # gen x2 = x
 # recode x2 (min/5=1) (6=2) (7/10=3) (11/max=4)
 #
 # SPSS has the same two options:
 # recode x (lo thru 5=1) (6=2) (7 thru 10=3) (11 thru hi=4)
 # execute.
 # 
 # compute mpg3 = 1.
 # if (mpg >= 19) & (mpg <= 23)  mpg3 = 2.
 # if (mpg >= 24) & (mpg <= 100) mpg3 = 3.
 # execute.
 #
 # pandas copies R implementation
 #
 #
 #
 # there is also the complication of R vector classes
 # the domain of each major class (logical, integer, numeric, character, factor) differs
 # this means that recoding functions should be S3 generics
 #
 #
 # how should character class vectors be treated?
 # there's no finite domain for the recoding function
 # how then should any checking be performed?
 # how should the recodes be performed? Lookup in a hash table? regex?
 #
 #
 # factors require special care because they're treated as integer but display as character
 # thus why they have recoding-like functions (e.g., `relevel()`) already
 #
 # another problem is that the domain of a recoding function might not be fully represented in
 # the actual data vector that is being recoded. For example, the domain of a recode function
 # might be 1,2,3,4,5, but the data vector being recoded might only contain 1,3,4
 # this means that using empirical values in the data vector to create the recoding function 
 # (e.g., max(x)) will be wrong. We need to be able to create a recoding function independent of
 # the actual data vector being recoded.
 #
 # there are various ways to define a function:
  
  # representations that work better for continuous than discrete functions
  # as an actual R expression: f <- function(x) { x^2 }; f <- function(x) { -x + (max(x)+min(x)) }
  # as a string representation: f <- "x^2"; f <- "1=5;2=4;3=3;4=2;5=1"
  
  # representations that work for discrete but not continuous functions:
  # as a list of mapped values: list(`1` = 5, `2` = 4, `3` = 3, `4` = 2, `5` = 1)
  # as a named vector: c(`1` = 5, `2` = 4, `3` = 3, `4` = 2, `5` = 1)
  # ifelse() expressions: f(x == 1, x == 2, x == 3, x == 4, x == 5, .recodes = c(1,2,3,4,5))
 #
 #

 library("magrittr")
 NAfill <- function(x, value) `[<-`(x, is.na(x), value)
 below <- function(x, threshold, value) `[<-`(x, x < threshold, value)
 above <- function(x, threshold, value) `[<-`(x, x > threshold, value)
 at <- function(x, point, value) `[<-`(x, x == point, value)

 a %>% NAfill(3)
 # [1] 1 2 3 3 2 3 1 4 3
 a %>% below(3, 10)
 # [1] 10 10 NA  3 10 NA 10  4 NA
 a %>% below(3, 10) %>% NAfill(0)
 # [1] 10 10  0  3 10  0 10  4  0

 # the above approach is fine, but doesn't solve the many-to-many aspect of recoding
 # by modifying the intervening values, you have to create intermediate stages to avoid overwriting.
 a %>% at(3, 10) %>% at(2, 3) %>% at(10, 2)



 # could do something like a set of rules that are sequentially evaluated
 recode <- function(variable, ...) { 
  rules <- list(...)
  out <- vector(length(variable))
  # check whether rules are a function or if, for example, the domains of the rules overlap
  # execute the rules in order, creating a temporary variable
 }
 recode(
  variable = x,
  below(2, becomes = 1),
  between(2, 4, becomes = 0),
  between(4, 8, becomes = 2),
  above(8, becomes = NA)
 )

 # the `findInterval()` function might be quite useful
 # it is implemented in C, so should be quite fast
 # it bins a variable (like cut):
 cut(1:5, c(0,1,5))
 # except it returns which bin a value falls in:
 findInterval(1:5, c(0,1,5))
 # we could use this to specify continuous ranges of input values via cutpoints and 
 # then define the function for each of those input values
 fun <- function(x, cutpoints, recodes) {
  n <- length(cutpoints)
  out <- x
  i <- findInterval(x, cutpoints, rightmost.closed = TRUE, all.inside = FALSE)
  out[i != 0 & i < n] <- recodes[i != 0 & i < n]
  out
 }
 fun(-2:6, c(0,2,2.5,5), c("a", "c", "b"))
 # [1] "a" "c" "b" "b" NA

 stop_for_value <- function(value) {
  if (length(value) != 1) {
    stop("'value' must be length 1. Only first value used.")
  }
 }
 at <- function(x, value) {
  # discrete recode
  warn_for_value(value)
  structure(list(c(x,x), value), class = "recode_at")
 }
 from <- function(x, value) {
  # lower bound, including `x`
  stop_for_value(value)
  structure(list(c(x, Inf), value), class = "recode_from")
 }
 above <- function(x, value) {
  # lower bound, excluding `x`
  stop_for_value(value)
  structure(list(c(x, Inf), value), class = "recode_above")
 }
 to <- function(x, value) {
  # upper bound, including `x`
  stop_for_value(value)
  structure(list(c(-Inf, x), value), class = "recode_below")
 }
 below <- function(x, value) {
  # upper bound, excluding `x`
  stop_for_value(value)
  structure(list(c(-Inf, x), value), class = "recode_below")
 }
 na <- function(value) {
  # missing value recode
  stop_for_value(value)
  structure(list(NA, value[1]), class = "recode_na")
 }
 inside <- function(x, value) {
  # lower and upper bound, including `x` limits
  stop_for_value(value)
  stopifnot(length(x) == 2)
  structure(list(c(x[1], x[2]), value), class = "recode_between")
 }
 between <- function(x, value) {
  # lower and upper bound, excluding `x` limits
  stop_for_value(value)
  stopifnot(length(x) == 2)
  structure(list(c(x[1], x[2]), value), class = "recode_between")
 }
 of <- function(set, value) {
  # discrete recode
  stop_for_value(value)
  structure(list(set, value), class = "recode_set")
 }

 recode <- function(x, ...) {
  UseMethod("recode")
 }

 recode.default <- function(x, ...) {
  out <- x2 <- x
  NAs <- is.na(x)
  out[NAs] <- x[NAs] <- -Inf
  NAs_handled <- FALSE
  a <- list(...)
  for (i in seq_along(a)) {
    if (inherits(a[[i]], "recode_na")) {
      NAs_handled <- TRUE
      if (is.function(a[[i]][[2]])) {
        out[is.na(x2)] <- a[[i]][[2]](x[is.na(x2)])
      } else {
        out[is.na(x2)] <- a[[i]][[2]]
      }
    } else if (inherits(a[[i]], "recode_set")) {
      if (is.function(a[[i]][[2]])) {
        out[x %in% a[[i]][[1]]] <- a[[i]][[2]](x %in% a[[i]][[1]])
      } else {
        out[x %in% a[[i]][[1]]] <- a[[i]][[2]]
      }
    } else {
      where <- findInterval(x, a[[i]][[1]], rightmost.closed = TRUE, all.inside = FALSE) == 1
      if (is.function(a[[i]][[2]])) {
        out[where] <- a[[i]][[2]](x[where])
      } else {
        out[where] <- a[[i]][[2]]
      }
    }
  }
  if (!NAs_handled) {
    out[NAs] <- NA
  }
  out
 }
 x <- c(-2:5, NA, 7:10)
 square <- function(x) x ^ 2
 cbind(x, recode(x,
  at(1, 4),
  at(2, 3),
  below(0, abs),
  na(Inf),
  between(c(5,100), -5),
  at(9, 6),
  of(c(8,10), 20)
 ))

 recode.character <- function(x, ...) {
  out <- x2 <- x
  NAs <- is.na(x)
  out[NAs] <- "MISSING"
  NAs_handled <- FALSE
  a <- list(...)
  for (i in seq_along(a)) {
    if (inherits(a[[i]], "recode_na")) {
      NAs_handled <- TRUE
      if (is.function(a[[i]][[2]])) {
        out[is.na(x2)] <- a[[i]][[2]](x2[is.na(x2)])
      } else {
        out[is.na(x2)] <- a[[i]][[2]]
      }
    } else {
      if (is.function(a[[i]][[2]])) {
        out[x %in% a[[i]][[1]]] <- a[[i]][[2]](x %in% a[[i]][[1]])
      } else {
        out[x %in% a[[i]][[1]]] <- a[[i]][[2]]
      }
    }
  }
  if (!NAs_handled) {
    out[NAs] <- NA_character_
  }
  out
 }

 cbind(c(letters, NA), recode(c(letters, NA),
  na(2),
  between(c("g", "m"), "xx"),
  of(letters[1:5], "2")
 ))

 recode.factor <- function(x, ...) {
  as.factor(recode(as.character(x)))
 }


 # visualization
 # whatever is decided, I suspect it will be useful to have a tool to visualize the recode
 set.seed(1)
 x <- rnorm(1e4L)
 ## this will need to deal with interval versus discrete
 ## discrete values are best plotted using a matrix/image, while interval are better as a scatterplot
 plot(x, as.numeric(cut(x, 10)), 
     main = "Recoding Pattern",
     xlab = "Original Variable", ylab = "Recoded Variable",
     xaxs = "i", yaxs = "i",
     pch = 19, las = 1, col = gray(.7, alpha = .1))
 rug(x, side = 1, col = "red", lwd = 1, line = 0.7)
 rug(as.numeric(cut(x, 10)), side = 2, col = "blue", line = 0.7, lwd = 1)

 # an alternative visualization scheme would be a river plot:
 # http://dantalus.github.io/2014/11/25/river-plots/


 # A recoding DSL could use a function as its first-class object.
 # This would allow interesting behavior, like previewing recoding consequences, via `curve()`
 # and other other similar behavior
 make_recode <- function(env = parent.frame()) { 
  f <- function(x) {
    ifelse(x < 2, x + 1, x - 1)
  }
  return(f)
 }

 x <- 1:5
 make_recode()(x)
 f <- make_recode()
 curve(f, from = 0, to = 4, n = 100)

 library("magrittr")
 make_recode() %>% curve(from = 0, to = 4, n = 100)



 # possibly relevant: https://github.com/larmarange/labelled
	# recoding is basically a function
	# x is a variable
	# f() is a function
	# f(x) is a recoding operation
	#
	# implementation
	# if x is discrete, then f() is simple to implement as a map between input values and output values:
	# 1 = 0
	# 2 = 1
	#
	# if x is continuous and f() is a simple transformation, it can also be simple, e.g.:
	# f(x) = x^2
	# f(x) = (x - 1)/2
	#
	# a major complication, however, is that R vectors have missing data, requiring all functions to either:
	# a) have a discrete argument to specify the translation of NAs
	# b) strictly pass missing values
	#
	#
	# we have lots of specialized language for specific coding operations
	# - top-coding: a function that plateaus above a given point in its domain
	# - bottom-coding: a function that plateaus below a given point in its domain
	# - reverse coding: a function replaces each value in a vector with the corresponding value in the reversed vector
	# - binning: a step function (that converts a continuous vector into a categorical vector
	# - relabeling: a function that takes discrete categories as input and returns other discrete categories
	# - centering: aligning a vector so that its central tendency is (typically) 0
	# - scaling: a function of the form f(x) = a*x
	# - imputation: a function f(x) = x, except for the handling of missing values according to a rule
	#
	#
	# useful distinction?
	# - dimension reducing recode (`unbranch()`): collapse or recode multiple variables into one variable
	# - dimension expanding recode (`branch()`): expand one or more variables into more variables
	#
	#
	# An example: reverse coding in R
	x <- 1:5

	ifelse(x == 1, 5, ifelse(x == 2, 4, ifelse(x == 3, 3, ifelse(x == 4, 2, ifelse(x == 5, 1, NA)))))

	-x + (max(x)+min(x))

	x2 <- x
	x2[x == 1] <- 5
	x2[x == 2] <- 4
	x2[x == 3] <- 3
	x2[x == 4] <- 2
	x2[x == 5] <- 1

	s <- scale(x, center = TRUE, scale = FALSE)
	-s[,1] + attributes(s)[["scaled:center"]]

	car::recode(x, "1=5;2=4;3=3;4=2;5=1")



	# better ways to do this?
	# the question from an R context is how to express a recoding function compactly
	#
	# car::recode() is quite compact, but it is optimized for discrete values not continuous variables
	# it also has terrible memory usage:
	x <- 1:5
	tracemem(x)
	car::recode(x, "1=5;2=4;3=3;4=2;5=1")
	## tracemem[0x000000000a3cf640 -> 0x000000000a3cb500]: <Anonymous>
	## tracemem[0x000000000a3cf640 -> 0x000000000a034458]: <Anonymous>
	## tracemem[0x000000000a3cb500 -> 0x000000000a017948]: <Anonymous>
	## tracemem[0x000000000a3cf640 -> 0x0000000009fdd0e0]: <Anonymous>
	## tracemem[0x000000000a3cf640 -> 0x0000000009fa21f8]: <Anonymous>
	## tracemem[0x000000000a3cf640 -> 0x0000000009cca190]: <Anonymous>
	## tracemem[0x000000000a3cf640 -> 0x0000000009bc4da8]: <Anonymous>
	#
	#
	# how do other languages do this?
	# Stata has two flavors of recoding
	# the first is `generate` and `replace`
	# gen x = .
	# replace x = 1 if (conditions)
	#
	# that is analogous to `x[logical] <- value` notation in R
	# The other flavor is `generate` and `recode`
	# gen x2 = x
	# recode x2 (min/5=1) (6=2) (7/10=3) (11/max=4)
	#
	# SPSS has the same two options:
	# recode x (lo thru 5=1) (6=2) (7 thru 10=3) (11 thru hi=4)
	# execute.
	#
	# compute mpg3 = 1.
	# if (mpg >= 19) & (mpg <= 23) mpg3 = 2.
	# if (mpg >= 24) & (mpg <= 100) mpg3 = 3.
	# execute.
	#
	# pandas copies R implementation
	#
	#
	#
	# there is also the complication of R vector classes
	# the domain of each major class (logical, integer, numeric, character, factor) differs
	# this means that recoding functions should be S3 generics
	#
	#
	# how should character class vectors be treated?
	# there's no finite domain for the recoding function
	# how then should any checking be performed?
	# how should the recodes be performed? Lookup in a hash table? regex?
	#
	#
	# factors require special care because they're treated as integer but display as character
	# thus why they have recoding-like functions (e.g., `relevel()`) already
	#
	# another problem is that the domain of a recoding function might not be fully represented in
	# the actual data vector that is being recoded. For example, the domain of a recode function
	# might be 1,2,3,4,5, but the data vector being recoded might only contain 1,3,4
	# this means that using empirical values in the data vector to create the recoding function
	# (e.g., max(x)) will be wrong. We need to be able to create a recoding function independent of
	# the actual data vector being recoded.
	#
	# there are various ways to define a function:

	# representations that work better for continuous than discrete functions
	# as an actual R expression: f <- function(x) { x^2 }; f <- function(x) { -x + (max(x)+min(x)) }
	# as a string representation: f <- "x^2"; f <- "1=5;2=4;3=3;4=2;5=1"

	# representations that work for discrete but not continuous functions:
	# as a list of mapped values: list(`1` = 5, `2` = 4, `3` = 3, `4` = 2, `5` = 1)
	# as a named vector: c(`1` = 5, `2` = 4, `3` = 3, `4` = 2, `5` = 1)
	# ifelse() expressions: f(x == 1, x == 2, x == 3, x == 4, x == 5, .recodes = c(1,2,3,4,5))
	#
	#

	library("magrittr")
	NAfill <- function(x, value) `[<-`(x, is.na(x), value)
	below <- function(x, threshold, value) `[<-`(x, x < threshold, value)
	above <- function(x, threshold, value) `[<-`(x, x > threshold, value)
	at <- function(x, point, value) `[<-`(x, x == point, value)

	a %>% NAfill(3)
	# [1] 1 2 3 3 2 3 1 4 3
	a %>% below(3, 10)
	# [1] 10 10 NA 3 10 NA 10 4 NA
	a %>% below(3, 10) %>% NAfill(0)
	# [1] 10 10 0 3 10 0 10 4 0

	# the above approach is fine, but doesn't solve the many-to-many aspect of recoding
	# by modifying the intervening values, you have to create intermediate stages to avoid overwriting.
	a %>% at(3, 10) %>% at(2, 3) %>% at(10, 2)



	# could do something like a set of rules that are sequentially evaluated
	recode <- function(variable, ...) {
	rules <- list(...)
	out <- vector(length(variable))
	# check whether rules are a function or if, for example, the domains of the rules overlap
	# execute the rules in order, creating a temporary variable
	}
	recode(
	variable = x,
	below(2, becomes = 1),
	between(2, 4, becomes = 0),
	between(4, 8, becomes = 2),
	above(8, becomes = NA)
	)

	# the `findInterval()` function might be quite useful
	# it is implemented in C, so should be quite fast
	# it bins a variable (like cut):
	cut(1:5, c(0,1,5))
	# except it returns which bin a value falls in:
	findInterval(1:5, c(0,1,5))
	# we could use this to specify continuous ranges of input values via cutpoints and
	# then define the function for each of those input values
	fun <- function(x, cutpoints, recodes) {
	n <- length(cutpoints)
	out <- x
	i <- findInterval(x, cutpoints, rightmost.closed = TRUE, all.inside = FALSE)
	out[i != 0 & i < n] <- recodes[i != 0 & i < n]
	out
	}
	fun(-2:6, c(0,2,2.5,5), c("a", "c", "b"))
	# [1] "a" "c" "b" "b" NA

	stop_for_value <- function(value) {
	if (length(value) != 1) {
	stop("'value' must be length 1. Only first value used.")
	}
	}
	at <- function(x, value) {
	# discrete recode
	warn_for_value(value)
	structure(list(c(x,x), value), class = "recode_at")
	}
	from <- function(x, value) {
	# lower bound, including `x`
	stop_for_value(value)
	structure(list(c(x, Inf), value), class = "recode_from")
	}
	above <- function(x, value) {
	# lower bound, excluding `x`
	stop_for_value(value)
	structure(list(c(x, Inf), value), class = "recode_above")
	}
	to <- function(x, value) {
	# upper bound, including `x`
	stop_for_value(value)
	structure(list(c(-Inf, x), value), class = "recode_below")
	}
	below <- function(x, value) {
	# upper bound, excluding `x`
	stop_for_value(value)
	structure(list(c(-Inf, x), value), class = "recode_below")
	}
	na <- function(value) {
	# missing value recode
	stop_for_value(value)
	structure(list(NA, value[1]), class = "recode_na")
	}
	inside <- function(x, value) {
	# lower and upper bound, including `x` limits
	stop_for_value(value)
	stopifnot(length(x) == 2)
	structure(list(c(x[1], x[2]), value), class = "recode_between")
	}
	between <- function(x, value) {
	# lower and upper bound, excluding `x` limits
	stop_for_value(value)
	stopifnot(length(x) == 2)
	structure(list(c(x[1], x[2]), value), class = "recode_between")
	}
	of <- function(set, value) {
	# discrete recode
	stop_for_value(value)
	structure(list(set, value), class = "recode_set")
	}

	recode <- function(x, ...) {
	UseMethod("recode")
	}

	recode.default <- function(x, ...) {
	out <- x2 <- x
	NAs <- is.na(x)
	out[NAs] <- x[NAs] <- -Inf
	NAs_handled <- FALSE
	a <- list(...)
	for (i in seq_along(a)) {
	if (inherits(a[[i]], "recode_na")) {
	NAs_handled <- TRUE
	if (is.function(a[[i]][[2]])) {
	out[is.na(x2)] <- a[[i]][[2]](x[is.na(x2)])
	} else {
	out[is.na(x2)] <- a[[i]][[2]]
	}
	} else if (inherits(a[[i]], "recode_set")) {
	if (is.function(a[[i]][[2]])) {
	out[x %in% a[[i]][[1]]] <- a[[i]][[2]](x %in% a[[i]][[1]])
	} else {
	out[x %in% a[[i]][[1]]] <- a[[i]][[2]]
	}
	} else {
	where <- findInterval(x, a[[i]][[1]], rightmost.closed = TRUE, all.inside = FALSE) == 1
	if (is.function(a[[i]][[2]])) {
	out[where] <- a[[i]][[2]](x[where])
	} else {
	out[where] <- a[[i]][[2]]
	}
	}
	}
	if (!NAs_handled) {
	out[NAs] <- NA
	}
	out
	}
	x <- c(-2:5, NA, 7:10)
	square <- function(x) x ^ 2
	cbind(x, recode(x,
	at(1, 4),
	at(2, 3),
	below(0, abs),
	na(Inf),
	between(c(5,100), -5),
	at(9, 6),
	of(c(8,10), 20)
	))

	recode.character <- function(x, ...) {
	out <- x2 <- x
	NAs <- is.na(x)
	out[NAs] <- "MISSING"
	NAs_handled <- FALSE
	a <- list(...)
	for (i in seq_along(a)) {
	if (inherits(a[[i]], "recode_na")) {
	NAs_handled <- TRUE
	if (is.function(a[[i]][[2]])) {
	out[is.na(x2)] <- a[[i]][[2]](x2[is.na(x2)])
	} else {
	out[is.na(x2)] <- a[[i]][[2]]
	}
	} else {
	if (is.function(a[[i]][[2]])) {
	out[x %in% a[[i]][[1]]] <- a[[i]][[2]](x %in% a[[i]][[1]])
	} else {
	out[x %in% a[[i]][[1]]] <- a[[i]][[2]]
	}
	}
	}
	if (!NAs_handled) {
	out[NAs] <- NA_character_
	}
	out
	}

	cbind(c(letters, NA), recode(c(letters, NA),
	na(2),
	between(c("g", "m"), "xx"),
	of(letters[1:5], "2")
	))

	recode.factor <- function(x, ...) {
	as.factor(recode(as.character(x)))
	}


	# visualization
	# whatever is decided, I suspect it will be useful to have a tool to visualize the recode
	set.seed(1)
	x <- rnorm(1e4L)
	## this will need to deal with interval versus discrete
	## discrete values are best plotted using a matrix/image, while interval are better as a scatterplot
	plot(x, as.numeric(cut(x, 10)),
	main = "Recoding Pattern",
	xlab = "Original Variable", ylab = "Recoded Variable",
	xaxs = "i", yaxs = "i",
	pch = 19, las = 1, col = gray(.7, alpha = .1))
	rug(x, side = 1, col = "red", lwd = 1, line = 0.7)
	rug(as.numeric(cut(x, 10)), side = 2, col = "blue", line = 0.7, lwd = 1)

	# an alternative visualization scheme would be a river plot:
	# http://dantalus.github.io/2014/11/25/river-plots/


	# A recoding DSL could use a function as its first-class object.
	# This would allow interesting behavior, like previewing recoding consequences, via `curve()`
	# and other other similar behavior
	make_recode <- function(env = parent.frame()) {
	f <- function(x) {
	ifelse(x < 2, x + 1, x - 1)
	}
	return(f)
	}

	x <- 1:5
	make_recode()(x)
	f <- make_recode()
	curve(f, from = 0, to = 4, n = 100)

	library("magrittr")
	make_recode() %>% curve(from = 0, to = 4, n = 100)



	# possibly relevant: https://github.com/larmarange/labelled
No results found