ramhiser · March 27, 2015 18:27
diff --git a/impute-naive.r b/impute-naive.r
 #' Naive imputation of missing data
 #'
 #' Imputes missing data in a data frame one column at a time, i.e., each
 #' column is treated univariately. Missing numeric values are replaced with
 #' the column median. Missing values in any other atomic column (factor,
 #' character, logical, ...) are replaced with the column mode; ties go to the
 #' first factor level, or to the smallest sorted value for other types.
 #'
 #' If \code{draw} is set to \code{TRUE}, missing data are drawn from a basic
 #' distribution to make the imputation slightly less naive. For continuous,
 #' values are drawn from a uniform distribution ranging from the min to max
 #' values observed within the column. For categorical, values are drawn from a
 #' multinomial distribution using the observed proportions of each value.
 #'
 #' Columns without missing values, columns that are entirely missing (there
 #' is nothing to estimate from) and non-atomic columns are returned unchanged.
 #'
 #' @param x data frame, or an object coercible by \code{as.data.frame}
 #' @param draw If \code{TRUE}, imputed values are drawn based on the data.
 #'   See details.
 #' @return data frame with the same columns and row names as \code{x} and
 #'   with missing values imputed
 #' @examples
 #' set.seed(42)
 #' n <- nrow(iris)
 #' iris_missing <- iris
 #' iris_missing$Sepal.Length <- replace(iris$Sepal.Length, sample(n, 10), NA)
 #' iris_missing$Species <- replace(iris$Species, sample(n, 10), NA)
 #'
 #' impute_naive(iris_missing)
 #' impute_naive(iris_missing, draw=TRUE)
 impute_naive <- function(x, draw=FALSE) {
   x <- as.data.frame(x)

   # Nothing to impute; also avoids assigning an empty list into `x[]`.
   if (length(x) == 0L) {
     return(x)
   }

   impute_column <- function(col) {
     if (!is.atomic(col)) {
       return(col)
     }

     is_missing <- is.na(col)
     num_na <- sum(is_missing)

     # Either nothing to fill in or no observed values to learn from.
     if (num_na == 0L || num_na == length(col)) {
       return(col)
     }

     if (is.numeric(col)) {
       if (draw) {
         col_range <- range(col, na.rm = TRUE)
         fill <- runif(n = num_na, min = col_range[1], max = col_range[2])
       } else {
         fill <- median(col, na.rm = TRUE)
       }
       return(replace(col, is_missing, fill))
     }

     # Categorical: factors use their declared levels, everything else
     # (character, logical, ...) uses its sorted distinct observed values.
     if (is.factor(col)) {
       values <- levels(col)
       codes <- as.integer(col)
     } else {
       observed <- col[!is_missing]
       values <- sort(unique(observed))
       codes <- match(observed, values)
     }
     # tabulate() silently ignores the NA codes of missing factor entries.
     counts <- tabulate(codes, nbins = length(values))

     if (draw) {
       picks <- sample.int(
         length(values),
         size = num_na,
         replace = TRUE,
         prob = counts / sum(counts)
       )
     } else {
       picks <- which.max(counts)
     }

     replace(col, is_missing, values[picks])
   }

   # Assigning into `x[]` keeps row names, column names and column types,
   # which rebuilding the frame with cbind.data.frame() does not guarantee.
   x[] <- lapply(x, impute_column)
   x
 }
	#' Naive imputation of missing data
	#'
	#' Imputes missing data in a data frame one column at a time, i.e., each
	#' column is treated univariately. Missing numeric values are replaced with
	#' the column median. Missing values in any other atomic column (factor,
	#' character, logical, ...) are replaced with the column mode; ties go to the
	#' first factor level, or to the smallest sorted value for other types.
	#'
	#' If \code{draw} is set to \code{TRUE}, missing data are drawn from a basic
	#' distribution to make the imputation slightly less naive. For continuous,
	#' values are drawn from a uniform distribution ranging from the min to max
	#' values observed within the column. For categorical, values are drawn from a
	#' multinomial distribution using the observed proportions of each value.
	#'
	#' Columns without missing values, columns that are entirely missing (there
	#' is nothing to estimate from) and non-atomic columns are returned unchanged.
	#'
	#' @param x data frame, or an object coercible by \code{as.data.frame}
	#' @param draw If \code{TRUE}, imputed values are drawn based on the data.
	#'   See details.
	#' @return data frame with the same columns and row names as \code{x} and
	#'   with missing values imputed
	#' @examples
	#' set.seed(42)
	#' n <- nrow(iris)
	#' iris_missing <- iris
	#' iris_missing$Sepal.Length <- replace(iris$Sepal.Length, sample(n, 10), NA)
	#' iris_missing$Species <- replace(iris$Species, sample(n, 10), NA)
	#'
	#' impute_naive(iris_missing)
	#' impute_naive(iris_missing, draw=TRUE)
	impute_naive <- function(x, draw=FALSE) {
	  x <- as.data.frame(x)

	  # Nothing to impute; also avoids assigning an empty list into `x[]`.
	  if (length(x) == 0L) {
	    return(x)
	  }

	  impute_column <- function(col) {
	    if (!is.atomic(col)) {
	      return(col)
	    }

	    is_missing <- is.na(col)
	    num_na <- sum(is_missing)

	    # Either nothing to fill in or no observed values to learn from.
	    if (num_na == 0L || num_na == length(col)) {
	      return(col)
	    }

	    if (is.numeric(col)) {
	      if (draw) {
	        col_range <- range(col, na.rm = TRUE)
	        fill <- runif(n = num_na, min = col_range[1], max = col_range[2])
	      } else {
	        fill <- median(col, na.rm = TRUE)
	      }
	      return(replace(col, is_missing, fill))
	    }

	    # Categorical: factors use their declared levels, everything else
	    # (character, logical, ...) uses its sorted distinct observed values.
	    if (is.factor(col)) {
	      values <- levels(col)
	      codes <- as.integer(col)
	    } else {
	      observed <- col[!is_missing]
	      values <- sort(unique(observed))
	      codes <- match(observed, values)
	    }
	    # tabulate() silently ignores the NA codes of missing factor entries.
	    counts <- tabulate(codes, nbins = length(values))

	    if (draw) {
	      picks <- sample.int(
	        length(values),
	        size = num_na,
	        replace = TRUE,
	        prob = counts / sum(counts)
	      )
	    } else {
	      picks <- which.max(counts)
	    }

	    replace(col, is_missing, values[picks])
	  }

	  # Assigning into `x[]` keeps row names, column names and column types,
	  # which rebuilding the frame with cbind.data.frame() does not guarantee.
	  x[] <- lapply(x, impute_column)
	  x
	}