Created
March 27, 2015 18:27
-
-
Save ramhiser/4729c049aa0c6bd3c862 to your computer and use it in GitHub Desktop.
Naive imputation of missing data within an R data frame
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Naive imputation of missing data
#'
#' Imputes missing data in a data frame one column at a time, i.e.,
#' univariately. Missing numeric values are replaced with the column median.
#' Missing values in any other column (factor, character, logical, ...) are
#' replaced with the column mode; ties are broken in favour of the first level
#' (factors) or the smallest observed value (other types).
#'
#' If \code{draw} is set to \code{TRUE}, missing data are drawn from a basic
#' distribution to make the imputation slightly less naive. For numeric
#' columns, values are drawn from a uniform distribution ranging from the min
#' to the max value observed within the column. For all other columns, values
#' are drawn from a multinomial distribution using the observed proportions.
#'
#' Columns without missing values, and columns in which every value is
#' missing (so there is nothing to impute from), are returned unchanged.
#' Row names and column names of the input are preserved.
#'
#' @param x data frame, or an object coercible with \code{as.data.frame}
#' @param draw If \code{TRUE}, imputed values are drawn at random based on the
#'   observed data instead of using the median/mode. See details.
#' @return data frame with missing values imputed
#' @examples
#' set.seed(42)
#' n <- nrow(iris)
#' iris_missing <- iris
#' iris_missing$Sepal.Length <- replace(iris$Sepal.Length, sample(n, 10), NA)
#' iris_missing$Species <- replace(iris$Species, sample(n, 10), NA)
#'
#' impute_naive(iris_missing)
#' impute_naive(iris_missing, draw = TRUE)
impute_naive <- function(x, draw = FALSE) {
  x <- as.data.frame(x)
  if (ncol(x) == 0L) {
    return(x)
  }

  impute_column <- function(col) {
    which_na <- which(is.na(col))
    num_na <- length(which_na)

    # Nothing to fill in, or no observed values to base the imputation on.
    if (num_na == 0L || num_na == length(col)) {
      return(col)
    }

    if (is.numeric(col)) {
      if (!draw) {
        col[which_na] <- median(col, na.rm = TRUE)
      } else {
        col_range <- range(col, na.rm = TRUE)
        col[which_na] <- runif(num_na, min = col_range[1], max = col_range[2])
      }
      return(col)
    }

    # Categorical-like column. Factors keep their declared levels (including
    # unused ones, which get zero weight); other types use the sorted unique
    # observed values so the replacement has the same type as the column.
    observed <- col[-which_na]
    candidates <- if (is.factor(col)) levels(col) else sort(unique(observed))
    counts <- tabulate(match(observed, candidates), nbins = length(candidates))

    if (!draw) {
      picked <- which.max(counts)
    } else {
      picked <- sample.int(
        length(candidates), num_na,
        replace = TRUE, prob = counts / sum(counts)
      )
    }
    col[which_na] <- candidates[picked]
    col
  }

  # Assign back into `x` (rather than rebuilding with cbind) so that row
  # names, column names and column types of the input are retained.
  x[] <- lapply(x, impute_column)
  x
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment