Created
July 5, 2012 21:37
-
-
Save aaronwolen/3056626 to your computer and use it in GitHub Desktop.
dataframetools: A few simple functions for performing simple tasks with data.frames
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# dataframetools | |
# A few simple functions for performing simple tasks with data.frames | |
# --- | |
# Includes functions for: | |
# | |
# - reordering data.frames | |
# - identifying invariant or blank columns | |
# - identifying groups of columns that are redundant with each other | |
# - converting all columns of class factor to class character | |
#' Easily reorder columns | |
#' | |
#' @param df data frame to modify | |
#' @param order named numeric vector, numbers indicate new location of named column | |
#' | |
#' @examples | |
#' reorder_cols(mtcars, c("hp" = 1, "mpg" = 6)) | |
reorder_cols <- function(df, order) { | |
old_order <- names(df) | |
new_order <- rep(NA, length(old_order)) | |
new_order[order] <- names(order) | |
new_order[is.na(new_order)] <- setdiff(old_order, names(order)) | |
return(df[, new_order]) | |
} | |
#' Identify invariant columns | |
#' | |
#' @param data data.frame or matrix | |
#' | |
#' @examples | |
#' x <- data.frame(a = letters[1:10], b = "a") | |
#' invariant_cols(x) | |
invariant_cols <- function(df) { | |
# Convert blanks to NA | |
df <- replace(df, df == "", NA) | |
col.levels <- apply(df, M = 2, function(x) | |
length(unique(as.character(x)))) | |
return(names(col.levels)[col.levels == 1]) | |
} | |
#' Identify groups of redundant columns | |
#' | |
#' @param df data.frame | |
#' | |
#' @return list, each element of which contains labels of redundant columns | |
#' | |
#' @examples | |
#' x <- mtcars | |
#' x$mpg2 <- mtcars$mpg | |
#' x$mpg3 <- mtcars$mpg | |
#' x$disp2 <- mtcars$disp | |
#' redundant_cols(x) | |
redundant_cols <- function(df) { | |
# Identify all possible column comparisons | |
cols <- matrix(names(df)[t(combn(ncol(df), 2))], ncol = 2) | |
# Check for identical columns | |
redundant <- apply(cols, 1, function(x) identical(df[, x[1]], df[, x[2]])) | |
if(!any(redundant)) { | |
return(NULL) | |
} | |
# Edge-list of redundant columns | |
hits <- matrix(cols[redundant,], ncol = 2) | |
# Split into groups of redundant variables | |
# (all strings in each element correspond to columns with identical content) | |
groups <- list(hits[1,]) | |
hits <- hits[-1,] | |
while(nrow(hits) > 0) { | |
i <- length(groups) | |
# Any other rows overlap with current group? | |
row.hits <- apply(hits, 1, function(row) any(row %in% groups[[i]])) | |
if( any(row.hits) ) { | |
groups[[i]] <- unique(c(groups[[i]], hits[row.hits,])) | |
hits <- matrix(hits[!row.hits, ], ncol = 2) | |
} else { | |
groups[[i + 1]] <- as.character(hits[1, ]) | |
hits <- matrix(hits[-1, ], ncol = 2) | |
} | |
} | |
return(groups) | |
} | |
#' Convert factor columns to character vectors | |
#' | |
#' @param df data.frame | |
#' | |
#' @examples | |
#' iris <- defactor(iris) | |
#' class(iris$Species) | |
defactor <- function(df) { | |
stopifnot(class(df) == "data.frame") | |
df.l <- as.list(df) | |
f.cols <- unlist(lapply(df.l, is.factor)) | |
f.cols <- names(f.cols)[f.cols] | |
for(c in f.cols) { | |
df[, c] <- as.character(df[, c]) | |
} | |
return(df) | |
} | |
#' Set class of specified columns to either numeric or factor. The class of all | |
#' columns not specified in num.cols or fac.cols will be set to character. | |
#' | |
#' @param df data.frame | |
#' @param num.cols columns that should be of class numeric | |
#' @param fac.cols columns that should be of class factor | |
#' | |
classify_columns <- function(df, num.cols, fac.cols) { | |
stopifnot(class(df) == "data.frame") | |
df <- as.list(df) | |
if(!missing(fac.cols)) { | |
stopifnot(all(fac.cols %in% names(df))) | |
df[!names(df) %in% fac.cols] <- lapply(df[!names(df) %in% fac.cols], as.character) | |
df[fac.cols] <- lapply(df[fac.cols], as.factor) | |
} else { | |
df <- lapply(df, as.character) | |
} | |
if(!missing(num.cols)) { | |
stopifnot(all(num.cols %in% names(df))) | |
df[num.cols] <- lapply(df[num.cols], as.numeric) | |
} | |
return(as.data.frame(df, stringsAsFactors = FALSE)) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment