Skip to content

Instantly share code, notes, and snippets.

@aaronwolen
Created July 5, 2012 21:37
Show Gist options
  • Save aaronwolen/3056626 to your computer and use it in GitHub Desktop.
Save aaronwolen/3056626 to your computer and use it in GitHub Desktop.
dataframetools: A few simple functions for performing simple tasks with data.frames
# dataframetools
# A few simple functions for performing simple tasks with data.frames
# ---
# Includes functions for:
#
# - reordering data.frames
# - identifying invariant or blank columns
# - identifying groups of columns that are redundant with each other
# - converting all columns of class factor to class character
#' Easily reorder columns
#'
#' @param df data frame to modify
#' @param order named numeric vector, numbers indicate new location of named column
#'
#' @examples
#' reorder_cols(mtcars, c("hp" = 1, "mpg" = 6))
reorder_cols <- function(df, order) {
old_order <- names(df)
new_order <- rep(NA, length(old_order))
new_order[order] <- names(order)
new_order[is.na(new_order)] <- setdiff(old_order, names(order))
return(df[, new_order])
}
#' Identify invariant columns
#'
#' @param data data.frame or matrix
#'
#' @examples
#' x <- data.frame(a = letters[1:10], b = "a")
#' invariant_cols(x)
invariant_cols <- function(df) {
# Convert blanks to NA
df <- replace(df, df == "", NA)
col.levels <- apply(df, M = 2, function(x)
length(unique(as.character(x))))
return(names(col.levels)[col.levels == 1])
}
#' Identify groups of redundant columns
#'
#' @param df data.frame
#'
#' @return list, each element of which contains labels of redundant columns
#'
#' @examples
#' x <- mtcars
#' x$mpg2 <- mtcars$mpg
#' x$mpg3 <- mtcars$mpg
#' x$disp2 <- mtcars$disp
#' redundant_cols(x)
redundant_cols <- function(df) {
# Identify all possible column comparisons
cols <- matrix(names(df)[t(combn(ncol(df), 2))], ncol = 2)
# Check for identical columns
redundant <- apply(cols, 1, function(x) identical(df[, x[1]], df[, x[2]]))
if(!any(redundant)) {
return(NULL)
}
# Edge-list of redundant columns
hits <- matrix(cols[redundant,], ncol = 2)
# Split into groups of redundant variables
# (all strings in each element correspond to columns with identical content)
groups <- list(hits[1,])
hits <- hits[-1,]
while(nrow(hits) > 0) {
i <- length(groups)
# Any other rows overlap with current group?
row.hits <- apply(hits, 1, function(row) any(row %in% groups[[i]]))
if( any(row.hits) ) {
groups[[i]] <- unique(c(groups[[i]], hits[row.hits,]))
hits <- matrix(hits[!row.hits, ], ncol = 2)
} else {
groups[[i + 1]] <- as.character(hits[1, ])
hits <- matrix(hits[-1, ], ncol = 2)
}
}
return(groups)
}
#' Convert factor columns to character vectors
#'
#' @param df data.frame
#'
#' @examples
#' iris <- defactor(iris)
#' class(iris$Species)
defactor <- function(df) {
stopifnot(class(df) == "data.frame")
df.l <- as.list(df)
f.cols <- unlist(lapply(df.l, is.factor))
f.cols <- names(f.cols)[f.cols]
for(c in f.cols) {
df[, c] <- as.character(df[, c])
}
return(df)
}
#' Set class of specified columns to either numeric or factor. The class of all
#' columns not specified in num.cols or fac.cols will be set to character.
#'
#' @param df data.frame
#' @param num.cols columns that should be of class numeric
#' @param fac.cols columns that should be of class factor
#'
classify_columns <- function(df, num.cols, fac.cols) {
stopifnot(class(df) == "data.frame")
df <- as.list(df)
if(!missing(fac.cols)) {
stopifnot(all(fac.cols %in% names(df)))
df[!names(df) %in% fac.cols] <- lapply(df[!names(df) %in% fac.cols], as.character)
df[fac.cols] <- lapply(df[fac.cols], as.factor)
} else {
df <- lapply(df, as.character)
}
if(!missing(num.cols)) {
stopifnot(all(num.cols %in% names(df)))
df[num.cols] <- lapply(df[num.cols], as.numeric)
}
return(as.data.frame(df, stringsAsFactors = FALSE))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment