Skip to content

Instantly share code, notes, and snippets.

@sidpat
Forked from mrdwab/cSplit.R
Created August 28, 2014 07:08
Show Gist options
  • Save sidpat/e37e99ac7fa15820fd08 to your computer and use it in GitHub Desktop.
Save sidpat/e37e99ac7fa15820fd08 to your computer and use it in GitHub Desktop.
### "Unbalanced" data
dat_ub <- data.frame(
header1 = LETTERS[1:5], header2 = LETTERS[6:10],
start = c("1,100", "11,222", "10", "7,8,9,10,11", "1"),
end = c("99,199,299", "33,444,1,2,3,4,5,6", "72", "10,9,8,7,6", "3"))
dat_ub$id <- with(dat_ub,
ave(rep(1, nrow(dat_ub)),
header1, header2,
FUN = seq_along))
### "Balanced" data
dat_b <- data.frame(
header1 = LETTERS[1:5], header2 = LETTERS[6:10],
start = c("1,100,200", "11,222", "10", "7,8,9,10,11", "1"),
end = c("99,199,299", "33,444", "72", "10,9,8,7,6", "3"))
dat_b$id <- with(dat_b,
ave(rep(1, nrow(dat_b)),
header1, header2,
FUN = seq_along))
### Bigger versions of each of the above
dat_50K_ub <- do.call(rbind, replicate(10000, dat_ub, FALSE))
dat_50K_ub$id <- with(dat_50K_ub,
ave(rep(1, nrow(dat_50K_ub)),
header1, header2,
FUN = seq_along))
dat_50K_b <- do.call(rbind, replicate(10000, dat_b, FALSE))
dat_50K_b$id <- with(dat_50K_b,
ave(rep(1, nrow(dat_50K_b)),
header1, header2,
FUN = seq_along))
### Test it out!
cSplit(dat_ub, c("start", "end"), ",", direction="long")
cSplit(dat_b, c("start", "end"), ",")
cSplit(dat_b, c("start", "end"), ",", makeEqual = TRUE)
cSplit(dat_50K_ub, c("start", "end"), ",")
cSplit <- function(indt, splitCols, sep = ",", direction = "wide",
makeEqual = NULL, fixed = TRUE) {
## requires data.table >= 1.8.11
require(data.table)
if (!is.data.table(indt)) setDT(indt)
if (any(!vapply(indt[, splitCols, with = FALSE],
is.character, logical(1L)))) {
indt[, eval(splitCols) := lapply(.SD, as.character),
.SDcols = splitCols]
}
X <- lapply(indt[, splitCols, with = FALSE], function(x) {
strsplit(x, split = sep, fixed = fixed)
})
if (direction == "long") {
if (is.null(makeEqual)) {
IV <- function(x,y) if (identical(x,y)) TRUE else FALSE
makeEqual <- ifelse(Reduce(IV, rapply(X, length, how = "list")),
FALSE, TRUE)
}
} else if (direction == "wide") {
if (!is.null(makeEqual)) {
if (!isTRUE(makeEqual)) {
message("makeEqual specified as FALSE but set to TRUE")
makeEqual <- TRUE
}
makeEqual <- TRUE
} else {
makeEqual <- TRUE
}
}
if (isTRUE(makeEqual)) {
SetUp <- lapply(seq_along(X), function(y) {
A <- vapply(X[[y]], length, 1L)
list(Mat = cbind(rep(seq_along(A), A), sequence(A)),
Val = unlist(X[[y]]))
})
Ncol <- max(unlist(lapply(SetUp, function(y) y[["Mat"]][, 2]),
use.names = FALSE))
X <- lapply(seq_along(SetUp), function(y) {
M <- matrix(NA_character_, nrow = nrow(indt), ncol = Ncol)
M[SetUp[[y]][["Mat"]]] <- SetUp[[y]][["Val"]]
M
})
if (direction == "wide") {
X <- lapply(seq_along(X), function(x) {
colnames(X[[x]]) <- paste(splitCols[x],
sequence(ncol(X[[x]])),
sep = "_")
X[[x]]
})
cbind(indt, do.call(cbind, X))[, eval(splitCols) := NULL][]
} else {
indt <- indt[rep(sequence(nrow(indt)), each = Ncol)]
X <- lapply(X, function(y) as.vector(t(y)))
indt[, eval(splitCols) := lapply(X, unlist, use.names = FALSE)][]
}
} else {
Rep <- vapply(X[[1]], length, integer(1L))
indt <- indt[rep(sequence(nrow(indt)), Rep)]
indt[, eval(splitCols) := lapply(X, unlist, use.names = FALSE)][]
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment