Skip to content

Instantly share code, notes, and snippets.

@mrdwab
Last active March 14, 2023 05:03
Show Gist options
  • Save mrdwab/11380733 to your computer and use it in GitHub Desktop.
Save mrdwab/11380733 to your computer and use it in GitHub Desktop.
The faster version of `concat.split` that makes use of `data.table` efficiency.
### "Unbalanced" data
dat_ub <- data.frame(
header1 = LETTERS[1:5], header2 = LETTERS[6:10],
start = c("1,100", "11,222", "10", "7,8,9,10,11", "1"),
end = c("99,199,299", "33,444,1,2,3,4,5,6", "72", "10,9,8,7,6", "3"))
dat_ub$id <- with(dat_ub,
ave(rep(1, nrow(dat_ub)),
header1, header2,
FUN = seq_along))
### "Balanced" data
dat_b <- data.frame(
header1 = LETTERS[1:5], header2 = LETTERS[6:10],
start = c("1,100,200", "11,222", "10", "7,8,9,10,11", "1"),
end = c("99,199,299", "33,444", "72", "10,9,8,7,6", "3"))
dat_b$id <- with(dat_b,
ave(rep(1, nrow(dat_b)),
header1, header2,
FUN = seq_along))
### Bigger versions of each of the above
dat_50K_ub <- do.call(rbind, replicate(10000, dat_ub, FALSE))
dat_50K_ub$id <- with(dat_50K_ub,
ave(rep(1, nrow(dat_50K_ub)),
header1, header2,
FUN = seq_along))
dat_50K_b <- do.call(rbind, replicate(10000, dat_b, FALSE))
dat_50K_b$id <- with(dat_50K_b,
ave(rep(1, nrow(dat_50K_b)),
header1, header2,
FUN = seq_along))
### Test it out!
cSplit(dat_ub, c("start", "end"), ",", direction="long")
cSplit(dat_b, c("start", "end"), ",")
cSplit(dat_b, c("start", "end"), ",", makeEqual = TRUE)
cSplit(dat_50K_ub, c("start", "end"), ",")
cSplit <- function(indt, splitCols, sep = ",", direction = "wide",
makeEqual = NULL, fixed = TRUE, drop = TRUE,
stripWhite = FALSE) {
message("`cSplit` is now part of the 'splitstackshape' package (V1.4.0)")
## requires data.table >= 1.8.11
require(data.table)
if (!is.data.table(indt)) setDT(indt)
if (is.numeric(splitCols)) splitCols <- names(indt)[splitCols]
if (any(!vapply(indt[, splitCols, with = FALSE],
is.character, logical(1L)))) {
indt[, eval(splitCols) := lapply(.SD, as.character),
.SDcols = splitCols]
}
if (length(sep) == 1)
sep <- rep(sep, length(splitCols))
if (length(sep) != length(splitCols)) {
stop("Verify you have entered the correct number of sep")
}
if (isTRUE(stripWhite)) {
indt[, eval(splitCols) := mapply(function(x, y)
gsub(sprintf("\\s+%s\\s+|\\s+%s|%s\\s+",
x, x, x), x, y),
sep, indt[, splitCols, with = FALSE],
SIMPLIFY = FALSE)]
}
X <- lapply(seq_along(splitCols), function(x) {
strsplit(indt[[splitCols[x]]], split = sep[x], fixed = fixed)
})
if (direction == "long") {
if (is.null(makeEqual)) {
IV <- function(x,y) if (identical(x,y)) TRUE else FALSE
makeEqual <- ifelse(Reduce(IV, rapply(X, length, how = "list")),
FALSE, TRUE)
}
} else if (direction == "wide") {
if (!is.null(makeEqual)) {
if (!isTRUE(makeEqual)) {
message("makeEqual specified as FALSE but set to TRUE")
makeEqual <- TRUE
}
makeEqual <- TRUE
} else {
makeEqual <- TRUE
}
}
if (isTRUE(makeEqual)) {
SetUp <- lapply(seq_along(X), function(y) {
A <- vapply(X[[y]], length, 1L)
list(Mat = cbind(rep(seq_along(A), A), sequence(A)),
Val = unlist(X[[y]]))
})
Ncol <- max(unlist(lapply(SetUp, function(y) y[["Mat"]][, 2]),
use.names = FALSE))
X <- lapply(seq_along(SetUp), function(y) {
M <- matrix(NA_character_, nrow = nrow(indt), ncol = Ncol)
M[SetUp[[y]][["Mat"]]] <- SetUp[[y]][["Val"]]
M
})
if (direction == "wide") {
X <- lapply(seq_along(X), function(x) {
colnames(X[[x]]) <- paste(splitCols[x],
sequence(ncol(X[[x]])),
sep = "_")
X[[x]]
})
if (isTRUE(drop)) {
cbind(indt, do.call(cbind, X))[, eval(splitCols) := NULL][]
} else {
cbind(indt, do.call(cbind, X))
}
} else {
indt <- indt[rep(sequence(nrow(indt)), each = Ncol)]
X <- lapply(X, function(y) as.vector(t(y)))
indt[, eval(splitCols) := lapply(X, unlist, use.names = FALSE)][]
}
} else {
Rep <- vapply(X[[1]], length, integer(1L))
indt <- indt[rep(sequence(nrow(indt)), Rep)]
indt[, eval(splitCols) := lapply(X, unlist, use.names = FALSE)][]
}
}
@sidpat
Copy link

sidpat commented Aug 28, 2014

Surprising fast. Can you add the progress bar functionality? since have a very large dataset to process. Thanks

@mrdwab
Copy link
Author

mrdwab commented Sep 2, 2014

@harmonica2, I'm not sure that I see the need for such a feature, but once setDF is implemented in "data.table" it should be easy enough to incorporate in the function.

@sidpat, I've never actually figured out how to incorporate progress bars in my functions. If you have a lead on how to do so, please feel free to fork the function and try it out. I'd love to know what you find out!

Sorry for the late responses--GitHub doesn't notify us on Gist comments :-(

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment