mrdwab · March 14, 2023 05:03 · sidpat · Aug 28, 2014 · mrdwab · Sep 2, 2014
diff --git a/csLong_sample.R b/csLong_sample.R
 ### Sample inputs for exercising cSplit()

 ## "Unbalanced": within a row, `start` and `end` may hold different numbers
 ## of comma-separated items.
 dat_ub <- data.frame(
   header1 = LETTERS[1:5],
   header2 = LETTERS[6:10],
   start = c("1,100", "11,222", "10", "7,8,9,10,11", "1"),
   end = c("99,199,299", "33,444,1,2,3,4,5,6", "72", "10,9,8,7,6", "3")
 )
 ## Running counter within each header1/header2 combination.
 dat_ub[["id"]] <- ave(rep(1, nrow(dat_ub)),
                       dat_ub[["header1"]], dat_ub[["header2"]],
                       FUN = seq_along)

 ## "Balanced": `start` and `end` hold the same number of items in every row.
 dat_b <- data.frame(
   header1 = LETTERS[1:5],
   header2 = LETTERS[6:10],
   start = c("1,100,200", "11,222", "10", "7,8,9,10,11", "1"),
   end = c("99,199,299", "33,444", "72", "10,9,8,7,6", "3")
 )
 dat_b[["id"]] <- ave(rep(1, nrow(dat_b)),
                      dat_b[["header1"]], dat_b[["header2"]],
                      FUN = seq_along)

 ## 50,000-row versions: 10,000 stacked copies of each 5-row table, with the
 ## id counter recomputed so it keeps increasing within each header pair.
 dat_50K_ub <- do.call(rbind, rep(list(dat_ub), 10000))
 dat_50K_ub[["id"]] <- ave(rep(1, nrow(dat_50K_ub)),
                           dat_50K_ub[["header1"]], dat_50K_ub[["header2"]],
                           FUN = seq_along)

 dat_50K_b <- do.call(rbind, rep(list(dat_b), 10000))
 dat_50K_b[["id"]] <- ave(rep(1, nrow(dat_50K_b)),
                          dat_50K_b[["header1"]], dat_50K_b[["header2"]],
                          FUN = seq_along)

 ### Try cSplit() on the samples

 cSplit(dat_ub, splitCols = c("start", "end"), sep = ",", direction = "long")
 cSplit(dat_b, splitCols = c("start", "end"), sep = ",")
 cSplit(dat_b, splitCols = c("start", "end"), sep = ",", makeEqual = TRUE)
 cSplit(dat_50K_ub, splitCols = c("start", "end"), sep = ",")
diff --git a/cSplit.R b/cSplit.R
 #' Split delimited columns into extra columns ("wide") or extra rows ("long")
 #'
 #' Emits a message on every call pointing to the 'splitstackshape' package.
 #'
 #' @param indt A data.frame or data.table. The function works on a copy, so
 #'   the caller's object is never modified.
 #' @param splitCols Names or numeric positions of the columns to split.
 #' @param sep Delimiter(s): a single value used for every column, or one
 #'   value per entry of `splitCols`.
 #' @param direction "wide" appends `<column>_<n>` columns; "long" stacks the
 #'   pieces into additional rows. Any other value also stacks rows, but
 #'   without the automatic balance check described under `makeEqual`.
 #' @param makeEqual Pad every split to a common width with `NA`? Always
 #'   forced to `TRUE` for "wide". For "long", `NULL` means "pad only when the
 #'   split columns disagree on the number of pieces per row".
 #' @param fixed Passed on to `strsplit()`.
 #' @param drop "wide" only: remove the original `splitCols` columns?
 #' @param stripWhite Remove whitespace around the delimiter before splitting?
 #' @return A data.table.
 cSplit <- function(indt, splitCols, sep = ",", direction = "wide",
                    makeEqual = NULL, fixed = TRUE, drop = TRUE,
                    stripWhite = FALSE) {
   message("`cSplit` is now part of the 'splitstackshape' package (V1.4.0)")
   ## requires data.table >= 1.8.11; library() stops with a clear error when
   ## the package is missing, whereas require() would only return FALSE.
   library(data.table)
   ## Work on a private copy: the `:=` calls below update columns by
   ## reference and must not leak into the caller's object.
   indt <- if (is.data.table(indt)) copy(indt) else as.data.table(indt)
   if (is.numeric(splitCols)) splitCols <- names(indt)[splitCols]
   if (any(!vapply(indt[, splitCols, with = FALSE],
                   is.character, logical(1L)))) {
     ## strsplit() needs character input (e.g. factors are converted here).
     indt[, eval(splitCols) := lapply(.SD, as.character),
          .SDcols = splitCols]
   }

   if (length(sep) == 1)
     sep <- rep(sep, length(splitCols))
   if (length(sep) != length(splitCols)) {
     stop("Verify you have entered the correct number of sep")
   }

   if (isTRUE(stripWhite)) {
     ## NOTE: `sep` is interpolated into a regular expression here,
     ## regardless of the value of `fixed`.
     indt[, eval(splitCols) := mapply(function(x, y)
       gsub(sprintf("\\s+%s\\s+|\\s+%s|%s\\s+",
                    x, x, x), x, y),
       sep, indt[, splitCols, with = FALSE],
       SIMPLIFY = FALSE)]
   }

   ## One list per split column; each element holds the pieces of one row.
   X <- lapply(seq_along(splitCols), function(x) {
     strsplit(indt[[splitCols[x]]], split = sep[x], fixed = fixed)
   })

   if (direction == "long") {
     if (is.null(makeEqual)) {
       ## Pad only when the columns disagree on the pieces-per-row counts.
       ## Every column is compared against the first one, which stays
       ## correct for any number of split columns.
       counts <- lapply(X, function(pieces) {
         vapply(pieces, length, integer(1L))
       })
       balanced <- all(vapply(counts[-1L], identical, logical(1L),
                              counts[[1L]]))
       makeEqual <- !balanced
     }
   } else if (direction == "wide") {
     if (!is.null(makeEqual) && !isTRUE(makeEqual)) {
       message("makeEqual specified as FALSE but set to TRUE")
     }
     makeEqual <- TRUE
   }
   if (isTRUE(makeEqual)) {
     ## For each column: (row, position) index pairs plus the flat values.
     SetUp <- lapply(seq_along(X), function(y) {
       A <- vapply(X[[y]], length, 1L)
       list(Mat = cbind(rep(seq_along(A), A), sequence(A)),
            Val = unlist(X[[y]]))
     })
     ## Common width: the largest number of pieces found in any cell.
     Ncol <- max(unlist(lapply(SetUp, function(y) y[["Mat"]][, 2]),
                        use.names = FALSE))
     ## Fill an NA matrix per column through matrix indexing; cells without
     ## a piece stay NA.
     X <- lapply(seq_along(SetUp), function(y) {
       M <- matrix(NA_character_, nrow = nrow(indt), ncol = Ncol)
       M[SetUp[[y]][["Mat"]]] <- SetUp[[y]][["Val"]]
       M
     })
     if (direction == "wide") {
       X <- lapply(seq_along(X), function(x) {
         colnames(X[[x]]) <- paste(splitCols[x],
                                   sequence(ncol(X[[x]])),
                                   sep = "_")
         X[[x]]
       })
       if (isTRUE(drop)) {
         cbind(indt, do.call(cbind, X))[, eval(splitCols) := NULL][]
       } else {
         cbind(indt, do.call(cbind, X))
       }
     } else {
       ## Repeat every row Ncol times and flatten each matrix row by row so
       ## the pieces line up with the repeated rows.
       indt <- indt[rep(sequence(nrow(indt)), each = Ncol)]
       X <- lapply(X, function(y) as.vector(t(y)))
       indt[, eval(splitCols) := lapply(X, unlist, use.names = FALSE)][]
     }
   } else {
     ## No padding: repeat each row once per piece of the first split column.
     Rep <- vapply(X[[1]], length, integer(1L))
     indt <- indt[rep(sequence(nrow(indt)), Rep)]
     indt[, eval(splitCols) := lapply(X, unlist, use.names = FALSE)][]
   }
 }
	### Sample inputs for exercising cSplit()

	## "Unbalanced": within a row, `start` and `end` may hold different numbers
	## of comma-separated items.
	dat_ub <- data.frame(
	  header1 = LETTERS[1:5],
	  header2 = LETTERS[6:10],
	  start = c("1,100", "11,222", "10", "7,8,9,10,11", "1"),
	  end = c("99,199,299", "33,444,1,2,3,4,5,6", "72", "10,9,8,7,6", "3")
	)
	## Running counter within each header1/header2 combination.
	dat_ub[["id"]] <- ave(rep(1, nrow(dat_ub)),
	                      dat_ub[["header1"]], dat_ub[["header2"]],
	                      FUN = seq_along)

	## "Balanced": `start` and `end` hold the same number of items in every row.
	dat_b <- data.frame(
	  header1 = LETTERS[1:5],
	  header2 = LETTERS[6:10],
	  start = c("1,100,200", "11,222", "10", "7,8,9,10,11", "1"),
	  end = c("99,199,299", "33,444", "72", "10,9,8,7,6", "3")
	)
	dat_b[["id"]] <- ave(rep(1, nrow(dat_b)),
	                     dat_b[["header1"]], dat_b[["header2"]],
	                     FUN = seq_along)

	## 50,000-row versions: 10,000 stacked copies of each 5-row table, with the
	## id counter recomputed so it keeps increasing within each header pair.
	dat_50K_ub <- do.call(rbind, rep(list(dat_ub), 10000))
	dat_50K_ub[["id"]] <- ave(rep(1, nrow(dat_50K_ub)),
	                          dat_50K_ub[["header1"]], dat_50K_ub[["header2"]],
	                          FUN = seq_along)

	dat_50K_b <- do.call(rbind, rep(list(dat_b), 10000))
	dat_50K_b[["id"]] <- ave(rep(1, nrow(dat_50K_b)),
	                         dat_50K_b[["header1"]], dat_50K_b[["header2"]],
	                         FUN = seq_along)

	### Try cSplit() on the samples

	cSplit(dat_ub, splitCols = c("start", "end"), sep = ",", direction = "long")
	cSplit(dat_b, splitCols = c("start", "end"), sep = ",")
	cSplit(dat_b, splitCols = c("start", "end"), sep = ",", makeEqual = TRUE)
	cSplit(dat_50K_ub, splitCols = c("start", "end"), sep = ",")
	#' Split delimited columns into extra columns ("wide") or extra rows ("long")
	#'
	#' Emits a message on every call pointing to the 'splitstackshape' package.
	#'
	#' @param indt A data.frame or data.table. The function works on a copy, so
	#'   the caller's object is never modified.
	#' @param splitCols Names or numeric positions of the columns to split.
	#' @param sep Delimiter(s): a single value used for every column, or one
	#'   value per entry of `splitCols`.
	#' @param direction "wide" appends `<column>_<n>` columns; "long" stacks the
	#'   pieces into additional rows. Any other value also stacks rows, but
	#'   without the automatic balance check described under `makeEqual`.
	#' @param makeEqual Pad every split to a common width with `NA`? Always
	#'   forced to `TRUE` for "wide". For "long", `NULL` means "pad only when the
	#'   split columns disagree on the number of pieces per row".
	#' @param fixed Passed on to `strsplit()`.
	#' @param drop "wide" only: remove the original `splitCols` columns?
	#' @param stripWhite Remove whitespace around the delimiter before splitting?
	#' @return A data.table.
	cSplit <- function(indt, splitCols, sep = ",", direction = "wide",
	                   makeEqual = NULL, fixed = TRUE, drop = TRUE,
	                   stripWhite = FALSE) {
	  message("`cSplit` is now part of the 'splitstackshape' package (V1.4.0)")
	  ## requires data.table >= 1.8.11; library() stops with a clear error when
	  ## the package is missing, whereas require() would only return FALSE.
	  library(data.table)
	  ## Work on a private copy: the `:=` calls below update columns by
	  ## reference and must not leak into the caller's object.
	  indt <- if (is.data.table(indt)) copy(indt) else as.data.table(indt)
	  if (is.numeric(splitCols)) splitCols <- names(indt)[splitCols]
	  if (any(!vapply(indt[, splitCols, with = FALSE],
	                  is.character, logical(1L)))) {
	    ## strsplit() needs character input (e.g. factors are converted here).
	    indt[, eval(splitCols) := lapply(.SD, as.character),
	         .SDcols = splitCols]
	  }

	  if (length(sep) == 1)
	    sep <- rep(sep, length(splitCols))
	  if (length(sep) != length(splitCols)) {
	    stop("Verify you have entered the correct number of sep")
	  }

	  if (isTRUE(stripWhite)) {
	    ## NOTE: `sep` is interpolated into a regular expression here,
	    ## regardless of the value of `fixed`. The alternation uses a plain
	    ## `|`; a backslash-escaped pipe is not a valid R string escape.
	    indt[, eval(splitCols) := mapply(function(x, y)
	      gsub(sprintf("\\s+%s\\s+|\\s+%s|%s\\s+",
	                   x, x, x), x, y),
	      sep, indt[, splitCols, with = FALSE],
	      SIMPLIFY = FALSE)]
	  }

	  ## One list per split column; each element holds the pieces of one row.
	  X <- lapply(seq_along(splitCols), function(x) {
	    strsplit(indt[[splitCols[x]]], split = sep[x], fixed = fixed)
	  })

	  if (direction == "long") {
	    if (is.null(makeEqual)) {
	      ## Pad only when the columns disagree on the pieces-per-row counts.
	      ## Every column is compared against the first one, which stays
	      ## correct for any number of split columns.
	      counts <- lapply(X, function(pieces) {
	        vapply(pieces, length, integer(1L))
	      })
	      balanced <- all(vapply(counts[-1L], identical, logical(1L),
	                             counts[[1L]]))
	      makeEqual <- !balanced
	    }
	  } else if (direction == "wide") {
	    if (!is.null(makeEqual) && !isTRUE(makeEqual)) {
	      message("makeEqual specified as FALSE but set to TRUE")
	    }
	    makeEqual <- TRUE
	  }
	  if (isTRUE(makeEqual)) {
	    ## For each column: (row, position) index pairs plus the flat values.
	    SetUp <- lapply(seq_along(X), function(y) {
	      A <- vapply(X[[y]], length, 1L)
	      list(Mat = cbind(rep(seq_along(A), A), sequence(A)),
	           Val = unlist(X[[y]]))
	    })
	    ## Common width: the largest number of pieces found in any cell.
	    Ncol <- max(unlist(lapply(SetUp, function(y) y[["Mat"]][, 2]),
	                       use.names = FALSE))
	    ## Fill an NA matrix per column through matrix indexing; cells without
	    ## a piece stay NA.
	    X <- lapply(seq_along(SetUp), function(y) {
	      M <- matrix(NA_character_, nrow = nrow(indt), ncol = Ncol)
	      M[SetUp[[y]][["Mat"]]] <- SetUp[[y]][["Val"]]
	      M
	    })
	    if (direction == "wide") {
	      X <- lapply(seq_along(X), function(x) {
	        colnames(X[[x]]) <- paste(splitCols[x],
	                                  sequence(ncol(X[[x]])),
	                                  sep = "_")
	        X[[x]]
	      })
	      if (isTRUE(drop)) {
	        cbind(indt, do.call(cbind, X))[, eval(splitCols) := NULL][]
	      } else {
	        cbind(indt, do.call(cbind, X))
	      }
	    } else {
	      ## Repeat every row Ncol times and flatten each matrix row by row so
	      ## the pieces line up with the repeated rows.
	      indt <- indt[rep(sequence(nrow(indt)), each = Ncol)]
	      X <- lapply(X, function(y) as.vector(t(y)))
	      indt[, eval(splitCols) := lapply(X, unlist, use.names = FALSE)][]
	    }
	  } else {
	    ## No padding: repeat each row once per piece of the first split column.
	    Rep <- vapply(X[[1]], length, integer(1L))
	    indt <- indt[rep(sequence(nrow(indt)), Rep)]
	    indt[, eval(splitCols) := lapply(X, unlist, use.names = FALSE)][]
	  }
	}