mrdwab · March 10, 2018 11:39 · mrdwab · Mar 8, 2018
diff --git a/cSplit_fread_tests.R b/cSplit_fread_tests.R
 dt1 <- fread("V1 V2 V3
             x b;c;d 1
             y d;ef  2
             z d;ef  3
             m tmp   4
             n tmp   5
             n x;yz  5")
 dt1[4, V2:=''] # this record will be lost because V2 value is empty string
 dt1[5, V2:=NA_character_] # NA value is processed correctly

 # Expect 11 x 3 data.table. Empty values in rows 8 & 9
 cSplit_fread(dt1, "V2", ";", "long")

 # Expect 18 x 3 data.table, 3 rows for each original row
 cSplit_fread(dt1, "V2", ";", "long", makeEqual = TRUE)

 dt2 <- fread("V1 V2       V3
              x  xA;xB;xC x1;x2;x3
              y  yD       y1
              z  zF;zG    z1")

 # Expect 6 x 3, NA at [6, 3]
 cSplit_fread(dt2, c("V2", "V3"), ";")

 DT <- structure(list(Name = structure(
  c(2L, 1L, 3L), .Label = c("Marcia", "Marcos", "Miguel"), class = "factor"), 
  Likes = structure(1:3, .Label = c("1,2,4,5,6", "1,2,5", "1,2,5,6"), class = "factor"), 
  Siblings = structure(c(3L, 1L, 2L), .Label = c("", "Parks , Ada", "Scott "), class = "factor"), 
  Hates = structure(c(2L, 1L, 3L), .Label = c("1;2;4;", "1;3;4;", "3;"), class = "factor")), 
  .Names = c("Name", "Likes", "Siblings", "Hates"), row.names = c(39L, 44L, 36L), class = "data.frame")
 setDT(DT)

 # Expect 1 + 5 + 2 + 4 columns
 cSplit_fread(DT, c("Likes", "Siblings", "Hates"), c(",", ",", ";"))

 # Expect 1 + 5 + 2 + 4 columns
 # Expect "Siblings" to have extra whitespace
 cSplit_fread(DT, c("Likes", "Siblings", "Hates"), c(",", ",", ";"), stripWhite = FALSE)

 # Expect 1 + 5 + 5 + 5 columns
 cSplit_fread(DT, c("Likes", "Siblings", "Hates"), c(",", ",", ";"), makeEqual = TRUE)

 # Expect 12 x 4 data.table
 cSplit_fread(DT, c("Likes", "Siblings", "Hates"), c(",", ",", ";"), "long")

 # Expect 15 x 4 data.table -- like melt(cSplit_fread(...), patterns(splitcols))
 cSplit_fread(DT, c("Likes", "Siblings", "Hates"), c(",", ",", ";"), "long")
diff --git a/fSplit.R b/fSplit.R
 cSplit_fread <- function(indt, splitCols, sep = ",", direction = "wide", 
                         fixed = TRUE, drop = TRUE, stripWhite = TRUE, 
                         makeEqual = NULL, type.convert = TRUE) {
  
  o_names <- names(indt)
  indt <- setDT(copy(indt))
  if (is.numeric(splitCols)) splitCols <- names(indt)[splitCols]
  if (length(sep) == 1) sep <- rep(sep, length(splitCols))
  if (length(sep) != length(splitCols)) stop("Wrong number of sep supplied")
  
  for (i in seq_along(splitCols)) {
    if (isTRUE(fixed)) {
      temp <- f_split(indt[[splitCols[i]]], sep[i], fixed, stripWhite, type.convert)
      set(indt, j = sprintf("%s_%d", splitCols[i], seq_along(temp)), value = temp)
      if (isTRUE(drop)) set(indt, j = splitCols[i], value = NULL)
    } else {
      temp <- t_split(indt[[splitCols[i]]], sep[i], fixed, stripWhite, type.convert)
      set(indt, j = sprintf("%s_%d", splitCols[i], seq_along(temp)), value = temp)
      if (isTRUE(drop)) set(indt, j = splitCols[i], value = NULL)
    }
  }
  
  if (direction == "wide") {
    if (isTRUE(makeEqual)) make_equal(indt, splitCols)[] else indt[]
  } else if (direction == "long") {
    indt[, ._ID1 := .I]
    out <- set(suppressWarnings(
      melt(make_equal(indt, splitCols), measure = patterns(splitCols),
           value.name = splitCols)), j = "variable", value = NULL)
    out[, ._ID2 := sequence(.N), ._ID1]
    setorderv(out, c("._ID1", "._ID2"), na.last = TRUE)
    out <- if (isTRUE(makeEqual)) out else out[!(long_fixer(out, splitCols) & ._ID2 != 1)]
    set(out, j = c("._ID1", "._ID2"), value = NULL)
    setcolorder(out, o_names)[]
  }
 }

 long_fixer <- function(indt, cols) {
  temp <- rowSums(is.na(indt[, ..cols])) + rowSums(indt[, ..cols] == "", na.rm = TRUE)
  temp == length(cols)
 }

 make_equal <- function(indt, splitCols) {
  check <- all_names(names(indt), stubs = splitCols, end_stub = FALSE)
  
  if (length(check[["miss"]]) > 0) {
    nat <- vapply(unname(check[["stubs"]]), function(x) {
      typeof(indt[[grep(x, names(indt), fixed = TRUE)[[1]]]])
    }, character(1L))
    
    for (i in seq_along(nat)) {
      COLS <- grep(names(nat[i]), check[["miss"]], value = TRUE, fixed = TRUE)
      if (length(COLS) > 0) indt[, (COLS) := NA_type(nat[i])]
    } 
  } 
  
  setcolorder(indt, check[["full_names"]])
 }

 NA_type <- function(string) {
  switch(string,
         double = NA_real_,
         integer = NA_integer_,
         complex = NA_complex_,
         character = NA_character_,
         NA)
 }

 all_names <- function(current_names, stubs, end_stub = FALSE) {
  stub_names <- grep(paste(stubs, collapse = "|"), current_names, value = TRUE)
  id_names <- grep(paste(stubs, collapse = "|"), current_names, value = TRUE, invert = TRUE)
  levs <- unique(gsub(paste(stubs, collapse = "|"), "", stub_names))
  stub_levs <- gsub("^\\^|\\$$", "", 
                    do.call(paste0, expand.grid(stubs, levs)[if (end_stub) 2:1 else 1:2]))
  full_names <- c(id_names, stub_levs[order(stub_levs)])
  levs <- gsub("^[[:punct:]]|[[:punct:]]$", "", levs)
  levs <- levs[order(levs)]
  miss <- setdiff(full_names, current_names)
  list(stubs = stubs,
       stub_names = stub_names, 
       id_names = id_names, 
       levs = levs, 
       stub_levs = stub_levs, 
       full_names = full_names, 
       miss = miss)
 }

 t_split <- function(vec, sep, fixed = TRUE, stripWhite = TRUE, type.convert = TRUE) {
  temp <- tstrsplit(vec, sep, fixed)
  if (!isTRUE(stripWhite) & isTRUE(type.convert)) {
    message("type.convert requires stripWhite = TRUE. Changing setting.")
  }
  if (isTRUE(stripWhite)) temp <- trim_list(temp, convert = type.convert)
  setDT(temp)[]
 }

 f_split <- function(vec, sep, fixed = TRUE, stripWhite = TRUE, type.convert = TRUE) {
  if ((sep == "") | (nchar(sep) > 1L) | (!isTRUE(fixed))) {
    temp <- t_split(vec, sep, fixed, stripWhite, type.convert)
  } else {
    VEC <- as.character(vec)
    ana <- if (anyNA(VEC)) is.na(VEC) else NULL
    anb <- !nzchar(VEC)
    
    if (is.null(ana) & !any(anb)) {
      VEC <- stri_flatten(VEC, collapse = "\n")
    } else {
      if (!is.null(ana)) VEC[which(ana)] <- sep
      if (any(anb)) VEC[which(anb)] <- sep
      VEC <- stri_flatten(VEC, collapse = "\n")
    }
    
    temp <- fread(VEC, sep = sep, fill = TRUE, 
                  blank.lines.skip = FALSE, header = FALSE,
                  colClasses = if (!isTRUE(type.convert)) "character" else NULL,
                  strip.white = stripWhite, logical01 = FALSE)[, lapply(
                    .SD, function(x) replace(x, x ==  "", NA))]
    
    if (length(temp) == 1L) {
      temp <- t_split(vec, sep, fixed, stripWhite, type.convert)
    } 
  }
  
  temp
 }

 ## TESTS -------------------

 cSplit_fread(concat.test, c("Likes", "Hates", "Siblings"), c(",", ";", ","))
 cSplit_fread(concat.test, c("Likes", "Hates", "Siblings"), c(",", ";", ","))
 cSplit_fread(concat.test, c("Likes", "Hates", "Siblings")) ## misses the ";" separated data
 cSplit_fread(concat.test, c("Likes", "Hates", "Siblings"), c(",", ";", ","), makeEqual = TRUE)
 cSplit_fread(concat.test, c("Likes", "Hates", "Siblings"), c(",", ";", ","), direction = "long")
 cSplit_fread(concat.test, c("Likes", "Hates", "Siblings"), c(",", ";", ","), direction = "long", makeEqual = FALSE)

 DT <- as.data.table(concat.test)
 DT_1000 <- rbindlist(replicate(ceiling(1000/nrow(DT)), DT, FALSE))
 DT_100K <- rbindlist(replicate(ceiling(100000/nrow(DT_1000)), DT_1000, FALSE))
 DT_10M <- rbindlist(replicate(100, DT_100K, FALSE))
	dt1 <- fread("V1 V2 V3
	x b;c;d 1
	y d;ef 2
	z d;ef 3
	m tmp 4
	n tmp 5
	n x;yz 5")
	dt1[4, V2:=''] # this record will be lost because V2 value is empty string
	dt1[5, V2:=NA_character_] # NA value is processed correctly

	# Expect 11 x 3 data.table. Empty values in rows 8 & 9
	cSplit_fread(dt1, "V2", ";", "long")

	# Expect 18 x 3 data.table, 3 rows for each original row
	cSplit_fread(dt1, "V2", ";", "long", makeEqual = TRUE)

	dt2 <- fread("V1 V2 V3
	x xA;xB;xC x1;x2;x3
	y yD y1
	z zF;zG z1")

	# Expect 6 x 3, NA at [6, 3]
	cSplit_fread(dt2, c("V2", "V3"), ";")

	DT <- structure(list(Name = structure(
	c(2L, 1L, 3L), .Label = c("Marcia", "Marcos", "Miguel"), class = "factor"),
	Likes = structure(1:3, .Label = c("1,2,4,5,6", "1,2,5", "1,2,5,6"), class = "factor"),
	Siblings = structure(c(3L, 1L, 2L), .Label = c("", "Parks , Ada", "Scott "), class = "factor"),
	Hates = structure(c(2L, 1L, 3L), .Label = c("1;2;4;", "1;3;4;", "3;"), class = "factor")),
	.Names = c("Name", "Likes", "Siblings", "Hates"), row.names = c(39L, 44L, 36L), class = "data.frame")
	setDT(DT)

	# Expect 1 + 5 + 2 + 4 columns
	cSplit_fread(DT, c("Likes", "Siblings", "Hates"), c(",", ",", ";"))

	# Expect 1 + 5 + 2 + 4 columns
	# Expect "Siblings" to have extra whitespace
	cSplit_fread(DT, c("Likes", "Siblings", "Hates"), c(",", ",", ";"), stripWhite = FALSE)

	# Expect 1 + 5 + 5 + 5 columns
	cSplit_fread(DT, c("Likes", "Siblings", "Hates"), c(",", ",", ";"), makeEqual = TRUE)

	# Expect 12 x 4 data.table
	cSplit_fread(DT, c("Likes", "Siblings", "Hates"), c(",", ",", ";"), "long")

	# Expect 15 x 4 data.table -- like melt(cSplit_fread(...), patterns(splitcols))
	cSplit_fread(DT, c("Likes", "Siblings", "Hates"), c(",", ",", ";"), "long")
	cSplit_fread <- function(indt, splitCols, sep = ",", direction = "wide",
	fixed = TRUE, drop = TRUE, stripWhite = TRUE,
	makeEqual = NULL, type.convert = TRUE) {

	o_names <- names(indt)
	indt <- setDT(copy(indt))
	if (is.numeric(splitCols)) splitCols <- names(indt)[splitCols]
	if (length(sep) == 1) sep <- rep(sep, length(splitCols))
	if (length(sep) != length(splitCols)) stop("Wrong number of sep supplied")

	for (i in seq_along(splitCols)) {
	if (isTRUE(fixed)) {
	temp <- f_split(indt[[splitCols[i]]], sep[i], fixed, stripWhite, type.convert)
	set(indt, j = sprintf("%s_%d", splitCols[i], seq_along(temp)), value = temp)
	if (isTRUE(drop)) set(indt, j = splitCols[i], value = NULL)
	} else {
	temp <- t_split(indt[[splitCols[i]]], sep[i], fixed, stripWhite, type.convert)
	set(indt, j = sprintf("%s_%d", splitCols[i], seq_along(temp)), value = temp)
	if (isTRUE(drop)) set(indt, j = splitCols[i], value = NULL)
	}
	}

	if (direction == "wide") {
	if (isTRUE(makeEqual)) make_equal(indt, splitCols)[] else indt[]
	} else if (direction == "long") {
	indt[, ._ID1 := .I]
	out <- set(suppressWarnings(
	melt(make_equal(indt, splitCols), measure = patterns(splitCols),
	value.name = splitCols)), j = "variable", value = NULL)
	out[, ._ID2 := sequence(.N), ._ID1]
	setorderv(out, c("._ID1", "._ID2"), na.last = TRUE)
	out <- if (isTRUE(makeEqual)) out else out[!(long_fixer(out, splitCols) & ._ID2 != 1)]
	set(out, j = c("._ID1", "._ID2"), value = NULL)
	setcolorder(out, o_names)[]
	}
	}

	long_fixer <- function(indt, cols) {
	temp <- rowSums(is.na(indt[, ..cols])) + rowSums(indt[, ..cols] == "", na.rm = TRUE)
	temp == length(cols)
	}

	make_equal <- function(indt, splitCols) {
	check <- all_names(names(indt), stubs = splitCols, end_stub = FALSE)

	if (length(check[["miss"]]) > 0) {
	nat <- vapply(unname(check[["stubs"]]), function(x) {
	typeof(indt[[grep(x, names(indt), fixed = TRUE)[[1]]]])
	}, character(1L))

	for (i in seq_along(nat)) {
	COLS <- grep(names(nat[i]), check[["miss"]], value = TRUE, fixed = TRUE)
	if (length(COLS) > 0) indt[, (COLS) := NA_type(nat[i])]
	}
	}

	setcolorder(indt, check[["full_names"]])
	}

	NA_type <- function(string) {
	switch(string,
	double = NA_real_,
	integer = NA_integer_,
	complex = NA_complex_,
	character = NA_character_,
	NA)
	}

	all_names <- function(current_names, stubs, end_stub = FALSE) {
	stub_names <- grep(paste(stubs, collapse = "\|"), current_names, value = TRUE)
	id_names <- grep(paste(stubs, collapse = "\|"), current_names, value = TRUE, invert = TRUE)
	levs <- unique(gsub(paste(stubs, collapse = "\|"), "", stub_names))
	stub_levs <- gsub("^\\^\|\\$$", "",
	do.call(paste0, expand.grid(stubs, levs)[if (end_stub) 2:1 else 1:2]))
	full_names <- c(id_names, stub_levs[order(stub_levs)])
	levs <- gsub("^[[:punct:]]\|[[:punct:]]$", "", levs)
	levs <- levs[order(levs)]
	miss <- setdiff(full_names, current_names)
	list(stubs = stubs,
	stub_names = stub_names,
	id_names = id_names,
	levs = levs,
	stub_levs = stub_levs,
	full_names = full_names,
	miss = miss)
	}

	t_split <- function(vec, sep, fixed = TRUE, stripWhite = TRUE, type.convert = TRUE) {
	temp <- tstrsplit(vec, sep, fixed)
	if (!isTRUE(stripWhite) & isTRUE(type.convert)) {
	message("type.convert requires stripWhite = TRUE. Changing setting.")
	}
	if (isTRUE(stripWhite)) temp <- trim_list(temp, convert = type.convert)
	setDT(temp)[]
	}

	f_split <- function(vec, sep, fixed = TRUE, stripWhite = TRUE, type.convert = TRUE) {
	if ((sep == "") \| (nchar(sep) > 1L) \| (!isTRUE(fixed))) {
	temp <- t_split(vec, sep, fixed, stripWhite, type.convert)
	} else {
	VEC <- as.character(vec)
	ana <- if (anyNA(VEC)) is.na(VEC) else NULL
	anb <- !nzchar(VEC)

	if (is.null(ana) & !any(anb)) {
	VEC <- stri_flatten(VEC, collapse = "\n")
	} else {
	if (!is.null(ana)) VEC[which(ana)] <- sep
	if (any(anb)) VEC[which(anb)] <- sep
	VEC <- stri_flatten(VEC, collapse = "\n")
	}

	temp <- fread(VEC, sep = sep, fill = TRUE,
	blank.lines.skip = FALSE, header = FALSE,
	colClasses = if (!isTRUE(type.convert)) "character" else NULL,
	strip.white = stripWhite, logical01 = FALSE)[, lapply(
	.SD, function(x) replace(x, x == "", NA))]

	if (length(temp) == 1L) {
	temp <- t_split(vec, sep, fixed, stripWhite, type.convert)
	}
	}

	temp
	}

	## TESTS -------------------

	cSplit_fread(concat.test, c("Likes", "Hates", "Siblings"), c(",", ";", ","))
	cSplit_fread(concat.test, c("Likes", "Hates", "Siblings"), c(",", ";", ","))
	cSplit_fread(concat.test, c("Likes", "Hates", "Siblings")) ## misses the ";" separated data
	cSplit_fread(concat.test, c("Likes", "Hates", "Siblings"), c(",", ";", ","), makeEqual = TRUE)
	cSplit_fread(concat.test, c("Likes", "Hates", "Siblings"), c(",", ";", ","), direction = "long")
	cSplit_fread(concat.test, c("Likes", "Hates", "Siblings"), c(",", ";", ","), direction = "long", makeEqual = FALSE)

	DT <- as.data.table(concat.test)
	DT_1000 <- rbindlist(replicate(ceiling(1000/nrow(DT)), DT, FALSE))
	DT_100K <- rbindlist(replicate(ceiling(100000/nrow(DT_1000)), DT_1000, FALSE))
	DT_10M <- rbindlist(replicate(100, DT_100K, FALSE))