-
-
Save cbchisanga/f1da8216af0307d2a1ef to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
read.fwf2 <- function (file, widths, header = FALSE, sep = "\t", skip = 0, | |
row.names, col.names, n = -1, ...) | |
{ | |
doone <- function(x) { | |
x <- substring(x, first, last) | |
x[!nzchar(x)] <- NA_character_ | |
paste0(x, collapse = sep) | |
} | |
if (is.list(widths)) { | |
recordlength <- length(widths) | |
widths <- do.call("c", widths) | |
} else { | |
recordlength <- 1L | |
} | |
drop <- (widths < 0L) | |
widths <- abs(widths) | |
if (is.character(file)) { | |
file <- file(file, "rt") | |
on.exit(close(file), add = TRUE) | |
} else if (!isOpen(file)) { | |
open(file, "rt") | |
on.exit(close(file), add = TRUE) | |
} | |
if (skip) | |
readLines(file, n = skip) | |
if (header) { | |
headerline <- readLines(file, n = 1L) | |
text[1] <- headerline | |
} | |
raw <- readLines(file, n = n) | |
nread <- length(raw) | |
if (recordlength > 1L && nread%%recordlength) { | |
raw <- raw[1L:(nread - nread%%recordlength)] | |
warning(sprintf(ngettext(nread%%recordlength, "last record incomplete, %d line discarded", | |
"last record incomplete, %d lines discarded"), | |
nread%%recordlength), domain = NA) | |
} | |
if (recordlength > 1L) { | |
raw <- matrix(raw, nrow = recordlength) | |
raw <- apply(raw, 2L, paste, collapse = "") | |
} | |
st <- c(1L, 1L + cumsum(widths)) | |
first <- st[-length(st)][!drop] | |
last <- cumsum(widths)[!drop] | |
if(header) | |
text <- c(headerline, sapply(raw, doone)) | |
else | |
text <- sapply(raw, doone) | |
read.table(text = text, header = header, sep = sep, row.names = row.names, | |
col.names = col.names, quote = "", ...) | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library("microbenchmark") | |
set.seed(1) | |
ff <- tempfile() | |
dat <- sapply(1:1e4, function(x) paste0(sample(0:9), collapse = "")) | |
cat(file = ff, dat, sep = "\n") | |
identical(read.fwf(ff, widths = c(1,2,3,5)), read.fwf2(ff, widths = c(1,2,3,5))) | |
# [1] TRUE | |
library("readr") | |
identical(read.fwf(ff, widths = c(1,2,3,5)), read_fwf(ff, fwf_widths(c(1,2,3,5)))) | |
# [1] FALSE | |
library("LaF") | |
identical(read.fwf(ff, widths = c(1,2,3,5)), | |
laf_open_fwf(ff, column_types = rep("numeric", 4), column_widths = c(1,2,3,5))) | |
# [1] FALSE | |
utils <- function() read.fwf(ff, widths = c(1,2,3,5)) | |
leeper <- function() read.fwf2(ff, widths = c(1,2,3,5)) | |
hadley <- function() read_fwf(ff, fwf_widths(c(1,2,3,5))) | |
laf <- function() laf_open_fwf(ff, column_types = rep("numeric", 4), column_widths = c(1,2,3,5)) | |
microbenchmark(utils(), leeper(), hadley(), laf(), times = 25) | |
# Unit: milliseconds | |
# expr min lq mean median uq max neval cld | |
# utils() 650.815590 675.114110 690.610924 690.093809 693.658223 760.670054 25 c | |
# leeper() 395.106120 424.477414 431.464297 432.641197 437.710894 480.624730 25 b | |
# hadley() 8.159549 9.066336 11.753338 9.719871 15.496214 18.416717 25 a | |
# laf() 1.226242 1.551084 1.639269 1.628832 1.672708 2.783485 25 a | |
utils <- function() read.fwf(ff, colClasses = rep("numeric", 4), widths = c(1,2,3,5)) | |
leeper <- function() read.fwf2(ff, colClasses = rep("numeric", 4), widths = c(1,2,3,5)) | |
hadley <- function() read_fwf(ff, col_types = "iiii", fwf_widths(c(1,2,3,5))) | |
laf <- function() laf_open_fwf(ff, column_types = rep("numeric", 4), column_widths = c(1,2,3,5)) | |
microbenchmark(utils(), leeper(), hadley(), laf(), times = 25) | |
# Unit: milliseconds | |
# expr min lq mean median uq max neval cld | |
# utils() 648.555934 660.188679 684.65795 668.364392 686.839611 766.526839 25 c | |
# leeper() 397.034390 411.351318 426.30090 419.492392 430.818385 534.639401 25 b | |
# hadley() 928.281034 977.235245 1012.91864 1010.140934 1040.972865 1099.289389 25 d | |
# laf() 1.270504 1.587264 1.59236 1.645381 1.686949 1.795871 25 a |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment