peterhurford · November 5, 2024 08:13 · randy3k · Jun 5, 2020 · peterhurford · Jun 5, 2020
diff --git a/num_rows_csv.R b/num_rows_csv.R
 # What's the fastest way to determine the number of rows of a CSV in R?
 # ...Reading the entire CSV to only get the dimensions is likely too slow. Is there a faster way?
 # Benchmarks done on a EC2 r3.8xlarge
 # Cowritten with Abel Castillo <github.com/abelcastilloavant>

 m <- 1000000
 d <- data.frame(id = seq(m), a = rnorm(m), b = runif(m))
 dim(d)
 # [1] 1000000       3
 pryr::object_size(d)
 # 20 MB

 readr::write_csv(d, "tmp.csv")

 microbenchmark::microbenchmark(
 {lines <- 0; f <- file("tmp.csv", "r"); while (TRUE) {
    line <- readLines(f, n = 1)
    if (length(line) == 0) { break }; lines <- lines + 1}
    print(lines - 1) },                                                                # 2784.9ms
 { sqldf::read.csv.sql("tmp.csv", "select count(*) from file")[[1]] },                 # 2103.0ms
 { length(readLines("tmp.csv")) - 1 },                                                 # 1750.3ms
 { length(count.fields("tmp.csv")) - 1 },                                              # 1519.1ms
 { R.utils::countLines("tmp.csv")[[1]] - 1 },                                          # 1071.3ms
 { dim(data.table::fread("tmp.csv"))[[1]] },                                           # 493.4ms
 { NROW(data.table::fread("tmp.csv")) },                                               # 472.7ms
 { dim(readr::read_csv("tmp.csv"))[[1]] },                                             # 414.4ms
 { NROW(readr::read_csv("tmp.csv")) },                                                 # 391.7ms
 { length(readr::count_fields("tmp.csv", tokenizer = readr::tokenizer_csv())) - 1 },   # 254.8ms
 { as.integer(strsplit(system("wc -l tmp.csv", intern = TRUE), " ")[[1]][[1]]) - 1 },  # 24.9ms
 { as.numeric(system("cat tmp.csv | wc -l", intern = TRUE)) - 1 },                     # 17.9ms
 times = 4)
	# What's the fastest way to determine the number of rows of a CSV in R?
	# ...Reading the entire CSV to only get the dimensions is likely too slow. Is there a faster way?
	# Benchmarks done on a EC2 r3.8xlarge
	# Cowritten with Abel Castillo <github.com/abelcastilloavant>

	m <- 1000000
	d <- data.frame(id = seq(m), a = rnorm(m), b = runif(m))
	dim(d)
	# [1] 1000000 3
	pryr::object_size(d)
	# 20 MB

	readr::write_csv(d, "tmp.csv")

	microbenchmark::microbenchmark(
	{lines <- 0; f <- file("tmp.csv", "r"); while (TRUE) {
	line <- readLines(f, n = 1)
	if (length(line) == 0) { break }; lines <- lines + 1}
	print(lines - 1) }, # 2784.9ms
	{ sqldf::read.csv.sql("tmp.csv", "select count(*) from file")[[1]] }, # 2103.0ms
	{ length(readLines("tmp.csv")) - 1 }, # 1750.3ms
	{ length(count.fields("tmp.csv")) - 1 }, # 1519.1ms
	{ R.utils::countLines("tmp.csv")[[1]] - 1 }, # 1071.3ms
	{ dim(data.table::fread("tmp.csv"))[[1]] }, # 493.4ms
	{ NROW(data.table::fread("tmp.csv")) }, # 472.7ms
	{ dim(readr::read_csv("tmp.csv"))[[1]] }, # 414.4ms
	{ NROW(readr::read_csv("tmp.csv")) }, # 391.7ms
	{ length(readr::count_fields("tmp.csv", tokenizer = readr::tokenizer_csv())) - 1 }, # 254.8ms
	{ as.integer(strsplit(system("wc -l tmp.csv", intern = TRUE), " ")[[1]][[1]]) - 1 }, # 24.9ms
	{ as.numeric(system("cat tmp.csv \| wc -l", intern = TRUE)) - 1 }, # 17.9ms
	times = 4)