Skip to content

Instantly share code, notes, and snippets.

@ronalstal
Created January 7, 2015 10:16
Show Gist options
  • Save ronalstal/3986e36462aeb15196f3 to your computer and use it in GitHub Desktop.
Save ronalstal/3986e36462aeb15196f3 to your computer and use it in GitHub Desktop.
compare different methods to read and sub-set a big file
# Course Data Scientist - Exploratory Data Analysis - Course Project 1
# Compare three ways (data.table::fread, read.csv, sqldf::read.csv.sql) of
# reading the file "household_power_consumption.txt" and keeping only the
# rows whose Date is 1/2/2007 or 2/2/2007, timing each one with system.time().
library(data.table)
library(sqldf)
# Path of the input file, resolved relative to the current working directory.
dataFile <- "household_power_consumption.txt"
# Switch message translation to the "C" locale so that the output of
# system.time() is printed in English.
Sys.setlocale("LC_MESSAGES", "C")
# Method 1: data.table::fread() fed by a shell "grep" pre-filter, so that only
# lines starting with 1/2/2007 or 2/2/2007 are handed to the parser.
# The elapsed time of the whole step is stored in `tdt`.
tdt <- system.time({
  # shQuote() protects the regex and the file name from the shell. Unquoted,
  # "^" is an escape character for cmd.exe (the line-start anchor would be
  # lost, letting e.g. 11/2/2007 match), "[12]" is a glob pattern for POSIX
  # shells, and a file name containing spaces would be split into two words.
  # NOTE(review): newer data.table releases recommend passing shell commands
  # via `cmd =`; the positional form is kept for older versions - confirm.
  DT <- fread(
    paste("grep", shQuote("^[12]/2/2007"), shQuote(dataFile)),
    na.strings = c("?", "")
  )
  # grep drops the header line, so take the column names from a zero-row
  # read of the original file.
  setnames(DT, colnames(fread(dataFile, nrows = 0)))
})
print("data.table")
print(tdt)
# Method 2: base read.csv() on the complete file, followed by an in-memory
# row filter on the Date column. The elapsed time is stored in `tdf`.
tdf <- system.time({
  DF <- read.csv(
    file = dataFile,
    sep = ";",
    na.strings = c("?", ""),
    stringsAsFactors = FALSE
  )
  # Keep only the rows for the two requested dates.
  DF <- DF[is.element(DF$Date, c("1/2/2007", "2/2/2007")), ]
})
print("read.csv")
print(tdf)
# Method 3: sqldf::read.csv.sql(), with the date filter expressed as an SQL
# WHERE clause on the file. The elapsed time is stored in `tds`.
# NOTE(review): unlike the two readers above, no na.strings handling is
# requested here, so "?" markers are presumably kept as text - confirm this
# is acceptable for the selected dates.
tds <- system.time({
  DS <- read.csv.sql(
    file = dataFile, sep = ";", header = TRUE,
    sql = "select * from file where Date in ('1/2/2007','2/2/2007')",
    # No trailing comma after the last argument: it would forward an empty
    # argument through `...`.
    stringsAsFactors = FALSE
  )
})
print("read.csv.sql")
print(tds)
# Print a column summary of each result (fread, read.csv, read.csv.sql, in
# that order) so the three subsets can be compared by eye.
invisible(lapply(
  list(DT, DF, DS),
  function(tbl) print(summary(tbl))
))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment