Skip to content

Instantly share code, notes, and snippets.

@vnijs
Last active August 29, 2015 14:19
Show Gist options
  • Save vnijs/ebd23de86bd971435eca to your computer and use it in GitHub Desktop.
Save vnijs/ebd23de86bd971435eca to your computer and use it in GitHub Desktop.
library(DBI)
library(RSQLite)
library(readr)
library(dplyr)
#=========================================================================
# Read a CSV file in chunks and stuff into a database.
#
# Note: This is proof-of-concept code only.
# It should work OK, but there are no guarantees.
# There's nothing in the way of error checking.
#
# The key advantage of this method (over direct import with DBI/Rsqlite/sqldf)
# is that is uses readr::read_csv to parse the input file.
# This means:
# - readr::read_csv() copes with commas in CSV files which appear within quotes
# Using sqldf/RSqlite to directly import CSVs doesn't do this AFAIK
# - Easy to swap readr::read_csv() with other readr methods (e.g. readr::read_fwf)
# - readr is fast.
# - readr gives nice progress bars
# - readr gives nice feedback on parsing success i.e. problems(df)
#
# Overall seems a bit faster than using Rsqlite to directly import
# - chunked conversion: 150s
# - RSqlite direct : 190s
#=========================================================================
csv2sqlite <- function(csv_file, sqlite_file, table_name, N=1000000) {
# Read first 1000 rows just to determine col_types and col_names
df <- read_csv(csv_file, n_max=1000) %>% as.data.frame
if (nrow(problems(df)) > 0) {
print(problems(df))
}
# Convert column classes to character string for read_csv()
# i.e. character, integer, integer => "cii"
col_names <- colnames(df)
type_trans <- c(character='c', numeric='d', integer='i', logical='l', Date='c')
col_types <- type_trans[sapply(df, FUN=class)]
col_types <- paste0(col_types, collapse="")
cat("col_types:", col_types, "\n")
# Connect to database. Change this line if you want to write to
# other database backends e.g. RPostgres
con <- dbConnect(RSQLite::SQLite(), dbname=sqlite_file)
# Read the data from the beginning of the CSV
# Remember to skip the header
iter <- 0
rowsread <- 0
while(TRUE) {
cat("Iteration:", iter, "\n")
# skip = iter*N + 1. the "+1" is because we can now skip the header
# The "%>% as.data.frame" is because dbWriteTable is a bit fussy on data structure
df <- read_csv(csv_file, col_names=col_names, col_types=col_types, skip=iter*N+1, n_max=N) %>% as.data.frame
if (nrow(problems(df)) > 0) {
print(problems(df))
}
if (nrow(df) == 0) {
cat("Out of rows...\n")
break
}
if ((nrow(df) == 1) & (sum(!(is.na(df))) == 0)) {
# this is a workaround for a bug in readr::read_csv when col_types is specified
# and skip > #rows in file. readr incorrectly returns a data.frame with a single row
# of all NA values.
cat("Out of rows (bugged)...\n")
break
}
rowsread <- rowsread + nrow(df)
cat("Rows read:", nrow(df), " total so far:", rowsread, "\n")
if (iter == 0) {
dbWriteTable(con, table_name, df, overwrite=TRUE)
} else {
dbWriteTable(con, table_name, df, append=TRUE)
}
iter <- iter + 1
}
cat("Total Read:", rowsread, "\n")
dbDisconnect(con)
}
#=============================================================================
# Testing csv2sqlite()
# - read the first NYC taxi data file. About 15,000,000 rows
#=============================================================================
if (FALSE) {
csv_files <- list.files(path="~/projectsdata/nyctaxi", pattern="*.csv$", full.names = TRUE)
csv_file <- csv_files[1]
sqlite_file <- "~/projectsdata/nyctaxi/nyctaxi-2.sqlite"
table_name <- "trip"
N <- 1000000
system.time({
csv2sqlite(csv_file, sqlite_file, table_name, N)
})
# user system elapsed
# 117.038 17.245 149.024
}
#=============================================================================
# Method 2: Read directly using DBI/RSqlite
# Note:
# - This method will fail if there are quoted commas in the data
# - Slightly slower than the chunked reader.
#=============================================================================
if (FALSE) {
csv_files <- list.files(path="~/projectsdata/nyctaxi", pattern="*.csv$", full.names = TRUE)
csv_file <- csv_files[1]
sqlite_file <- "~/projectsdata/nyctaxi/nyctaxi-2.sqlite"
table_name <- "trip"
con <- dbConnect(RSQLite::SQLite(), dbname = sqlite_file)
system.time({
dbWriteTable(con, name=table_name, value=csv_file, row.names=FALSE, header=TRUE, sep = ",")
})
# user system elapsed
# 167.895 9.712 192.26
dbDisconnect(con)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment