Last active
August 29, 2015 14:19
-
-
Save vnijs/ebd23de86bd971435eca to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(DBI) | |
library(RSQLite) | |
library(readr) | |
library(dplyr) | |
#========================================================================= | |
# Read a CSV file in chunks and stuff into a database. | |
# | |
# Note: This is proof-of-concept code only. | |
# It should work OK, but there are no guarantees. | |
# There's nothing in the way of error checking. | |
# | |
# The key advantage of this method (over direct import with DBI/Rsqlite/sqldf) | |
# is that is uses readr::read_csv to parse the input file. | |
# This means: | |
# - readr::read_csv() copes with commas in CSV files which appear within quotes | |
# Using sqldf/RSqlite to directly import CSVs doesn't do this AFAIK | |
# - Easy to swap readr::read_csv() with other readr methods (e.g. readr::read_fwf) | |
# - readr is fast. | |
# - readr gives nice progress bars | |
# - readr gives nice feedback on parsing success i.e. problems(df) | |
# | |
# Overall seems a bit faster than using Rsqlite to directly import | |
# - chunked conversion: 150s | |
# - RSqlite direct : 190s | |
#========================================================================= | |
csv2sqlite <- function(csv_file, sqlite_file, table_name, N=1000000) { | |
# Read first 1000 rows just to determine col_types and col_names | |
df <- read_csv(csv_file, n_max=1000) %>% as.data.frame | |
if (nrow(problems(df)) > 0) { | |
print(problems(df)) | |
} | |
# Convert column classes to character string for read_csv() | |
# i.e. character, integer, integer => "cii" | |
col_names <- colnames(df) | |
type_trans <- c(character='c', numeric='d', integer='i', logical='l', Date='c') | |
col_types <- type_trans[sapply(df, FUN=class)] | |
col_types <- paste0(col_types, collapse="") | |
cat("col_types:", col_types, "\n") | |
# Connect to database. Change this line if you want to write to | |
# other database backends e.g. RPostgres | |
con <- dbConnect(RSQLite::SQLite(), dbname=sqlite_file) | |
# Read the data from the beginning of the CSV | |
# Remember to skip the header | |
iter <- 0 | |
rowsread <- 0 | |
while(TRUE) { | |
cat("Iteration:", iter, "\n") | |
# skip = iter*N + 1. the "+1" is because we can now skip the header | |
# The "%>% as.data.frame" is because dbWriteTable is a bit fussy on data structure | |
df <- read_csv(csv_file, col_names=col_names, col_types=col_types, skip=iter*N+1, n_max=N) %>% as.data.frame | |
if (nrow(problems(df)) > 0) { | |
print(problems(df)) | |
} | |
if (nrow(df) == 0) { | |
cat("Out of rows...\n") | |
break | |
} | |
if ((nrow(df) == 1) & (sum(!(is.na(df))) == 0)) { | |
# this is a workaround for a bug in readr::read_csv when col_types is specified | |
# and skip > #rows in file. readr incorrectly returns a data.frame with a single row | |
# of all NA values. | |
cat("Out of rows (bugged)...\n") | |
break | |
} | |
rowsread <- rowsread + nrow(df) | |
cat("Rows read:", nrow(df), " total so far:", rowsread, "\n") | |
if (iter == 0) { | |
dbWriteTable(con, table_name, df, overwrite=TRUE) | |
} else { | |
dbWriteTable(con, table_name, df, append=TRUE) | |
} | |
iter <- iter + 1 | |
} | |
cat("Total Read:", rowsread, "\n") | |
dbDisconnect(con) | |
} | |
#============================================================================= | |
# Testing csv2sqlite() | |
# - read the first NYC taxi data file. About 15,000,000 rows | |
#============================================================================= | |
if (FALSE) { | |
csv_files <- list.files(path="~/projectsdata/nyctaxi", pattern="*.csv$", full.names = TRUE) | |
csv_file <- csv_files[1] | |
sqlite_file <- "~/projectsdata/nyctaxi/nyctaxi-2.sqlite" | |
table_name <- "trip" | |
N <- 1000000 | |
system.time({ | |
csv2sqlite(csv_file, sqlite_file, table_name, N) | |
}) | |
# user system elapsed | |
# 117.038 17.245 149.024 | |
} | |
#============================================================================= | |
# Method 2: Read directly using DBI/RSqlite | |
# Note: | |
# - This method will fail if there are quoted commas in the data | |
# - Slightly slower than the chunked reader. | |
#============================================================================= | |
if (FALSE) { | |
csv_files <- list.files(path="~/projectsdata/nyctaxi", pattern="*.csv$", full.names = TRUE) | |
csv_file <- csv_files[1] | |
sqlite_file <- "~/projectsdata/nyctaxi/nyctaxi-2.sqlite" | |
table_name <- "trip" | |
con <- dbConnect(RSQLite::SQLite(), dbname = sqlite_file) | |
system.time({ | |
dbWriteTable(con, name=table_name, value=csv_file, row.names=FALSE, header=TRUE, sep = ",") | |
}) | |
# user system elapsed | |
# 167.895 9.712 192.26 | |
dbDisconnect(con) | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment