vnijs · August 29, 2015 14:19
diff --git a/csv2sqlite.R b/csv2sqlite.R
 library(DBI)
 library(RSQLite)
 library(readr)
 library(dplyr)

 #=========================================================================
 # Read a CSV file in chunks and stuff into a database.
 #
 # Note: This is proof-of-concept code only.
 #       It should work OK, but there are no guarantees.
 #       There's nothing in the way of error checking.
 #
 # The key advantage of this method (over direct import with DBI/Rsqlite/sqldf)
 # is that is uses readr::read_csv to parse the input file.
 # This means:
 #   -  readr::read_csv() copes with commas in CSV files which appear within quotes
 #      Using sqldf/RSqlite to directly import CSVs doesn't do this AFAIK
 #   -  Easy to swap readr::read_csv() with other readr methods (e.g. readr::read_fwf)
 #   -  readr is fast.
 #   -  readr gives nice progress bars
 #   -  readr gives nice feedback on parsing success i.e. problems(df)
 #
 # Overall seems a bit faster than using Rsqlite to directly import
 #   - chunked conversion:  150s
 #   - RSqlite direct    :  190s
 #=========================================================================
 csv2sqlite <- function(csv_file, sqlite_file, table_name, N=1000000) {
    # Read first 1000 rows just to determine col_types and col_names
    df <- read_csv(csv_file, n_max=1000) %>% as.data.frame
    if (nrow(problems(df)) > 0) {
        print(problems(df))
    }
    # Convert column classes to character string for read_csv()
    # i.e. character, integer, integer => "cii"
    col_names <- colnames(df)
    type_trans <- c(character='c', numeric='d', integer='i', logical='l', Date='c')
    col_types <- type_trans[sapply(df, FUN=class)]
    col_types <- paste0(col_types, collapse="")
    cat("col_types:", col_types, "\n")

    # Connect to database. Change this line if you want to write to
    # other database backends e.g. RPostgres
    con <- dbConnect(RSQLite::SQLite(), dbname=sqlite_file)

    # Read the data from the beginning of the CSV
    # Remember to skip the header
    iter <- 0
    rowsread <- 0
    while(TRUE) {
        cat("Iteration:", iter, "\n")
        # skip = iter*N + 1.  the "+1" is because we can now skip the header
        # The "%>% as.data.frame" is because dbWriteTable is a bit fussy on data structure
        df <- read_csv(csv_file, col_names=col_names, col_types=col_types, skip=iter*N+1, n_max=N) %>% as.data.frame
        if (nrow(problems(df)) > 0) {
            print(problems(df))
        }
        if (nrow(df) == 0) {
            cat("Out of rows...\n")
            break
        }
        if ((nrow(df) == 1) & (sum(!(is.na(df))) == 0)) {
            # this is a workaround for a bug in readr::read_csv when col_types is specified
            # and skip > #rows in file.  readr incorrectly returns a data.frame with a single row
            # of all NA values.
            cat("Out of rows (bugged)...\n")
            break
        }
        rowsread <- rowsread + nrow(df)
        cat("Rows read:", nrow(df), "  total so far:", rowsread, "\n")
        if (iter == 0) {
            dbWriteTable(con, table_name, df, overwrite=TRUE)
        } else {
            dbWriteTable(con, table_name, df, append=TRUE)
        }
        iter <- iter + 1
    }
    cat("Total Read:", rowsread, "\n")

    dbDisconnect(con)
 }

 #=============================================================================
 # Testing csv2sqlite()
 #   - read the first NYC taxi data file. About 15,000,000 rows
 #=============================================================================
 if (FALSE) {
    csv_files   <- list.files(path="~/projectsdata/nyctaxi", pattern="*.csv$", full.names = TRUE)
    csv_file    <- csv_files[1]
    sqlite_file <- "~/projectsdata/nyctaxi/nyctaxi-2.sqlite"
    table_name  <- "trip"
    N           <- 1000000

    system.time({
        csv2sqlite(csv_file, sqlite_file, table_name, N)
    })
    #     user  system elapsed
    #  117.038  17.245 149.024
 }


 #=============================================================================
 # Method 2:  Read directly using DBI/RSqlite
 # Note:
 #   - This method will fail if there are quoted commas in the data
 #   - Slightly slower than the chunked reader.
 #=============================================================================
 if (FALSE) {
    csv_files   <- list.files(path="~/projectsdata/nyctaxi", pattern="*.csv$", full.names = TRUE)
    csv_file    <- csv_files[1]
    sqlite_file <- "~/projectsdata/nyctaxi/nyctaxi-2.sqlite"
    table_name  <- "trip"

    con <- dbConnect(RSQLite::SQLite(), dbname = sqlite_file)
    system.time({
        dbWriteTable(con, name=table_name, value=csv_file, row.names=FALSE, header=TRUE, sep = ",")
    })

    #      user  system elapsed
    #   167.895   9.712 192.26

    dbDisconnect(con)
 }
	library(DBI)
	library(RSQLite)
	library(readr)
	library(dplyr)

	#=========================================================================
	# Read a CSV file in chunks and stuff into a database.
	#
	# Note: This is proof-of-concept code only.
	# It should work OK, but there are no guarantees.
	# There's nothing in the way of error checking.
	#
	# The key advantage of this method (over direct import with DBI/Rsqlite/sqldf)
	# is that is uses readr::read_csv to parse the input file.
	# This means:
	# - readr::read_csv() copes with commas in CSV files which appear within quotes
	# Using sqldf/RSqlite to directly import CSVs doesn't do this AFAIK
	# - Easy to swap readr::read_csv() with other readr methods (e.g. readr::read_fwf)
	# - readr is fast.
	# - readr gives nice progress bars
	# - readr gives nice feedback on parsing success i.e. problems(df)
	#
	# Overall seems a bit faster than using Rsqlite to directly import
	# - chunked conversion: 150s
	# - RSqlite direct : 190s
	#=========================================================================
	csv2sqlite <- function(csv_file, sqlite_file, table_name, N=1000000) {
	# Read first 1000 rows just to determine col_types and col_names
	df <- read_csv(csv_file, n_max=1000) %>% as.data.frame
	if (nrow(problems(df)) > 0) {
	print(problems(df))
	}
	# Convert column classes to character string for read_csv()
	# i.e. character, integer, integer => "cii"
	col_names <- colnames(df)
	type_trans <- c(character='c', numeric='d', integer='i', logical='l', Date='c')
	col_types <- type_trans[sapply(df, FUN=class)]
	col_types <- paste0(col_types, collapse="")
	cat("col_types:", col_types, "\n")

	# Connect to database. Change this line if you want to write to
	# other database backends e.g. RPostgres
	con <- dbConnect(RSQLite::SQLite(), dbname=sqlite_file)

	# Read the data from the beginning of the CSV
	# Remember to skip the header
	iter <- 0
	rowsread <- 0
	while(TRUE) {
	cat("Iteration:", iter, "\n")
	# skip = iter*N + 1. the "+1" is because we can now skip the header
	# The "%>% as.data.frame" is because dbWriteTable is a bit fussy on data structure
	df <- read_csv(csv_file, col_names=col_names, col_types=col_types, skip=iter*N+1, n_max=N) %>% as.data.frame
	if (nrow(problems(df)) > 0) {
	print(problems(df))
	}
	if (nrow(df) == 0) {
	cat("Out of rows...\n")
	break
	}
	if ((nrow(df) == 1) & (sum(!(is.na(df))) == 0)) {
	# this is a workaround for a bug in readr::read_csv when col_types is specified
	# and skip > #rows in file. readr incorrectly returns a data.frame with a single row
	# of all NA values.
	cat("Out of rows (bugged)...\n")
	break
	}
	rowsread <- rowsread + nrow(df)
	cat("Rows read:", nrow(df), " total so far:", rowsread, "\n")
	if (iter == 0) {
	dbWriteTable(con, table_name, df, overwrite=TRUE)
	} else {
	dbWriteTable(con, table_name, df, append=TRUE)
	}
	iter <- iter + 1
	}
	cat("Total Read:", rowsread, "\n")

	dbDisconnect(con)
	}

	#=============================================================================
	# Testing csv2sqlite()
	# - read the first NYC taxi data file. About 15,000,000 rows
	#=============================================================================
	if (FALSE) {
	csv_files <- list.files(path="~/projectsdata/nyctaxi", pattern="*.csv$", full.names = TRUE)
	csv_file <- csv_files[1]
	sqlite_file <- "~/projectsdata/nyctaxi/nyctaxi-2.sqlite"
	table_name <- "trip"
	N <- 1000000

	system.time({
	csv2sqlite(csv_file, sqlite_file, table_name, N)
	})
	# user system elapsed
	# 117.038 17.245 149.024
	}


	#=============================================================================
	# Method 2: Read directly using DBI/RSqlite
	# Note:
	# - This method will fail if there are quoted commas in the data
	# - Slightly slower than the chunked reader.
	#=============================================================================
	if (FALSE) {
	csv_files <- list.files(path="~/projectsdata/nyctaxi", pattern="*.csv$", full.names = TRUE)
	csv_file <- csv_files[1]
	sqlite_file <- "~/projectsdata/nyctaxi/nyctaxi-2.sqlite"
	table_name <- "trip"

	con <- dbConnect(RSQLite::SQLite(), dbname = sqlite_file)
	system.time({
	dbWriteTable(con, name=table_name, value=csv_file, row.names=FALSE, header=TRUE, sep = ",")
	})

	# user system elapsed
	# 167.895 9.712 192.26

	dbDisconnect(con)
	}