han-tun · September 23, 2020 23:59
diff --git a/2020-09-22_LA-RUG_Uncomfortable-data.R b/2020-09-22_LA-RUG_Uncomfortable-data.R
 #' Uncomfortable data
 #' LA R Users Group
 #' 2020-09-22
 #' Edward Visel

 #' Setup
 library(tidyverse)


 # Location of the CSV copy of `nycflights13::flights` that Demo 1 reads back.
 path <- '/tmp/nycflights13/flights.csv'
 # `recursive = TRUE` builds any missing parent directories and
 # `showWarnings = FALSE` keeps a re-run quiet when the directory already exists.
 dir.create(dirname(path), recursive = TRUE, showWarnings = FALSE)
 # Write the full flights table to disk as CSV.
 data.table::fwrite(nycflights13::flights, file = path)


 #####################################
 ### Demo 1: Subsetting strategies ###
 #####################################

 #' 1. Read then subset
 # Parse every column of the CSV, then keep the seven of interest.
 flights <- read_csv(path)
 small_1 <- select(flights, carrier, origin, dest, year, month, day, dep_time)

 #' 2. Subset on read
 # data.table: `select` names the columns to keep while reading.
 small_2d <- data.table::fread(
     path,
     select = c('carrier', 'origin', 'dest', 'year', 'month', 'day', 'dep_time')
 )
 # readr: `cols_only()` keeps only the listed columns; '?' is the compact
 # abbreviation for `col_guess()`.
 small_2t <- read_csv(
     path,
     col_types = cols_only(
         carrier = '?',
         origin = '?',
         dest = '?',
         year = '?',
         month = '?',
         day = '?',
         dep_time = '?'
     )
 )

 #' 3. Batch, subset, and aggregate

 ## Unknown file size
 # Read the CSV in fixed-size batches with base R, keep only the needed columns
 # of each batch, and stop once a batch comes back shorter than `batch_size`.
 # setup
 batch_size <- 10000L
 n <- batch_size    # rows returned by the latest read; primes the loop
 offset <- 0L       # number of leading lines to skip on the next read
 col_names <- names(read.csv(path, nrows = 1))
 df_list <- list()

 # iterate: a full batch means more rows may follow. Compare against
 # `batch_size` (not a repeated literal) so the two cannot drift apart.
 while (n == batch_size) {
     # read
     message('Reading rows from ', offset, ' up to ', offset + batch_size)
     # `read.csv()` defaults to `header = TRUE`, so the first unskipped line is
     # consumed as a header and then renamed by `col.names`. On the first pass
     # that is the real header; on later passes it is the final row of the
     # previous batch, so no row is lost or duplicated.
     df <- read.csv(path, skip = offset, nrows = batch_size, col.names = col_names, stringsAsFactors = FALSE)

     # subset and append
     df_list[[length(df_list) + 1L]] <- df[c('carrier', 'origin', 'dest', 'year', 'month', 'day', 'dep_time')]

     # increment
     n <- nrow(df)
     offset <- offset + batch_size
 }

 # aggregate: stack the per-batch subsets into one data frame
 small_3b <- do.call(rbind, df_list)


 ## Precalculating file size
 # Count the file's lines up front, derive every batch offset from that count,
 # then read and subset each batch with readr and row-bind the results.
 batch_size <- 10000L
 col_names <- names(read_csv(path, n_max = 0))
 # `wc -l` counts lines including the header. The path is shell-quoted so that
 # spaces or shell metacharacters in it cannot break the command.
 n_lines <- scan(
     text = system(paste('wc -l', shQuote(path)), intern = TRUE),
     what = integer(1),
     nmax = 1
 )
 # Each offset is the number of lines to skip; starting at 1 skips the header.
 offsets <- seq(1L, n_lines, by = batch_size)

 small_3t <- map_dfr(offsets, function(offset){
     # Report the inclusive data-row range of this batch; `n_lines - 1L` is the
     # number of data rows because `n_lines` includes the header line.
     message('Reading rows ', offset, ' to ', min(offset + batch_size - 1L, n_lines - 1L))

     # `col_names` is supplied explicitly, so no line is treated as a header.
     suppressMessages(
         read_csv(path, skip = offset, col_names = col_names, n_max = batch_size)
     ) %>%
         select(carrier, origin, dest, year, month, day, dep_time)
 })


 ############################
 ### Demo 2: Storing data ###
 ############################

 # Root directory of the partitioned Parquet copy of the flights data.
 flights_dir <- '/tmp/flights'
 data('flights', package = 'nycflights13')

 # Write one Parquet file per carrier/year/month group, laid out as
 # <flights_dir>/<carrier>/<year>/<MM>.parquet.
 flights %>%
     group_by(carrier, year, month) %>%
     nest() %>%
     mutate(
         # `%02d` zero-pads the integer month on every platform; the `0` flag
         # with `%s` pads with zeros on some platforms and spaces on others.
         filepath = file.path(flights_dir, carrier, year, sprintf('%02d.parquet', month))
     ) %>%
     pwalk(function(...) {
         # One call per nested row; `dots` holds that row's columns by name.
         dots <- list(...)
         if (!dir.exists(dirname(dots$filepath))) {
             dir.create(dirname(dots$filepath), recursive = TRUE)
         }

         arrow::write_parquet(
             dots$data, dots$filepath,
             version = '1.0', chunk_size = 10000L,
             compression = 'snappy'
         )
     })

 # Inspect what was written.
 list.dirs(flights_dir)

 list.files(file.path(flights_dir, "AA", "2013"), full.names = TRUE)

 # Reuse `flights_dir` (shell-quoted) so this stays in sync with the path above.
 system(paste('tree', shQuote(flights_dir)))


 ##############################
 ### Demo 3: Arrow Datasets ###
 ##############################

 #' Single-file CSV Arrow Dataset

 # Show the directory contents, then open the directory as a CSV dataset.
 list.files('/tmp/nycflights13/')
 flights_ds_c <- arrow::open_dataset('/tmp/nycflights13/', format = 'csv')

 # Print the dataset object and a few of its fields.
 flights_ds_c
 flights_ds_c$files
 flights_ds_c$format
 flights_ds_c$num_cols
 flights_ds_c$num_rows

 # Column selection on its own, without `collect()`.
 select(flights_ds_c, carrier, origin, dest, year, month, day, dep_time)

 # The same selection followed by `collect()`.
 collect(
     select(flights_ds_c, carrier, origin, dest, year, month, day, dep_time)
 )

 # Select three columns, keep rows whose destination is LAX, then collect.
 collect(
     filter(
         select(flights_ds_c, carrier, origin, dest),
         dest == 'LAX'
     )
 )


 #' Multi-file, multi-directory Parquet Arrow Dataset

 # Open the directory tree as one dataset; the two directory levels are
 # exposed as the columns `carrier` and `year`.
 flights_ds_p <- arrow::open_dataset(
     flights_dir,
     partitioning = c('carrier', 'year')
 )

 # Print the dataset object and a few of its fields.
 flights_ds_p
 flights_ds_p$files
 flights_ds_p$format
 flights_ds_p$num_cols
 flights_ds_p$num_rows

 # Select by name and by tidyselect helper, then collect.
 collect(
     select(flights_ds_p, carrier, origin, dest, matches('time$'))
 )

 # Filter on a partition column and on data columns, then collect.
 collect(
     filter(flights_ds_p, carrier == 'UA', distance > 1000, !is.na(arr_delay))
 )


 #' Limitations
 # The three pipelines below are kept exactly as written: per their inline
 # comments they demonstrate operations that are not supported on an Arrow
 # Dataset, so they are presumably expected to fail when run.

 # 1. Grouping is accepted, but filtering on an aggregate such as `n()` is not.
 flights_ds_p %>%
    group_by(carrier) %>%    # works
    filter(n() > 10000)    # most expressions beyond inequality like `n()` are not supported

 # 2. Deriving a column from existing ones.
 flights_ds_p %>%
    # mutation (in `mutate`, `transmute`, or `group_by`) not supported
    mutate(arr_time = hms::hms(hours = arr_time %/% 100L, minutes = arr_time %% 100L))

 # 3. Grouped aggregation.
 flights_ds_p %>%
    group_by(carrier) %>%
    summarise(mean_arr_delay = mean(arr_delay))    # `summarise()` not supported


 #' The Apache Drill part of Demo 4 is in the accompanying shell script


 ########################
 ### Demo 4: sergeant ###
 ########################

 library(sergeant)

 #' DBI interface
 # Connect through the Drill DBI driver and run a raw SQL query.
 dbi_con <- dbConnect(Drill())
 DBI::dbGetQuery(dbi_con, 'SELECT * FROM dfs.tmp.`flights` LIMIT 3')

 #' REST interface
 # The same query sent through sergeant's REST connection.
 rest_con <- drill_connection()
 drill_query(rest_con, 'SELECT * FROM dfs.tmp.`flights` LIMIT 3')

 #' dbplyr interface
 # Build a table reference from the DBI connection ...
 tbl(dbi_con, 'dfs.tmp.`flights`')
 #' or
 # ... or from a Drill source object.
 drill <- src_drill()
 tbl(drill, 'dfs.tmp.`flights`')


 #' SQL translation
 ?drill_custom_functions
 sql_translate_env(dbi_con)

 # Derive carrier, year and month from Drill's `dir0`, `dir1` and `filename`
 # columns.
 flights_drill <- mutate(
     tbl(drill, 'dfs.tmp.`flights`'),
     carrier = dir0,
     year = dir1,
     # as.integer is translated; LEFT is passed through to Drill
     month = as.integer(LEFT(filename, 2L))
 )

 # Print the table reference and the SQL generated for it.
 flights_drill
 show_query(flights_drill)

 # Row counts: overall, per carrier, and per carrier collected into R.
 count(flights_drill)
 count(flights_drill, carrier = dir0, sort = TRUE)
 collect(count(flights_drill, carrier = dir0, sort = TRUE))

 # Arrival-delay summary per carrier and route for five destination airports,
 # ordered by mean delay and collected into R.
 flights_drill %>%
     filter(dest %in% c('LAX', 'ONT', 'SNA', 'BUR', 'LGB')) %>%
     group_by(carrier, origin, dest) %>%
     summarise(
         mean_arr_delay = mean(arr_delay),
         sd_arr_delay = sd(arr_delay),
         max_arr_delay = max(arr_delay),
         n = n()
     ) %>%
     arrange(mean_arr_delay) %>%
     collect()
diff --git a/2020-09-22_LA-RUG_Uncomfortable-data.sh b/2020-09-22_LA-RUG_Uncomfortable-data.sh
 #!/bin/sh
 # NOTE(review): only the commands down to `drill-embedded` are shell commands.
 # The `--`-commented statements after it appear to be meant for the Drill
 # shell that `drill-embedded` starts -- confirm before running this file
 # with sh.

 ############################
 ### Demo 4: Apache Drill ###
 ############################

 # Installation
 # Show Homebrew's information about the apache-drill formula.
 brew info apache-drill
 # The install command is left commented out so nothing is installed by accident.
 # brew install apache-drill


 # Start Drill
 drill-embedded

 -- List the commands the Drill shell understands
 !help

 -- List the available workspaces
 SHOW SCHEMAS;

 -- Sample the built-in dataset
 SELECT *
 FROM cp.`employee.json`
 LIMIT 3;

 -- Sample the real CSV we wrote
 SELECT *
 FROM dfs.tmp.`nycflights13/flights.csv`
 LIMIT 1;

 -- Subset columns and rows
 SELECT
     carrier,
     flight,
     tailnum
 FROM dfs.tmp.`nycflights13/flights.csv`
 WHERE distance > 1000
 LIMIT 10;

 -- Grouped operations, aggregation, and sorting
 SELECT
     carrier,
     COUNT(*) AS n
 FROM dfs.tmp.`nycflights13/flights.csv`
 GROUP BY carrier
 ORDER BY n DESC;

 -- The same count restricted to short flights
 SELECT
     carrier,
     COUNT(*) AS n
 FROM dfs.tmp.`nycflights13/flights.csv`
 WHERE distance < 100
 GROUP BY carrier
 ORDER BY n DESC;

 -- Navigate the directory tree
 SHOW FILES
 FROM dfs.tmp.`flights`;

 SHOW FILES
 FROM dfs.tmp.`flights/AA/2013/`;

 -- Query a whole directory of files with globbing
 SELECT *
 FROM dfs.tmp.`flights/AA/*/*.parquet`
 LIMIT 3;

 SELECT *
 FROM dfs.tmp.`flights/*`
 LIMIT 3;

 -- Implicit columns
 -- (a path that refers to a directory takes in everything inside it)
 SELECT DISTINCT
     FQN,
     FILEPATH,
     FILENAME,
     SUFFIX
 FROM dfs.tmp.`flights`
 LIMIT 10;

 -- Calculate useful summary stats and aggregations
 SELECT
     -- carrier comes from the first subdirectory level
     dir0 AS carrier,
     -- any ANSI SQL function can be used
     ROUND(AVG(dep_delay), 2) AS mean_dep_delay,
     ROUND(AVG(arr_delay), 2) AS mean_arr_delay,
     COUNT(*) AS n
 FROM dfs.tmp.`flights`
 GROUP BY carrier
 ORDER BY n DESC;

 SELECT
     dir1 AS year,
     -- month comes from the first two characters of the file name
     LEFT(FILENAME, 2) AS month,
     ROUND(AVG(arr_delay), 2) AS mean_arr_delay,
     COUNT(*) AS n
 FROM dfs.tmp.`flights/*`
 WHERE dir0 = 'UA'
 GROUP BY year, month
 ORDER BY year, month;

 -- Quit the Drill shell (!exit also works)
 !quit
	#' Uncomfortable data
	#' LA R Users Group
	#' 2020-09-22
	#' Edward Visel

	#' Setup
	library(tidyverse)


	# Location of the CSV copy of `nycflights13::flights` that Demo 1 reads back.
	path <- '/tmp/nycflights13/flights.csv'
	# `recursive = TRUE` builds any missing parent directories and
	# `showWarnings = FALSE` keeps a re-run quiet when the directory already exists.
	dir.create(dirname(path), recursive = TRUE, showWarnings = FALSE)
	# Write the full flights table to disk as CSV.
	data.table::fwrite(nycflights13::flights, file = path)


	#####################################
	### Demo 1: Subsetting strategies ###
	#####################################

	#' 1. Read then subset
	# Parse every column of the CSV, then keep the seven of interest.
	flights <- read_csv(path)
	small_1 <- select(flights, carrier, origin, dest, year, month, day, dep_time)

	#' 2. Subset on read
	# data.table: `select` names the columns to keep while reading.
	small_2d <- data.table::fread(
	    path,
	    select = c('carrier', 'origin', 'dest', 'year', 'month', 'day', 'dep_time')
	)
	# readr: `cols_only()` keeps only the listed columns; '?' is the compact
	# abbreviation for `col_guess()`.
	small_2t <- read_csv(
	    path,
	    col_types = cols_only(
	        carrier = '?',
	        origin = '?',
	        dest = '?',
	        year = '?',
	        month = '?',
	        day = '?',
	        dep_time = '?'
	    )
	)

	#' 3. Batch, subset, and aggregate

	## Unknown file size
	# Read the CSV in fixed-size batches with base R, keep only the needed columns
	# of each batch, and stop once a batch comes back shorter than `batch_size`.
	# setup
	batch_size <- 10000L
	n <- batch_size    # rows returned by the latest read; primes the loop
	offset <- 0L       # number of leading lines to skip on the next read
	col_names <- names(read.csv(path, nrows = 1))
	df_list <- list()

	# iterate: a full batch means more rows may follow. Compare against
	# `batch_size` (not a repeated literal) so the two cannot drift apart.
	while (n == batch_size) {
	    # read
	    message('Reading rows from ', offset, ' up to ', offset + batch_size)
	    # `read.csv()` defaults to `header = TRUE`, so the first unskipped line is
	    # consumed as a header and then renamed by `col.names`. On the first pass
	    # that is the real header; on later passes it is the final row of the
	    # previous batch, so no row is lost or duplicated.
	    df <- read.csv(path, skip = offset, nrows = batch_size, col.names = col_names, stringsAsFactors = FALSE)

	    # subset and append
	    df_list[[length(df_list) + 1L]] <- df[c('carrier', 'origin', 'dest', 'year', 'month', 'day', 'dep_time')]

	    # increment
	    n <- nrow(df)
	    offset <- offset + batch_size
	}

	# aggregate: stack the per-batch subsets into one data frame
	small_3b <- do.call(rbind, df_list)


	## Precalculating file size
	# Count the file's lines up front, derive every batch offset from that count,
	# then read and subset each batch with readr and row-bind the results.
	batch_size <- 10000L
	col_names <- names(read_csv(path, n_max = 0))
	# `wc -l` counts lines including the header. The path is shell-quoted so that
	# spaces or shell metacharacters in it cannot break the command.
	n_lines <- scan(
	    text = system(paste('wc -l', shQuote(path)), intern = TRUE),
	    what = integer(1),
	    nmax = 1
	)
	# Each offset is the number of lines to skip; starting at 1 skips the header.
	offsets <- seq(1L, n_lines, by = batch_size)

	small_3t <- map_dfr(offsets, function(offset){
	    # Report the inclusive data-row range of this batch; `n_lines - 1L` is the
	    # number of data rows because `n_lines` includes the header line.
	    message('Reading rows ', offset, ' to ', min(offset + batch_size - 1L, n_lines - 1L))

	    # `col_names` is supplied explicitly, so no line is treated as a header.
	    suppressMessages(
	        read_csv(path, skip = offset, col_names = col_names, n_max = batch_size)
	    ) %>%
	        select(carrier, origin, dest, year, month, day, dep_time)
	})


	############################
	### Demo 2: Storing data ###
	############################

	# Root directory of the partitioned Parquet copy of the flights data.
	flights_dir <- '/tmp/flights'
	data('flights', package = 'nycflights13')

	# Write one Parquet file per carrier/year/month group, laid out as
	# <flights_dir>/<carrier>/<year>/<MM>.parquet.
	flights %>%
	    group_by(carrier, year, month) %>%
	    nest() %>%
	    mutate(
	        # `%02d` zero-pads the integer month on every platform; the `0` flag
	        # with `%s` pads with zeros on some platforms and spaces on others.
	        filepath = file.path(flights_dir, carrier, year, sprintf('%02d.parquet', month))
	    ) %>%
	    pwalk(function(...) {
	        # One call per nested row; `dots` holds that row's columns by name.
	        dots <- list(...)
	        if (!dir.exists(dirname(dots$filepath))) {
	            dir.create(dirname(dots$filepath), recursive = TRUE)
	        }

	        arrow::write_parquet(
	            dots$data, dots$filepath,
	            version = '1.0', chunk_size = 10000L,
	            compression = 'snappy'
	        )
	    })

	# Inspect what was written.
	list.dirs(flights_dir)

	list.files(file.path(flights_dir, "AA", "2013"), full.names = TRUE)

	# Reuse `flights_dir` (shell-quoted) so this stays in sync with the path above.
	system(paste('tree', shQuote(flights_dir)))


	##############################
	### Demo 3: Arrow Datasets ###
	##############################

	#' Single-file CSV Arrow Dataset

	# Show the directory contents, then open the directory as a CSV dataset.
	list.files('/tmp/nycflights13/')
	flights_ds_c <- arrow::open_dataset('/tmp/nycflights13/', format = 'csv')

	# Print the dataset object and a few of its fields.
	flights_ds_c
	flights_ds_c$files
	flights_ds_c$format
	flights_ds_c$num_cols
	flights_ds_c$num_rows

	# Column selection on its own, without `collect()`.
	select(flights_ds_c, carrier, origin, dest, year, month, day, dep_time)

	# The same selection followed by `collect()`.
	collect(
	    select(flights_ds_c, carrier, origin, dest, year, month, day, dep_time)
	)

	# Select three columns, keep rows whose destination is LAX, then collect.
	collect(
	    filter(
	        select(flights_ds_c, carrier, origin, dest),
	        dest == 'LAX'
	    )
	)


	#' Multi-file, multi-directory Parquet Arrow Dataset

	# Open the directory tree as one dataset; the two directory levels are
	# exposed as the columns `carrier` and `year`.
	flights_ds_p <- arrow::open_dataset(
	    flights_dir,
	    partitioning = c('carrier', 'year')
	)

	# Print the dataset object and a few of its fields.
	flights_ds_p
	flights_ds_p$files
	flights_ds_p$format
	flights_ds_p$num_cols
	flights_ds_p$num_rows

	# Select by name and by tidyselect helper, then collect.
	collect(
	    select(flights_ds_p, carrier, origin, dest, matches('time$'))
	)

	# Filter on a partition column and on data columns, then collect.
	collect(
	    filter(flights_ds_p, carrier == 'UA', distance > 1000, !is.na(arr_delay))
	)


	#' Limitations
	# The three pipelines below are kept exactly as written: per their inline
	# comments they demonstrate operations that are not supported on an Arrow
	# Dataset, so they are presumably expected to fail when run.

	# 1. Grouping is accepted, but filtering on an aggregate such as `n()` is not.
	flights_ds_p %>%
	group_by(carrier) %>% # works
	filter(n() > 10000) # most expressions beyond inequality like `n()` are not supported

	# 2. Deriving a column from existing ones.
	flights_ds_p %>%
	# mutation (in `mutate`, `transmute`, or `group_by`) not supported
	mutate(arr_time = hms::hms(hours = arr_time %/% 100L, minutes = arr_time %% 100L))

	# 3. Grouped aggregation.
	flights_ds_p %>%
	group_by(carrier) %>%
	summarise(mean_arr_delay = mean(arr_delay)) # `summarise()` not supported


	#' The Apache Drill part of Demo 4 is in the accompanying shell script


	########################
	### Demo 4: sergeant ###
	########################

	library(sergeant)

	#' DBI interface
	# Connect through the Drill DBI driver and run a raw SQL query.
	dbi_con <- dbConnect(Drill())
	DBI::dbGetQuery(dbi_con, 'SELECT * FROM dfs.tmp.`flights` LIMIT 3')

	#' REST interface
	# The same query sent through sergeant's REST connection.
	rest_con <- drill_connection()
	drill_query(rest_con, 'SELECT * FROM dfs.tmp.`flights` LIMIT 3')

	#' dbplyr interface
	# Build a table reference from the DBI connection ...
	tbl(dbi_con, 'dfs.tmp.`flights`')
	#' or
	# ... or from a Drill source object.
	drill <- src_drill()
	tbl(drill, 'dfs.tmp.`flights`')


	#' SQL translation
	?drill_custom_functions
	sql_translate_env(dbi_con)

	# Derive carrier, year and month from Drill's `dir0`, `dir1` and `filename`
	# columns.
	flights_drill <- mutate(
	    tbl(drill, 'dfs.tmp.`flights`'),
	    carrier = dir0,
	    year = dir1,
	    # as.integer is translated; LEFT is passed through to Drill
	    month = as.integer(LEFT(filename, 2L))
	)

	# Print the table reference and the SQL generated for it.
	flights_drill
	show_query(flights_drill)

	# Row counts: overall, per carrier, and per carrier collected into R.
	count(flights_drill)
	count(flights_drill, carrier = dir0, sort = TRUE)
	collect(count(flights_drill, carrier = dir0, sort = TRUE))

	# Arrival-delay summary per carrier and route for five destination airports,
	# ordered by mean delay and collected into R.
	flights_drill %>%
	    filter(dest %in% c('LAX', 'ONT', 'SNA', 'BUR', 'LGB')) %>%
	    group_by(carrier, origin, dest) %>%
	    summarise(
	        mean_arr_delay = mean(arr_delay),
	        sd_arr_delay = sd(arr_delay),
	        max_arr_delay = max(arr_delay),
	        n = n()
	    ) %>%
	    arrange(mean_arr_delay) %>%
	    collect()
	#!/bin/sh
	# NOTE(review): only the commands down to `drill-embedded` are shell commands.
	# The `--`-commented statements after it appear to be meant for the Drill
	# shell that `drill-embedded` starts -- confirm before running this file
	# with sh.

	############################
	### Demo 4: Apache Drill ###
	############################

	# Installation
	# Show Homebrew's information about the apache-drill formula.
	brew info apache-drill
	# The install command is left commented out so nothing is installed by accident.
	# brew install apache-drill


	# Start Drill
	drill-embedded

	-- List the commands the Drill shell understands
	!help

	-- List the available workspaces
	SHOW SCHEMAS;

	-- Sample the built-in dataset
	SELECT *
	FROM cp.`employee.json`
	LIMIT 3;

	-- Sample the real CSV we wrote
	SELECT *
	FROM dfs.tmp.`nycflights13/flights.csv`
	LIMIT 1;

	-- Subset columns and rows
	SELECT
	    carrier,
	    flight,
	    tailnum
	FROM dfs.tmp.`nycflights13/flights.csv`
	WHERE distance > 1000
	LIMIT 10;

	-- Grouped operations, aggregation, and sorting
	SELECT
	    carrier,
	    COUNT(*) AS n
	FROM dfs.tmp.`nycflights13/flights.csv`
	GROUP BY carrier
	ORDER BY n DESC;

	-- The same count restricted to short flights
	SELECT
	    carrier,
	    COUNT(*) AS n
	FROM dfs.tmp.`nycflights13/flights.csv`
	WHERE distance < 100
	GROUP BY carrier
	ORDER BY n DESC;

	-- Navigate the directory tree
	SHOW FILES
	FROM dfs.tmp.`flights`;

	SHOW FILES
	FROM dfs.tmp.`flights/AA/2013/`;

	-- Query a whole directory of files with globbing: one `*` for the
	-- directory level below AA and one for the file name
	SELECT *
	FROM dfs.tmp.`flights/AA/*/*.parquet`
	LIMIT 3;

	SELECT *
	FROM dfs.tmp.`flights/*`
	LIMIT 3;

	-- Implicit columns
	-- (a path that refers to a directory takes in everything inside it)
	SELECT DISTINCT
	    FQN,
	    FILEPATH,
	    FILENAME,
	    SUFFIX
	FROM dfs.tmp.`flights`
	LIMIT 10;

	-- Calculate useful summary stats and aggregations
	SELECT
	    -- carrier comes from the first subdirectory level
	    dir0 AS carrier,
	    -- any ANSI SQL function can be used
	    ROUND(AVG(dep_delay), 2) AS mean_dep_delay,
	    ROUND(AVG(arr_delay), 2) AS mean_arr_delay,
	    COUNT(*) AS n
	FROM dfs.tmp.`flights`
	GROUP BY carrier
	ORDER BY n DESC;

	SELECT
	    dir1 AS year,
	    -- month comes from the first two characters of the file name
	    LEFT(FILENAME, 2) AS month,
	    ROUND(AVG(arr_delay), 2) AS mean_arr_delay,
	    COUNT(*) AS n
	FROM dfs.tmp.`flights/*`
	WHERE dir0 = 'UA'
	GROUP BY year, month
	ORDER BY year, month;

	-- Quit the Drill shell (!exit also works)
	!quit