Created
February 14, 2021 20:07
-
-
Save mskyttner/718dcda15c3d62895e0e808ec2ec3aca to your computer and use it in GitHub Desktop.
Bash script to bulk load a tsv file into a duckdb database file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# usage: ./duckdb-load.sh data.tsv duckdb tablename | |
# for example: | |
# ./duckdb-load.sh ark/hcaf_species_native.tsv duckdb_database hcaf_species_native | |
# TODO set pragma journal_mode=off or equiv settings | |
# if using .import and .sep '\t', an error appears: | |
# Error: multi-character column separators not allowed for import | |
# TODO: would like to use /dev/stdin from compressed stream, but it fails | |
# bunzip2 -d -c $1 | pv | duckdb $2 "COPY $3 FROM '/dev/stdin' ( DELIMITER '\t', HEADER FALSE);" | |
# create table based on what read_csv_auto finds in first 1000 lines of data | |
#bzip2 -d -c $1 | head -n 1001 > head.tsv | |
head -n 1001 $1 > head.tsv | |
echo -e ".mode csv\n.sep '\t'\nCREATE TABLE $3 AS SELECT * FROM read_csv_auto('head.tsv', HEADER=TRUE);" | duckdb $2 | |
rm head.tsv | |
# "reload" now that we have created the table schema, now use "insert into ... select * from" | |
echo -e ".mode csv\n.sep '\t'\nDELETE FROM $3; INSERT INTO $3 SELECT * FROM read_csv_auto('$1', HEADER=TRUE);" | duckdb $2 | |
# get rid of WAL-file | |
duckdb $2 "select count(*) from $3;\n.quit" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment