Skip to content

Instantly share code, notes, and snippets.

@missinglink
Last active June 3, 2017 18:28
Show Gist options
  • Select an option

  • Save missinglink/62ae04e52210c686348af749e7a08a1d to your computer and use it in GitHub Desktop.

Select an option

Save missinglink/62ae04e52210c686348af749e7a08a1d to your computer and use it in GitHub Desktop.
parallel import data in to sqlite from stdin using all available cores
PRAGMA foreign_keys=OFF;
PRAGMA page_size=4096;
PRAGMA cache_size=-2000;
PRAGMA synchronous=OFF;
PRAGMA journal_mode=OFF;
PRAGMA temp_store=MEMORY;
CREATE TABLE data(
lower TEXT NOT NULL,
upper TEXT NOT NULL
);
.mode csv data
.import /dev/stdin data
lower upper
a A
b B
c C
d D
e E
f F
g G
h H
i I
j J
k K
l L
#!/bin/bash
echo "$(date) ------- $(basename $0) -------";
export DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd );
export LC_ALL=en_US.UTF-8;
DB=${DB:-"${DIR}/merged.db"};
export IN_FILE=${IN_FILE:-"${DIR}/input.csv"};
export INIT_FILE=${INIT_FILE:-"${DIR}/init.sql"};
export TMPDIR=${TMPDIR:-'/tmp'};
OFFSET=${OFFSET:-2}; # skip header
RECORDS_PER_SHARD=${RECORDS_PER_SHARD:-1000};
TOTAL_LINES=$( wc -l < ${IN_FILE} );
TOTAL_SHARDS=$((( ( ${TOTAL_LINES} - ${OFFSET} ) / ${RECORDS_PER_SHARD} ) ));
function import {
awk "NR==${2}, NR==${3}" "${IN_FILE}" | sqlite3 --init "${INIT_FILE}" "${TMPDIR}/shard.${1}.db";
}
export -f import;
# import data in to shards
for SHARD in $(seq 0 ${TOTAL_SHARDS}); do
START=$((( ${SHARD} * ${RECORDS_PER_SHARD} ) + ${OFFSET} ));
END=$(( ${START} + ${RECORDS_PER_SHARD} -1 ));
parallel --no-notice --jobs +0 import ::: ${SHARD} ::: ${START} ::: ${END};
done
# init db
rm -f "${DB}"; # reset
echo '' | sqlite3 --init "${INIT_FILE}" "${DB}" 2>/dev/null;
# merge shards
for SHARD in $(seq 0 ${TOTAL_SHARDS}); do
SHARD_DB="${TMPDIR}/shard.${SHARD}.db";
sqlite3 "${DB}" <<SQL
PRAGMA foreign_keys=OFF;
PRAGMA page_size=4096;
PRAGMA cache_size=-2000;
PRAGMA synchronous=OFF;
PRAGMA journal_mode=OFF;
PRAGMA temp_store=MEMORY;
PRAGMA threads=8;
ATTACH '${SHARD_DB}' AS shard;
PRAGMA shard.query_only=ON;
PRAGMA main.query_only=OFF;
INSERT INTO main.data SELECT * FROM shard.data;
SQL
rm -f "${SHARD_DB}"; # remove shard
done
# sqlite3 "${DB}" 'SELECT * FROM data LIMIT 100';
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment