Last active
June 3, 2017 18:28
-
-
Save missinglink/62ae04e52210c686348af749e7a08a1d to your computer and use it in GitHub Desktop.
parallel import data in to sqlite from stdin using all available cores
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| PRAGMA foreign_keys=OFF; | |
| PRAGMA page_size=4096; | |
| PRAGMA cache_size=-2000; | |
| PRAGMA synchronous=OFF; | |
| PRAGMA journal_mode=OFF; | |
| PRAGMA temp_store=MEMORY; | |
| CREATE TABLE data( | |
| lower TEXT NOT NULL, | |
| upper TEXT NOT NULL | |
| ); | |
| .mode csv data | |
| .import /dev/stdin data |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| lower | upper | |
|---|---|---|
| a | A | |
| b | B | |
| c | C | |
| d | D | |
| e | E | |
| f | F | |
| g | G | |
| h | H | |
| i | I | |
| j | J | |
| k | K | |
| l | L |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/bash | |
| echo "$(date) ------- $(basename $0) -------"; | |
| export DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ); | |
| export LC_ALL=en_US.UTF-8; | |
| DB=${DB:-"${DIR}/merged.db"}; | |
| export IN_FILE=${IN_FILE:-"${DIR}/input.csv"}; | |
| export INIT_FILE=${INIT_FILE:-"${DIR}/init.sql"}; | |
| export TMPDIR=${TMPDIR:-'/tmp'}; | |
| OFFSET=${OFFSET:-2}; # skip header | |
| RECORDS_PER_SHARD=${RECORDS_PER_SHARD:-1000}; | |
| TOTAL_LINES=$( wc -l < ${IN_FILE} ); | |
| TOTAL_SHARDS=$((( ( ${TOTAL_LINES} - ${OFFSET} ) / ${RECORDS_PER_SHARD} ) )); | |
| function import { | |
| awk "NR==${2}, NR==${3}" "${IN_FILE}" | sqlite3 --init "${INIT_FILE}" "${TMPDIR}/shard.${1}.db"; | |
| } | |
| export -f import; | |
| # import data in to shards | |
| for SHARD in $(seq 0 ${TOTAL_SHARDS}); do | |
| START=$((( ${SHARD} * ${RECORDS_PER_SHARD} ) + ${OFFSET} )); | |
| END=$(( ${START} + ${RECORDS_PER_SHARD} -1 )); | |
| parallel --no-notice --jobs +0 import ::: ${SHARD} ::: ${START} ::: ${END}; | |
| done | |
| # init db | |
| rm -f "${DB}"; # reset | |
| echo '' | sqlite3 --init "${INIT_FILE}" "${DB}" 2>/dev/null; | |
| # merge shards | |
| for SHARD in $(seq 0 ${TOTAL_SHARDS}); do | |
| SHARD_DB="${TMPDIR}/shard.${SHARD}.db"; | |
| sqlite3 "${DB}" <<SQL | |
| PRAGMA foreign_keys=OFF; | |
| PRAGMA page_size=4096; | |
| PRAGMA cache_size=-2000; | |
| PRAGMA synchronous=OFF; | |
| PRAGMA journal_mode=OFF; | |
| PRAGMA temp_store=MEMORY; | |
| PRAGMA threads=8; | |
| ATTACH '${SHARD_DB}' AS shard; | |
| PRAGMA shard.query_only=ON; | |
| PRAGMA main.query_only=OFF; | |
| INSERT INTO main.data SELECT * FROM shard.data; | |
| SQL | |
| rm -f "${SHARD_DB}"; # remove shard | |
| done | |
| # sqlite3 "${DB}" 'SELECT * FROM data LIMIT 100'; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment