Last active
December 19, 2022 03:31
-
-
Save cmerrick/a01008aa883bc8ef5578 to your computer and use it in GitHub Desktop.
Better Know a Database - Redshift Load Data Formats
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh | |
# OS: Ubuntu 14.01 | |
# Generate data - produces about 200GB on disk, takes a while | |
DATADIR=tpch-dbgen | |
SCALE=1000 | |
git clone https://github.com/electrum/tpch-dbgen.git | |
cd $DATADIR && make && ./dbgen -f -v -C 16 -S 1 -s $SCALE && cd - | |
# Convert CSV to JSON | |
awk -f lineitem-tojson.awk $DATADIR/lineitem.tbl.1 > $DATADIR/lineitem.1.json | |
# Convert JSON to avro | |
wget http://apache.mirrors.pair.com/avro/avro-1.7.7/java/avro-tools-1.7.7.jar -O avro-tools.jar | |
java -jar avro-tools.jar fromjson --schema-file lineitem.avsc $DATADIR/lineitem.1.json > $DATADIR/lineitem.1.avro | |
echo "CSV, JSON and avro files created locally. Use \`aws s3 cp ...\` to upload them to s3" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# OS: Ubuntu 14.01 | |
# Requires: aws jq and psql in PATH | |
CLUSTER_NAME=$1 | |
CLUSTER_PASS=$2 | |
SUBNET_GROUP_NAME=$3 | |
VPC_SEC_GROUP_ID=$4 | |
FORMAT=$5 # "AVRO 'auto'", "JSON 'auto'", or "delimiter '|'" | |
S3_FILENAME=$6 | |
AWS_KEY="FAKE" | |
AWS_SECRET="FAKE" | |
RESPONSE=$(aws redshift create-cluster \ | |
--db-name benchmark \ | |
--cluster-type single-node \ | |
--node-type dc1.large \ | |
--master-username root \ | |
--master-user-password $CLUSTER_PASS \ | |
--cluster-subnet-group-name $SUBNET_GROUP_NAME \ | |
--vpc-security-group-ids $VPC_SEC_GROUP_ID \ | |
--publicly-accessible \ | |
--cluster-identifier $CLUSTER_NAME) | |
STATUS='creating' | |
while [ "$STATUS" == 'creating' ] | |
do | |
sleep 3 | |
STATUS=$(aws redshift describe-clusters --cluster-identifier $CLUSTER_NAME | jq -r '.Clusters | .[0] | .ClusterStatus') | |
done | |
DB_HOST=$(aws redshift describe-clusters --cluster-identifier $CLUSTER_NAME | jq -r '.Clusters | .[0] | .Endpoint | .Address') | |
export PGPASSWORD=$CLUSTER_PASS | |
psql postgresql://root@$DB_HOST:5439/benchmark -f lineitem.create.sql | |
echo "COPYing Data..." | |
time psql postgresql://root@$DB_HOST:5439/benchmark -c "copy lineitem from '$S3_FILENAME' credentials 'aws_access_key_id=$AWS_KEY;aws_secret_access_key=$AWS_SECRET' FORMAT AS $FORMAT;" | |
echo "COPY Finished" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
BEGIN {FS = "|"} | |
{print "{" \ | |
"\"l_orderkey\": " $1 ", " \ | |
"\"l_partkey\": " $2 ", " \ | |
"\"l_suppkey\": " $3 ", " \ | |
"\"l_linenumber\": " $4 ", " \ | |
"\"l_quantity\": " $5 ", " \ | |
"\"l_extendedprice\": " $6 ", " \ | |
"\"l_discount\": " $7 ", " \ | |
"\"l_tax\": " $8 ", " \ | |
"\"l_returnflag\": \"" $9 "\", " \ | |
"\"l_linestatus\": \"" $10 "\", " \ | |
"\"l_shipdate\": \"" $11 "\", " \ | |
"\"l_commitdate\": \"" $12 "\", " \ | |
"\"l_receiptdate\": \"" $13 "\", " \ | |
"\"l_shipinstruct\": \"" $14 "\", " \ | |
"\"l_shipmode\": \"" $15 "\", " \ | |
"\"l_comment\": \"" $16 "\"" \ | |
"}" | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"namespace": "rjm.db-benchmark", | |
"type": "record", | |
"name": "lineitem", | |
"fields": [ | |
{"name": "l_orderkey", "type": "int"}, | |
{"name": "l_partkey", "type": "int"}, | |
{"name": "l_suppkey", "type": "int"}, | |
{"name": "l_linenumber", "type": "int"}, | |
{"name": "l_quantity", "type": "float"}, | |
{"name": "l_extendedprice", "type": "float"}, | |
{"name": "l_discount", "type": "float"}, | |
{"name": "l_tax", "type": "float"}, | |
{"name": "l_returnflag", "type": "string"}, | |
{"name": "l_linestatus", "type": "string"}, | |
{"name": "l_shipdate", "type": "string"}, | |
{"name": "l_commitdate", "type": "string"}, | |
{"name": "l_receiptdate", "type": "string"}, | |
{"name": "l_shipinstruct", "type": "string"}, | |
{"name": "l_shipmode", "type": "string"}, | |
{"name": "l_comment", "type": "string"} | |
] | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE TABLE lineitem ( | |
l_orderkey INTEGER NOT NULL, L_PARTKEY INTEGER NOT NULL, | |
l_suppkey INTEGER NOT NULL, | |
l_linenumber INTEGER NOT NULL, | |
l_quantity DECIMAL(15,2) NOT NULL, | |
l_extendedprice DECIMAL(15,2) NOT NULL, | |
l_discount DECIMAL(15,2) NOT NULL, | |
l_tax DECIMAL(15,2) NOT NULL, | |
l_returnflag CHAR(1) NOT NULL, | |
l_linestatus CHAR(1) NOT NULL, | |
l_shipdate DATE NOT NULL, | |
l_commitdate DATE NOT NULL, | |
l_receiptdate DATE NOT NULL, | |
l_shipinstruct CHAR(25) NOT NULL, | |
l_shipmode CHAR(10) NOT NULL, | |
l_comment VARCHAR(44) NOT NULL | |
); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment