Loads a MySQL data dump into a Redshift table. Useful for AWS RDS instances. Depends on an external script to stream MySQL data to a file (a rough sketch of that helper follows the script) and on the Postgres psql command-line client.
#!/bin/bash
PYTHON_PATH=$(which python)
PSQL_PATH=$(which psql)
MYSQL_SCRIPT='mysql2file.py'
MYSQL_SERVER=
MYSQL_PORT=3306
MYSQL_DATABASE=
MYSQL_USER=
MYSQL_PASSWORD=
TODAY=$(date '+%Y%m%d')
DUMP_FILENAME="errors_${TODAY}.csv"
DUMP_FILEPREFIX="errors_${TODAY}-"
DUMP_SPLITFACTOR=6 # 3 XL nodes
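# (Redshift COPY loads one file per slice in parallel; assuming the XL nodes
# here have 2 slices each, 3 nodes x 2 slices = 6 files keeps every slice busy)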
DUMP_S3BUCKET=archivedlogs
REDSHIFT_ENDPOINT=fqdn
REDSHIFT_USER=user
REDSHIFT_DATABASE=reports
REDSHIFT_PORT=1234
REDSHIFT_TABLE=errors
AWS_ACCESS_KEY_ID=asdf
AWS_SECRET_ACCESS=xcvb
MAINTENANCE_DAY=6 # Perform VACUUM and ANALYZE on Saturday
LOG_PATH="/home/${USER}/redshift_import-${TODAY}.log"
MYSQL_MINRANGE=$(date --date='2 days ago' '+%Y-%m-%d')
MYSQL_MAXRANGE=$(date --date='1 day ago' '+%Y-%m-%d')
MYSQL_COMMAND="
SELECT
columns
FROM errors
WHERE time BETWEEN '${MYSQL_MINRANGE}' AND '${MYSQL_MAXRANGE}';
"
# Dump MySQL resultset to file
echo "Starting MySQL dump to ${DUMP_FILENAME}"
${PYTHON_PATH} ${MYSQL_SCRIPT} ${MYSQL_SERVER} ${MYSQL_PORT} ${MYSQL_USER} "${MYSQL_PASSWORD}" ${MYSQL_DATABASE} ${DUMP_FILENAME} "${MYSQL_COMMAND}"
# Split by end of line, not by file size
echo "Splitting dump into ${DUMP_SPLITFACTOR} files"
split -n l/${DUMP_SPLITFACTOR} ${DUMP_FILENAME} ${DUMP_FILEPREFIX}
# Compress in preparation for Redshift COPY
echo "Compressing MySQL dump files (gzip)"
gzip ${DUMP_FILEPREFIX}*
# Upload import files to S3
echo "Uploading dump files to S3 bucket ${DUMP_S3BUCKET}"
s3cmd put --reduced-redundancy ${DUMP_FILEPREFIX}* s3://${DUMP_S3BUCKET}/
# Issue COPY command to Redshift cluster
REDSHIFT_COMMAND="
COPY ${REDSHIFT_TABLE} FROM 's3://${DUMP_S3BUCKET}/${DUMP_FILEPREFIX}'
CREDENTIALS 'aws_access_key_id=${AWS_ACCESS_KEY_ID};aws_secret_access_key=${AWS_SECRET_ACCESS}'
GZIP
CSV
DELIMITER ','
NULL AS '\N';
"
REDSHIFT_QUERY="${PSQL_PATH} -h ${REDSHIFT_ENDPOINT} -U ${REDSHIFT_USER} -d ${REDSHIFT_DATABASE} -p ${REDSHIFT_PORT}"
echo "Loading dump files from S3 into Redshift table ${REDSHIFT_TABLE}"
${REDSHIFT_QUERY} -c "${REDSHIFT_COMMAND}" > ${LOG_PATH} 2>&1
if [ $? -ne 0 ] ; then
echo -e "Import failed when loading to Redshift\n"
cat ${LOG_PATH}
s3cmd put --reduced-redundancy ${LOG_PATH} s3://${DUMP_S3BUCKET}/
exit 1
fi
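# If the load itself failed, row-level details land in Redshift's
# STL_LOAD_ERRORS system table; something like the following (a debugging
# aid added here, not part of the original gist) usually pinpoints the issue:
#   ${REDSHIFT_QUERY} -c 'SELECT starttime, filename, line_number, colname, err_reason FROM stl_load_errors ORDER BY starttime DESC LIMIT 5;'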
# VACUUM and ANALYZE if we're in a maintenance window
if [ ${MAINTENANCE_DAY} -eq $(date '+%u') ] ; then
REDSHIFT_COMMAND="VACUUM SORT ONLY ${REDSHIFT_TABLE};"
echo "Vacuuming (resorting) data"
${REDSHIFT_QUERY} -c "${REDSHIFT_COMMAND}" >> ${LOG_PATH} 2>&1
REDSHIFT_COMMAND="ANALYZE ${REDSHIFT_TABLE};"
echo "Analzying (regenerating statistics) of table"
${REDSHIFT_QUERY} -c "${REDSHIFT_COMMAND}" >> ${LOG_PATH} 2>&1
fi
echo "Import complete!"