Loads a MySQL data dump into a Redshift table. Useful for AWS RDS instances. Depends on an external script to stream MySQL data to a file, and on the Postgres psql command-line client.
#!/usr/bin/env bash

PYTHON_PATH=$(which python)
PSQL_PATH=$(which psql)
MYSQL_SCRIPT='mysql2file.py'
MYSQL_SERVER=
MYSQL_PORT=3306
MYSQL_DATABASE=
MYSQL_USER=
MYSQL_PASSWORD=
TODAY=$(date '+%Y%m%d')
DUMP_FILENAME="errors_${TODAY}.csv"
DUMP_FILEPREFIX="errors_${TODAY}-"
DUMP_SPLITFACTOR=6 # 3 XL nodes, 2 slices each
DUMP_S3BUCKET=archivedlogs
REDSHIFT_ENDPOINT=fqdn
REDSHIFT_USER=user
REDSHIFT_DATABASE=reports
REDSHIFT_PORT=1234
REDSHIFT_TABLE=errors
AWS_ACCESS_KEY_ID=asdf
AWS_SECRET_ACCESS=xcvb
MAINTENANCE_DAY=6 # Perform VACUUM and ANALYZE on Saturday
LOG_PATH="/home/${USER}/redshift_import-${TODAY}.log"
MYSQL_MINRANGE=$(date --date='2 days ago' '+%Y-%m-%d')
MYSQL_MAXRANGE=$(date --date='1 day ago' '+%Y-%m-%d')
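# Note: BETWEEN is inclusive on both ends, so if the time column is a
# DATETIME, rows stamped exactly at midnight of MYSQL_MAXRANGE will also be
# matched by the next day's run; use ">= min AND < max" if that matters.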
MYSQL_COMMAND="
SELECT
columns
FROM errors
WHERE time BETWEEN '${MYSQL_MINRANGE}' AND '${MYSQL_MAXRANGE}';
"
# Dump MySQL resultset to file (the query must be quoted so it is passed
# as a single argument)
echo "Starting MySQL dump to ${DUMP_FILENAME}"
${PYTHON_PATH} ${MYSQL_SCRIPT} ${MYSQL_SERVER} ${MYSQL_PORT} ${MYSQL_USER} ${MYSQL_PASSWORD} ${MYSQL_DATABASE} ${DUMP_FILENAME} "${MYSQL_COMMAND}"
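# mysql2file.py is not included in this gist. As a rough, hypothetical
# stand-in, the stock mysql client can stream the same resultset in batch
# mode; note that naive tab-to-comma translation ignores CSV quoting, so a
# real implementation should use a proper CSV writer:
#
# mysql --host="${MYSQL_SERVER}" --port="${MYSQL_PORT}" \
#       --user="${MYSQL_USER}" --password="${MYSQL_PASSWORD}" \
#       --batch --raw --skip-column-names \
#       --execute="${MYSQL_COMMAND}" "${MYSQL_DATABASE}" \
#   | tr '\t' ',' > "${DUMP_FILENAME}"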
# Split by end of line, not by file size
echo "Splitting dump into ${DUMP_SPLITFACTOR} files"
split -n l/${DUMP_SPLITFACTOR} ${DUMP_FILENAME} ${DUMP_FILEPREFIX}
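# Optional sanity check: the parts should add up to the original line count.
echo "Dump has $(wc -l < ${DUMP_FILENAME}) lines; parts have $(cat ${DUMP_FILEPREFIX}* | wc -l)"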
# Compress in preparation for Redshift COPY
echo "Compressing MySQL dump files (gzip)"
gzip ${DUMP_FILEPREFIX}*
# Upload import files to S3
echo "Uploading dump files to S3 bucket ${DUMP_S3BUCKET}"
s3cmd put --reduced-redundancy ${DUMP_FILEPREFIX}* s3://${DUMP_S3BUCKET}/
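# Optional: confirm the upload landed before kicking off the COPY.
s3cmd ls s3://${DUMP_S3BUCKET}/ | grep ${DUMP_FILEPREFIX}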
# Issue COPY command to Redshift cluster
REDSHIFT_COMMAND="
COPY ${REDSHIFT_TABLE} FROM 's3://${DUMP_S3BUCKET}/${DUMP_FILEPREFIX}'
CREDENTIALS 'aws_access_key_id=${AWS_ACCESS_KEY_ID};aws_secret_access_key=${AWS_SECRET_ACCESS}'
GZIP
CSV
DELIMITER ','
NULL AS '\N';
"
REDSHIFT_QUERY="${PSQL_PATH} -h ${REDSHIFT_ENDPOINT} -U ${REDSHIFT_USER} -d ${REDSHIFT_DATABASE} -p ${REDSHIFT_PORT}"
echo "Loading dump files from S3 into Redshift table ${REDSHIFT_TABLE}"
# psql's -f expects a filename; use -c to run the command string. The
# redirection creates the log file even on success, so test psql's exit
# status rather than the file's existence.
${REDSHIFT_QUERY} -c "${REDSHIFT_COMMAND}" > ${LOG_PATH} 2>&1
if [ $? -ne 0 ] ; then
  echo -e "Import failed when loading to Redshift\n"
  cat ${LOG_PATH}
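  # Pull the most recent entries from Redshift's stl_load_errors system
  # table to make the failure easier to diagnose.
  ${REDSHIFT_QUERY} -c "SELECT filename, line_number, colname, err_reason FROM stl_load_errors ORDER BY starttime DESC LIMIT 10;"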
  s3cmd put --reduced-redundancy ${LOG_PATH} s3://${DUMP_S3BUCKET}/
  exit 1
fi
# VACUUM and ANALYZE if we're in a maintenance window (append to the log so
# the import output is not clobbered)
if [ ${MAINTENANCE_DAY} -eq $(date '+%u') ] ; then
  REDSHIFT_COMMAND="VACUUM SORT ONLY ${REDSHIFT_TABLE};"
  echo "Vacuuming (resorting) data"
  ${REDSHIFT_QUERY} -c "${REDSHIFT_COMMAND}" >> ${LOG_PATH} 2>&1
  REDSHIFT_COMMAND="ANALYZE ${REDSHIFT_TABLE};"
  echo "Analyzing (regenerating statistics for) the table"
  ${REDSHIFT_QUERY} -c "${REDSHIFT_COMMAND}" >> ${LOG_PATH} 2>&1
fi
echo "Import complete!" |