Last active
June 10, 2020 08:45
-
-
Save jaymecd/9f10bdcfa9ede1db927b09df18eb76a9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Split a single S3 Inventory manifest into sequential subsets.
#
# Usage:
#   $ env INVENTORY_BUCKET=my-inventory INVENTORY_PATH=sample-name ./s3.batch.operations.manifest.split.sh
#
# Environment:
#   INVENTORY_BUCKET  inventory S3 bucket name (required)
#   INVENTORY_PATH    inventory S3 path without the date component (required)
#   INVENTORY_DATE    inventory date (optional; autodetected when empty)
#   JOBS_COUNT        number of jobs to split into (optional; default: 4)
#   DEBUG             any non-empty value enables command tracing (set -x)
#
# Strict mode: abort on a failing command (-e), on use of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
#######################################
# Report an error and terminate the script.
# Arguments: message words; all arguments are joined into one string ("$*")
# Outputs:   writes "Error: <message>" to stderr
# Returns:   does not return; exits the shell with status 1
#######################################
fatal() {
  # printf with a fixed format prints the message verbatim, even if it
  # starts with a dash or contains backslashes.
  printf 'Error: %s\n' "$*" >&2
  exit 1
}
# inputs -- all read from the environment. ":=" assigns the default when the
# variable is unset or empty, so later expansions are safe under `set -u`.
: "${DEBUG:=}"            # non-empty enables command tracing
: "${INVENTORY_BUCKET:=}" # inventory S3 bucket name
: "${INVENTORY_PATH:=}"   # inventory S3 path w/o date
: "${INVENTORY_DATE:=}"   # inventory date (default: autodetect)
: "${JOBS_COUNT:=4}"      # number of jobs (default: 4)

[[ -z "${DEBUG}" ]] || set -x

[[ -n "${INVENTORY_BUCKET}" ]] || fatal "env INVENTORY_BUCKET is empty"
[[ -n "${INVENTORY_PATH}" ]] || fatal "env INVENTORY_PATH is empty"
# A zero, negative or non-numeric job count would break the chunk arithmetic
# below (division by zero / invalid loop bound), so reject it up front.
[[ "${JOBS_COUNT}" =~ ^[1-9][0-9]*$ ]] \
  || fatal "env JOBS_COUNT must be a positive integer"

if [[ -z "${INVENTORY_DATE}" ]]; then
  # detect latest inventory date: keep only the "PRE <date>Z/" prefix entries
  # of the listing, strip the trailing slash and take the greatest one.
  # The year is matched with four explicit [0-9] classes because some awk
  # implementations do not support the {4} interval syntax.
  INVENTORY_DATE=$(
    aws s3 ls "s3://${INVENTORY_BUCKET}/${INVENTORY_PATH}/" \
      | awk '$1 == "PRE" && $2 ~ /^[0-9][0-9][0-9][0-9]-.*Z\/$/ { sub(/\/$/, "", $2); print $2 }' \
      | LC_ALL=C sort \
      | tail -n1
  )
fi

[[ -n "${INVENTORY_DATE}" ]] || fatal "inventory date was not detected"

echo "Split inventory manifest.json into ${JOBS_COUNT} jobs from s3://${INVENTORY_BUCKET}/${INVENTORY_PATH}/${INVENTORY_DATE}/ location?"
echo "Are you sure? Ctrl+C to abort..."
# Block until Enter is pressed; the typed input itself is discarded.
read -r _

# pull origin data locally
aws s3 sync --delete "s3://${INVENTORY_BUCKET}/${INVENTORY_PATH}/${INVENTORY_DATE}" "${INVENTORY_DATE}"

SRC_MANIFEST="${INVENTORY_DATE}/manifest.json"
[[ -f "${SRC_MANIFEST}" ]] || fatal "${SRC_MANIFEST} was not found after sync"

# prepare for split: start from an empty destination directory
DST_DIR="${INVENTORY_DATE}-jobs-${JOBS_COUNT}"
rm -rf -- "${DST_DIR:?}"
mkdir -- "${DST_DIR}"

# Number of entries in the manifest's "files" array.
TOTAL_COUNT=$(jq '.files | length' "${SRC_MANIFEST}")
(( TOTAL_COUNT > 0 )) || fatal "${SRC_MANIFEST} lists no files"

# Ceiling division, so that JOBS_COUNT chunks always cover every entry.
CHUNK_SIZE=$(( (TOTAL_COUNT + JOBS_COUNT - 1) / JOBS_COUNT ))

# do the split
for (( SEQUENCE = 1; SEQUENCE <= JOBS_COUNT; SEQUENCE++ )); do
  DST_MANIFEST="${DST_DIR}/manifest-${SEQUENCE}"

  # Half-open slice [IDX_FROM, IDX_TILL) of the "files" array for this job.
  IDX_FROM=$(( (SEQUENCE - 1) * CHUNK_SIZE ))
  IDX_TILL=$(( IDX_FROM + CHUNK_SIZE ))

  if (( IDX_FROM >= TOTAL_COUNT )); then
    # Rounding the chunk size up can leave the trailing jobs without entries.
    echo "Warning: ${DST_MANIFEST}.json will contain no files (only ${TOTAL_COUNT} files for ${JOBS_COUNT} jobs)" >&2
  fi

  # Copy the manifest, replacing "files" with this job's slice. Indices are
  # passed as jq variables instead of being spliced into the program text.
  jq --argjson from "${IDX_FROM}" --argjson till "${IDX_TILL}" \
    '.files |= .[$from:$till]' "${SRC_MANIFEST}" > "${DST_MANIFEST}.json"

  # The .checksum file holds only the bare MD5 hex digest of the manifest.
  md5sum "${DST_MANIFEST}.json" | awk '{print $1}' > "${DST_MANIFEST}.checksum"

  # Re-verify the stored digest against the file ("<digest>  <file>" format).
  printf '%s  %s\n' "$(head -n1 "${DST_MANIFEST}.checksum")" "${DST_MANIFEST}.json" \
    | md5sum -c -
done

# push the split manifests into a custom directory next to the dated inventories
aws s3 sync --sse AES256 "${DST_DIR}" "s3://${INVENTORY_BUCKET}/${INVENTORY_PATH}/${DST_DIR}/"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment