-
-
Save wcmatthysen/c1b6a88c99a2ab5eb1dbbd25bee7561a to your computer and use it in GitHub Desktop.
Calculate checksum corresponding to the entity-tag hash (ETag) of Amazon S3 objects
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Calculate checksum corresponding to the entity-tag hash (ETag) of Amazon S3 objects
#
# Usage: compute_etag.sh <filename> <part_size_mb>
#
#   filename:     file to process
#   part_size_mb: chunk size in MiB used for multipart uploads.
#                 This is 8M by default for the AWS CLI. See:
#                 https://docs.aws.amazon.com/cli/latest/topic/s3-config.html#multipart_chunksize
#
# The ETag for an S3 object can be obtained from the command-line using:
#   aws s3api head-object --bucket <bucket-name> --key <key-name> --query ETag --output text
# Note that the ETag may or may not correspond to the MD5 digest, see here for details:
#   https://docs.aws.amazon.com/AmazonS3/latest/API/API_Object.html
#
# Adapted from: https://gist.github.com/emersonf/7413337
# Changes
#   7/23/2022
#   - Parallelized hash calculation
#   - Removed need for temporary files
# Adapted from: https://gist.github.com/rajivnarayan/1a8e5f2b6783701e0b3717dbcfd324ba
# Changes
#   6/4/2024
#   - Added handling for 10,000+ parts
#
# Script requires: dd, md5sum, xxd

# Fail fast: abort on command errors, unset variables, and failed pipeline stages.
set -euo pipefail

# Number of parallel hashing jobs (one per available CPU core).
NUM_PARALLEL=$(nproc)
# Convert a size in MiB to the equivalent number of bytes.
mb_to_bytes(){
    echo $(( $1 * 1024 * 1024 ))
}

# Backward-compatible alias: the historical name is reversed (it converts
# MiB -> bytes, not bytes -> MiB). Kept so existing callers keep working.
byte_to_mb(){
    mb_to_bytes "$1"
}

# Minimum filesize in bytes to switch to multipart uploads.
# https://docs.aws.amazon.com/cli/latest/topic/s3-config.html#multipart-threshold
readonly MULTIPART_MINSIZE=$(byte_to_mb 8)
# Validate command-line arguments.
if [[ $# -ne 2 ]]; then
    # Usage errors are errors: print to stderr and exit non-zero
    # (the original exited 0, which hid failures from callers).
    echo "Usage: $0 file partSizeInMb" >&2
    exit 1
fi

file="$1"
partSizeInMb=$2

if [[ ! -f "$file" ]]; then
    # Diagnostics belong on stderr so stdout stays clean for the result line.
    echo "Error: $file not found." >&2
    exit 1
fi
# Compute the MD5 digest of one part (chunk) of a file.
#   $1 - file to read
#   $2 - part size in bytes
#   $3 - zero-based chunk index
# Prints "<chunk-index> <md5sum output>" so that results produced by
# parallel workers can be sorted back into chunk order downstream.
hash_chunk(){
    local src="$1"
    local part_bytes="$2"
    local idx="$3"
    local digest
    # dd extracts exactly one part; md5sum reads it from the pipe.
    digest=$(dd bs="$part_bytes" count=1 skip="$idx" if="$src" 2> /dev/null | md5sum)
    printf '%s %s\n' "$idx" "$digest"
}
# Print the larger of the two supplied integers.
max() {
    if (( $1 >= $2 )); then
        echo "$1"
    else
        echo "$2"
    fi
}

# Print ceil($1 / $2) using integer arithmetic only.
div_round_up(){
    local numerator=$1
    local denominator=$2
    echo $(( (numerator + denominator - 1) / denominator ))
}
# Requested part size in bytes (pass the value, not a bare variable name
# that relied on bash arithmetic dereferencing).
partSizeInB=$(byte_to_mb "$partSizeInMb")
# File size in bytes. wc -c is POSIX; the original's `du -b` is GNU-only.
fileSizeInB=$(wc -c < "$file")
# S3 caps a multipart upload at 10,000 parts, so enforce a minimum part size.
minPartSizeInB=$(div_round_up "$fileSizeInB" 10000)
finalPartSizeInB=$(max "$minPartSizeInB" "$partSizeInB")
parts=$(div_round_up "$fileSizeInB" "$finalPartSizeInB")

if [[ $fileSizeInB -gt $MULTIPART_MINSIZE ]]; then
    # Multipart ETag: md5 of the concatenated per-part binary digests,
    # suffixed with "-<parts>". Parts are hashed in parallel, then sorted
    # back into order; xxd -r -p turns the hex digests back into bytes.
    export -f hash_chunk
    etag=$(seq 0 $((parts-1)) | \
        xargs -P "${NUM_PARALLEL}" -I{} bash -c 'hash_chunk "$@"' -- "$file" "$finalPartSizeInB" {} | \
        sort -n -k1,1 | tr -s ' ' | cut -f2,3 -d' ' | xxd -r -p | md5sum | cut -f1 -d' ')"-$parts"
else
    # Below the multipart threshold the ETag is simply the file's MD5.
    etag=$(md5sum "$file" | cut -f1 -d' ')
fi

# Result on stdout: filename, ETag, effective part size (tab-separated).
# printf avoids echo -e interpreting backslash escapes inside filenames.
printf '%s\t%s\t%s\n' "$file" "$etag" "$finalPartSizeInB"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment