-
-
Save wcmatthysen/c1b6a88c99a2ab5eb1dbbd25bee7561a to your computer and use it in GitHub Desktop.
Calculate checksum corresponding to the entity-tag hash (ETag) of Amazon S3 objects
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Calculate checksum corresponding to the entity-tag hash (ETag) of Amazon S3 objects
#
# Usage: compute_etag.sh <filename> <part_size_mb>
#
#   filename:     file to process
#   part_size_mb: chunk size in MiB used for multipart uploads.
#                 This is 8M by default for the AWS CLI. See:
#                 https://docs.aws.amazon.com/cli/latest/topic/s3-config.html#multipart_chunksize
#
# The ETag for an S3 object can be obtained from the command-line using:
#   aws s3api head-object --bucket <bucket-name> --key <key-name> --query ETag --output text
# Note that the ETag may or may not correspond to the MD5 digest, see here for details:
#   https://docs.aws.amazon.com/AmazonS3/latest/API/API_Object.html
#
# Adapted from: https://gist.github.com/emersonf/7413337
# Changes
#   7/23/2022
#   - Parallelized hash calculation
#   - Removed need for temporary files
# Adapted from: https://gist.github.com/rajivnarayan/1a8e5f2b6783701e0b3717dbcfd324ba
# Changes
#   6/4/2024
#   - Added handling for 10,000+ parts
#
# Script requires: dd, md5sum, xxd

# Fail fast: abort on command errors, unset variables, and failed pipeline stages.
set -euo pipefail

# Number of parallel hashing jobs (one per available CPU core).
NUM_PARALLEL=$(nproc)
# Convert a size in MiB to the equivalent number of bytes.
mb_to_bytes(){
    echo $(( $1 * 1024 * 1024 ))
}

# Backward-compatible alias: the historical name is reversed (it converts
# MiB -> bytes, not bytes -> MiB). Kept so existing callers keep working.
byte_to_mb(){
    mb_to_bytes "$1"
}

# Minimum filesize in bytes to switch to multipart uploads.
# https://docs.aws.amazon.com/cli/latest/topic/s3-config.html#multipart-threshold
readonly MULTIPART_MINSIZE=$(byte_to_mb 8)
# Validate command-line arguments.
if [[ $# -ne 2 ]]; then
    # Usage errors are errors: print to stderr and exit non-zero
    # (the original exited 0, which hid failures from callers).
    echo "Usage: $0 file partSizeInMb" >&2
    exit 1
fi

file="$1"
partSizeInMb=$2

if [[ ! -f "$file" ]]; then
    # Diagnostics belong on stderr so stdout stays clean for the result line.
    echo "Error: $file not found." >&2
    exit 1
fi
# Compute the MD5 digest of one part (chunk) of a file.
#   $1 - file to read
#   $2 - part size in bytes
#   $3 - zero-based chunk index
# Prints "<chunk-index> <md5sum output>" so that results produced by
# parallel workers can be sorted back into chunk order downstream.
hash_chunk(){
    local src="$1"
    local part_bytes="$2"
    local idx="$3"
    local digest
    # dd extracts exactly one part; md5sum reads it from the pipe.
    digest=$(dd bs="$part_bytes" count=1 skip="$idx" if="$src" 2> /dev/null | md5sum)
    printf '%s %s\n' "$idx" "$digest"
}
# Print the larger of the two supplied integers.
max() {
    if (( $1 >= $2 )); then
        echo "$1"
    else
        echo "$2"
    fi
}

# Print ceil($1 / $2) using integer arithmetic only.
div_round_up(){
    local numerator=$1
    local denominator=$2
    echo $(( (numerator + denominator - 1) / denominator ))
}
# Requested part size in bytes (pass the value, not a bare variable name
# that relied on bash arithmetic dereferencing).
partSizeInB=$(byte_to_mb "$partSizeInMb")
# File size in bytes. wc -c is POSIX; the original's `du -b` is GNU-only.
fileSizeInB=$(wc -c < "$file")
# S3 caps a multipart upload at 10,000 parts, so enforce a minimum part size.
minPartSizeInB=$(div_round_up "$fileSizeInB" 10000)
finalPartSizeInB=$(max "$minPartSizeInB" "$partSizeInB")
parts=$(div_round_up "$fileSizeInB" "$finalPartSizeInB")

if [[ $fileSizeInB -gt $MULTIPART_MINSIZE ]]; then
    # Multipart ETag: md5 of the concatenated per-part binary digests,
    # suffixed with "-<parts>". Parts are hashed in parallel, then sorted
    # back into order; xxd -r -p turns the hex digests back into bytes.
    export -f hash_chunk
    etag=$(seq 0 $((parts-1)) | \
        xargs -P "${NUM_PARALLEL}" -I{} bash -c 'hash_chunk "$@"' -- "$file" "$finalPartSizeInB" {} | \
        sort -n -k1,1 | tr -s ' ' | cut -f2,3 -d' ' | xxd -r -p | md5sum | cut -f1 -d' ')"-$parts"
else
    # Below the multipart threshold the ETag is simply the file's MD5.
    etag=$(md5sum "$file" | cut -f1 -d' ')
fi

# Result on stdout: filename, ETag, effective part size (tab-separated).
# printf avoids echo -e interpreting backslash escapes inside filenames.
printf '%s\t%s\t%s\n' "$file" "$etag" "$finalPartSizeInB"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment