Last active
November 10, 2019 16:02
-
-
Save trietsch/087d145a60e43f9acb2e565c7b787eb1 to your computer and use it in GitHub Desktop.
Parallelize s3 copy / move actions by using screens and forked processes (usage: ./s3-parallel <mv|cp> <from> <to> [number_of_screens])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Parallelize `aws s3 mv` / `aws s3 cp` over several detached screen sessions.
#
# Positional arguments:
#   $1  action, either "mv" or "cp" (required)
#   $2  full s3 path to move or copy from (required)
#   $3  full s3 path to move or copy to (required)
#   $4  number of screens to spread the files over (optional, default 10)

# ${n:?message} aborts the script with the message when the argument is
# missing or empty.
action=${1:?Please specify whether to mv or cp.}
from=${2:?Specify the full s3 path to move or copy from.}
to=${3:?Specify the full s3 path to move or copy to.}
number_of_screens=${4:-10}

# The number of files per screen is later computed by dividing by this value,
# so reject anything that is not a positive decimal integer up front.
case "$number_of_screens" in
  *[!0-9]*)
    echo "Number of screens must be a positive integer, got '$number_of_screens'." >&2
    exit 1
    ;;
esac
# Force base 10 so that a leading zero (e.g. "08") is not read as octal.
number_of_screens=$((10#$number_of_screens))
if ((number_of_screens < 1)); then
  echo "Number of screens must be a positive integer, got '$number_of_screens'." >&2
  exit 1
fi

case "$action" in
  mv)
    echo "Going to MOVE data from '$from' to '$to'."
    ;;
  cp)
    echo "Going to COPY data from '$from' to '$to'."
    ;;
  *)
    # Diagnostics belong on stderr so they are not mistaken for normal output.
    echo "Unknown action $action. Specify whether to mv or cp." >&2
    exit 1
    ;;
esac
#######################################
# Ensure an s3 path ends with a slash.
# Arguments: $1 - the path to normalise
# Outputs:   writes the path to stdout, with a "/" appended when it did not
#            already end in one
#######################################
function fix_path() {
  local path_to_fix="$1"
  case "$path_to_fix" in
    */) ;;
    *) path_to_fix="$path_to_fix/" ;;
  esac
  # Quoted printf keeps embedded whitespace and glob characters intact; an
  # unquoted echo would word-split the path and expand patterns such as "*/".
  printf '%s\n' "$path_to_fix"
}
# Choose the date binary that is later invoked with "+%s%N" to timestamp
# screen names. On macOS the script relies on GNU date, which coreutils
# installs as "gdate" (brew install coreutils).
date_command="date"
if [[ "$OSTYPE" == darwin* ]]; then
  if command -v gdate >/dev/null 2>&1; then
    date_command="gdate"
  else
    # Keep going with the system date rather than failing on every batch;
    # the timestamp is only used to build the screen name.
    echo "Warning: gdate not found (brew install coreutils); falling back to date." >&2
  fi
fi
# Normalise both prefixes so that "<prefix><file name>" is a valid object path.
from=$(fix_path "$from")
to=$(fix_path "$to")

# List the source prefix and stop early when the listing itself fails.
if ! listing=$(aws s3 ls "$from"); then
  echo "Failed to list '$from'." >&2
  exit 1
fi

# Object lines look like "<date> <time> <size> <name>"; sub-prefix lines start
# with "PRE". Keep only object lines and strip the first three columns rather
# than printing the fourth field, so names containing spaces stay whole.
files_to_copy=()
while IFS= read -r file; do
  if [[ -n "$file" ]]; then
    files_to_copy+=("$file")
  fi
done < <(
  printf '%s\n' "$listing" \
    | awk '$1 != "PRE" && NF >= 4 { sub(/^[^ ]+ +[^ ]+ +[^ ]+ /, ""); print }'
)

# Count the entries that will actually be processed, so the "last file" check
# below always fires and the final partial batch is never dropped.
number_of_files=${#files_to_copy[@]}
if ((number_of_files == 0)); then
  echo "No files found under '$from'; nothing to $action."
  exit 0
fi

# Ceiling division: spread the files over at most $number_of_screens batches.
files_per_screen=$(((number_of_files + number_of_screens - 1) / number_of_screens))

index=0
files_in_batch=0
screen_batch_index=0
screen_batch_name_prefix="s3-parallel-$action-"
commands=""

for file in "${files_to_copy[@]}"; do
  old_file_path="${from}${file}"
  new_file_path="${to}${file}"

  # %q shell-quotes every argument, so the command string survives being
  # re-parsed by the inner "bash -c" even when a path contains spaces or quotes.
  printf -v quoted_command 'aws s3 %q %q %q; ' "$action" "$old_file_path" "$new_file_path"
  commands+="$quoted_command"
  files_in_batch=$((files_in_batch + 1))
  index=$((index + 1))

  # Launch a screen when the batch is full or the last file has been queued.
  if ((index % files_per_screen == 0 || index == number_of_files)); then
    current_epoch=$("$date_command" +%s%N)
    screen_name="${screen_batch_name_prefix}${screen_batch_index}-${current_epoch}"
    echo "Starting new screen '$screen_name' with $files_in_batch item(s) to $action."
    # The commands of one batch run one after another inside a detached screen.
    screen -S "$screen_name" -d -m bash -c "$commands"
    commands=""
    screen_batch_index=$((screen_batch_index + 1))
    files_in_batch=0
  fi
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment