migrate files from gridfs to aws s3
#! /bin/bash
###################### USAGE ######################################
usage() {
  echo "
  Usage: mongotos3 [-t n] mongo_host mongo_collection s3_bucket
    -t               : number of parallel processes to use
    mongo_host       : the host of the mongodb server
    mongo_collection : the collection to collect the gridfs data from
    s3_bucket        : the name of the bucket you want to cp the files to
  "
}
###################### END USAGE ##################################
# how many workers to split the file list across
thread_count=8
# parallel worker pid array
_worker_pids=()
# counter incremented per file so each worker can show progress
_current_file=1
# parse options; only -t, for setting how many workers you want
while getopts 't:' opt; do
  case $opt in
    t) thread_count=$OPTARG;;
    *)
      usage
      exit
      ;;
  esac
done
shift $((OPTIND-1))
# script params
if [ "$#" -ne 3 ]
then
  usage
  exit 1
fi
# mongo host
_host="${1:?Mongo Host Required}"
# mongo collection to pull grid_fs data from
_db="${2:?Mongo Collection Required}"
# s3 bucket for everything to be synced to
_bucket="${3:?AWS Bucket Required}"
# all the files
_files_list=$(mongofiles -h $_host -db $_db list)
# total number of files to be synced
_total_files=$(echo "$_files_list" | wc -l | awk '{print $1}')
# how many lines to send to each worker
((lines_per_file=(_total_files + thread_count - 1) / thread_count))
###################### LOGGING ####################################
RED=$(tput setaf 1)
GREEN=$(tput setaf 2)
NORMAL=$(tput sgr0)
log_ok() {
  let COL=$(tput cols)-${#1}+${#GREEN}+${#NORMAL}
  printf "%s%${COL}s" "$1" "$GREEN[OK]$NORMAL"
}
log_fail() {
  let COL=$(tput cols)-${#1}+${#RED}+${#NORMAL}
  printf "%s%${COL}s" "$1" "$RED[FAIL]$NORMAL"
}
###################### END LOGGING ################################
###################### METHOD DEFINITIONS #########################
# param $1: filepath from mongo
# param $2: worker identity number
syncfile () {
  status="(worker $2) $_current_file/$lines_per_file $_bucket/$1"
  ((_current_file++))
  #check if the file is already on the server
  file_count=$((0+$(aws s3 ls $_bucket/$1 | wc -l)))
  if [[ $file_count -gt 0 ]]; then
    log_ok "$status Already on server"
  else
    filename="_migration-$_current_file-$(uuidgen)"
    #get the file from gridfs and write it to a temp file
    mongofiles -h $_host -db $_db get --local $filename $1 > /dev/null 2>&1
    #get file succeeded
    if [ $? -eq 0 ]; then
      #send it to s3
      aws s3 cp $filename s3://$_bucket/$1 --quiet
      #report whether this file's migration succeeded
      if [ $? -eq 0 ]; then
        log_ok "$status"
      else
        log_fail "$status"
      fi
      #rm the temp file fetched from gridfs
      rm $filename
    else
      log_fail "$status Get from db failed"
    fi
  fi
}
# param $1: worker identity number
# param $2: starting line offset in the file list to process
process_lines () {
  while read -r line; do
    #get the filename from the listing line
    file=$(echo "$line" | awk -F'\t' '{ print $1 }')
    #skip the "connected to ..." banner line from mongofiles
    [[ $file == 'connected to'* ]] && continue
    #sync the file with the server
    syncfile $file $1
  done < <(echo "$_files_list" | head -n $(($2 + $lines_per_file)) | tail -n $lines_per_file)
}
# used for kill signals
# calls kill on each worker pid
kill_all_workers () {
  echo 'killing all workers'
  for ((i=0; i < ${#_worker_pids[@]}; ++i)); do
    kill -6 ${_worker_pids[i]} > /dev/null 2>&1
  done
  echo 'migration aborted'
  #clean up any temp files that were interrupted
  rm _migration-* > /dev/null 2>&1
}
###################### END METHOD DEFINITIONS #####################
#allow ctrl-c to work while the workers run
trap "kill_all_workers" SIGINT SIGHUP SIGTERM
for ((i=0; i < $thread_count; ++i)); do
  echo "starting worker $i"
  #process this chunk of files in the background
  process_lines $i $((lines_per_file * i)) &
  #record the pid for cleanup and waiting
  _worker_pids+=($!)
done
#wait for each worker to finish
for ((i=0; i < ${#_worker_pids[@]}; ++i)); do
  wait ${_worker_pids[i]} > /dev/null 2>&1
done
#if the last wait exited cleanly say we are complete
if [ $? -eq 0 ]; then
  echo DONE
fi
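Going by the usage text at the top of the script, an invocation looks like the following; the host, database, and bucket names here are placeholders, not values from the gist:

# migrate with 4 parallel workers (placeholder host/db/bucket names)
./mongotos3 -t 4 mongo.example.com my_gridfs_db my-backup-bucket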
Thank you, this was very useful. I had to make some mods as well, primarily dealing with the fact that my filenames had spaces in them. When the syncfile routine passes a file name over as an argument, the name gets split up and the second part of the filename becomes argument 2, and so on. Basically, putting quotes around $1 in most places takes care of it.
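To illustrate the quoting being described, here is a minimal sketch of syncfile with the filename quoted everywhere it is expanded (a sketch based on this comment, not the author's revision; it reuses $_host, $_db, $_bucket and log_ok/log_fail from the script above):

syncfile () {
  #quotes keep a name like "my file.png" together as a single word
  if [[ $(aws s3 ls "$_bucket/$1" | wc -l) -gt 0 ]]; then
    log_ok "already on server: $1"
  else
    filename="_migration-$(uuidgen)"
    #fetch from gridfs to a temp file, then copy it up to s3 under the quoted key
    if mongofiles -h $_host -db $_db get --local "$filename" "$1" > /dev/null 2>&1 &&
       aws s3 cp "$filename" "s3://$_bucket/$1" --quiet; then
      log_ok "$1"
    else
      log_fail "$1"
    fi
    rm -f "$filename"
  fi
}
#the call site also needs quotes so word splitting never happens
syncfile "$file" $1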
Thanks! Just successfully used this (with a few minor updates to the mongofiles options for latest version).
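In case it helps anyone else: with a current mongo-tools mongofiles the database flag is spelled --db (or -d), so the two calls in the script would look roughly like this (a sketch of that kind of update, since the comment above does not list the exact changes):

#listing the gridfs files with current-style flags
_files_list=$(mongofiles --host=$_host --db=$_db list)
#fetching a single file to a local temp file
mongofiles --host=$_host --db=$_db get --local=$filename $1 > /dev/null 2>&1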