migrate files from gridfs to aws s3
#! /bin/bash
###################### USAGE ######################################
usage() {
  echo "
  Usage: mongotos3 [-t n] mongo_host mongo_collection s3_bucket
    -t               : number of parallel processes to use
    mongo_host       : the host of the mongodb server
    mongo_collection : the collection to collect the gridfs data from
    s3_bucket        : the name of the bucket you want to cp the files to
  "
}
###################### END USAGE ##################################
# how many workers to split the file list across
thread_count=8
# parallel worker pid array
_worker_pids=()
# counter incremented per file so each worker can show progress
_current_file=1
# parse options; only -t, for setting how many workers you want
while getopts 't:' opt; do
  case $opt in
    t) thread_count=$OPTARG;;
    *)
      usage
      exit
      ;;
  esac
done
shift $((OPTIND-1))
# script params
if [ "$#" -ne 3 ]
then
  usage
  exit 1
fi
# mongo host
_host="${1:?Mongo Host Required}"
# mongo collection to pull grid_fs data from
_db="${2:?Mongo Collection Required}"
# s3 bucket for everything to be synced to
_bucket="${3:?AWS Bucket Required}"
# all the files
_files_list=$(mongofiles -h $_host -db $_db list)
# total number of files to be synced
_total_files=$(echo "$_files_list" | wc -l | awk '{print $1}')
# how many lines to send to each worker
((lines_per_file=(_total_files + thread_count - 1) / thread_count))
###################### LOGGING ####################################
RED=$(tput setaf 1)
GREEN=$(tput setaf 2)
NORMAL=$(tput sgr0)
log_ok() {
  let COL=$(tput cols)-${#1}+${#GREEN}+${#NORMAL}
  printf "%s%${COL}s" "$1" "$GREEN[OK]$NORMAL"
}
log_fail() {
  let COL=$(tput cols)-${#1}+${#RED}+${#NORMAL}
  printf "%s%${COL}s" "$1" "$RED[FAIL]$NORMAL"
}
###################### END LOGGING ################################
###################### METHOD DEFINITIONS #########################
# param $1: filepath from mongo
# param $2: worker identity number
syncfile () {
  status="(worker $2) $_current_file/$lines_per_file $_bucket/$1"
  ((_current_file++))
  #check if the file is already on the server
  file_count=$((0+$(aws s3 ls $_bucket/$1 | wc -l)))
  if [[ $file_count -gt 0 ]]; then
    log_ok "$status Already on server"
  else
    filename="_migration-$_current_file-$(uuidgen)"
    #get the file from gridfs and write it to a temp file
    mongofiles -h $_host -db $_db get --local $filename $1 > /dev/null 2>&1
    #get file succeeded
    if [ $? -eq 0 ]; then
      #send it to s3
      aws s3 cp $filename s3://$_bucket/$1 --quiet
      #report whether this file's migration succeeded
      if [ $? -eq 0 ]; then
        log_ok "$status"
      else
        log_fail "$status"
      fi
      #rm the temp file fetched from gridfs
      rm $filename
    else
      log_fail "$status Get from db failed"
    fi
  fi
}
# param $1: worker identity number
# param $2: starting line offset in the file list to process
process_lines () {
  while read -r line; do
    #get the filename from the listing line
    file=$(echo "$line" | awk -F'\t' '{ print $1 }')
    #skip the "connected to ..." banner line from mongofiles
    [[ $file == 'connected to'* ]] && continue
    #sync the file with the server
    syncfile $file $1
  done < <(echo "$_files_list" | head -n $(($2 + $lines_per_file)) | tail -n $lines_per_file)
}
# used for kill signals
# calls kill on each worker pid
kill_all_workers () {
  echo 'killing all workers'
  for ((i=0; i < ${#_worker_pids[@]}; ++i)); do
    kill -6 ${_worker_pids[i]} > /dev/null 2>&1
  done
  echo 'migration aborted'
  #clean up any temp files that were interrupted
  rm _migration-* > /dev/null 2>&1
}
###################### END METHOD DEFINITIONS #####################
#allow ctrl-c to work while the workers run
trap "kill_all_workers" SIGINT SIGHUP SIGTERM
for ((i=0; i < $thread_count; ++i)); do
  echo "starting worker $i"
  #process this chunk of files in the background
  process_lines $i $((lines_per_file * i)) &
  #record the pid for cleanup and waiting
  _worker_pids+=($!)
done
#wait for each worker to finish
for ((i=0; i < ${#_worker_pids[@]}; ++i)); do
  wait ${_worker_pids[i]} > /dev/null 2>&1
done
#if the last wait exited cleanly say we are complete
if [ $? -eq 0 ]; then
  echo DONE
fi
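Going by the usage text at the top of the script, an invocation looks like the following; the host, database, and bucket names here are placeholders, not values from the gist:

# migrate with 4 parallel workers (placeholder host/db/bucket names)
./mongotos3 -t 4 mongo.example.com my_gridfs_db my-backup-bucket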
Thank you, this was very useful. I had to make some mods as well, primarily dealing with the fact that my filenames had spaces in them. When the syncfile routine passes a file name over as an argument, the name gets split up and the second part of the filename becomes argument 2, and so on. Basically, putting quotes around $1 in most places takes care of it.
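To illustrate the quoting being described, here is a minimal sketch of syncfile with the filename quoted everywhere it is expanded (a sketch based on this comment, not the author's revision; it reuses $_host, $_db, $_bucket and log_ok/log_fail from the script above):

syncfile () {
  #quotes keep a name like "my file.png" together as a single word
  if [[ $(aws s3 ls "$_bucket/$1" | wc -l) -gt 0 ]]; then
    log_ok "already on server: $1"
  else
    filename="_migration-$(uuidgen)"
    #fetch from gridfs to a temp file, then copy it up to s3 under the quoted key
    if mongofiles -h $_host -db $_db get --local "$filename" "$1" > /dev/null 2>&1 &&
       aws s3 cp "$filename" "s3://$_bucket/$1" --quiet; then
      log_ok "$1"
    else
      log_fail "$1"
    fi
    rm -f "$filename"
  fi
}
#the call site also needs quotes so word splitting never happens
syncfile "$file" $1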
Thanks! Just successfully used this (with a few minor updates to the mongofiles options for latest version).
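In case it helps anyone else: with a current mongo-tools mongofiles the database flag is spelled --db (or -d), so the two calls in the script would look roughly like this (a sketch of that kind of update, since the comment above does not list the exact changes):

#listing the gridfs files with current-style flags
_files_list=$(mongofiles --host=$_host --db=$_db list)
#fetching a single file to a local temp file
mongofiles --host=$_host --db=$_db get --local=$filename $1 > /dev/null 2>&1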