
@moritzheiber
Last active January 1, 2016 04:59
Sync a very large (1,000,000+ objects) S3 bucket locally
#!/usr/bin/env ruby
#
# This script is intended for syncing very large S3 buckets (1,000,000+ objects)
# to local storage. I couldn't find any other tools which wouldn't crap out
# because of the rather high number of objects to take care of (I'm looking at you,
# s3cmd!).
#
# What it does:
# - Takes a batch of #batch_size objects (500 right now)
# - Spawns one thread per object in the batch and downloads each one individually
# - Uses Zlib::GzipWriter to compress the downloaded object on the fly and adds a .gz extension
# - All the while it displays a progress bar (the download status of the current batch)
#
# Note: It creates directories based on the names of the objects, using #name_offset and
# #name_offset_length to slice a directory name out of each object key. Makes sorting
# large, unsorted S3 buckets easier.
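# A quick illustration of the slicing used for the directory names, assuming
# name_offset = 8 and name_offset_length = 4 (the key below is made up):
#
#   "useast1-2013-12-23-object"[8, 4]  # => "2013"
#
# so that object would end up under <dump_dir>/2013/.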
#
# It does have a few shortcomings:
# - It does not account for connection drop-outs properly
# - It does not handle truncated output very well (should probably just retry on the
# given exception)
# - It does not verify previously downloaded files by content; it only checks whether a file
# with the same name as the object it's currently downloading already exists
# Note: If you want to implement this you'll probably need some kind of hashing plus storage
# for those hashes and an algorithm to check for said hashes. Shouldn't be hard
# to implement. But this was a rather quick and dirty hack, not a shiny, polished
# "proper" solution.
#
# Have fun with it. If you keep the attribution I'd appreciate it. However, I consider it public domain.
#
# TL;DR: It syncs large S3 buckets and it has been written in an hour of spare time. Don't judge me.
require 'aws-sdk'
require 'fileutils'
require 'zlib' # needed for Zlib::GzipWriter below
require 'ruby-progressbar'

dump_dir = '<your-local-directory>'
name_offset = 8         # offset into the object key used for the directory name
name_offset_length = 4  # length of the directory name slice
client = AWS::S3.new(
  :region => '<aws-region>',
  :access_key_id => '<access-key-id>',
  :secret_access_key => '<secret-access-key>',
  :max_retries => 20,
  :use_ssl => true
)
bucket = client.buckets['<the-name-of-your-bucket>'] # You need the proper permissions for this bucket
batch_size = 500 # You shouldn't set this too high as you're going to hit the "open file" limit pretty soon
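# One way to sanity-check batch_size against the per-process open-file limit
# mentioned above (this uses Ruby's Process.getrlimit; the soft limit is the
# one that matters here):
#
#   soft_limit, _hard_limit = Process.getrlimit(:NOFILE)
#
# batch_size should stay comfortably below soft_limit.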
counter = 0
# each_slice comes from Enumerable; ActiveSupport's in_groups_of would nil-pad
# the last batch (and crash on file.key), so plain each_slice is safer here.
bucket.objects.each_slice(batch_size) do |batch|
  progress = ProgressBar.create(
    :title => "#{counter} - #{counter + batch_size}",
    :total => batch_size
  )
  threads = []
  batch.each do |file|
    name = file.key
    threads << Thread.new(name) do |t|
      FileUtils.mkdir_p("#{dump_dir}/#{name[name_offset,name_offset_length]}")
      Zlib::GzipWriter.open("#{dump_dir}/#{name[name_offset,name_offset_length]}/#{name}.gz", 9) do |gz|
        file.read do |chunk|
          gz.write(chunk)
        end
      end # the block form of GzipWriter.open closes the stream for us
    end unless File.exist?("#{dump_dir}/#{name[name_offset,name_offset_length]}/#{name}.gz")
    progress.increment
  end
  threads.each { |thread| thread.join }
  counter += batch_size
end
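
# A sketch of the "just retry on the given exception" idea from the shortcomings
# listed above. The rescue clause here is an assumption: swap StandardError for
# whatever your aws-sdk version actually raises on truncated reads or dropped
# connections.
def with_retries(attempts = 3)
  tries = 0
  begin
    yield
  rescue StandardError => e
    tries += 1
    retry if tries < attempts
    raise e # give up after the final attempt
  end
end
#
# The per-object download (the Zlib::GzipWriter.open call) could then be wrapped
# in with_retries { ... } to re-download a failed object a few times before
# giving up on it.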