#!/usr/bin/env ruby
#
# Testing multipart uploads into S3 with threads
# Tested with Ruby 1.8 and 1.9
# This is proof of concept code: it works, but it is not suitable for production, and may even have nasty bugs in the
# threading section
# Refs:
# http://docs.amazonwebservices.com/AmazonS3/latest/API/index.html?mpUploadInitiate.html
# http://docs.amazonwebservices.com/AWSEC2/latest/UserGuide/index.html?using-query-api.html <-- Query API auth
#
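# The overall flow (per the S3 multipart upload API referenced above) is:
#   1. Initiate the upload and receive an UploadId
#   2. Upload each part (numbered from 1) and record the ETag returned for it
#   3. Complete the upload by sending the ordered list of ETags
# Every part except the last must be at least 5 MB.
#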
require 'rubygems'
require 'fog'
require 'digest/md5'
require 'base64'
require 'fileutils'
# Credentials
key = 'AAAA'
secret = 'BBBB'
bucket = 'some-bucket'
# Setup connection
stor = Fog::Storage.new(
  :provider => 'AWS',
  :aws_access_key_id => key,
  :aws_secret_access_key => secret
)
# Don't want to get caught out with any time errors
stor.sync_clock
# Take a test file and split it up; remove the initial / to use the filename and path as the key
#object_to_upload = '/tmp/linux-2.6.38.2.tar.bz2'
object_to_upload = '/tmp/ubuntu-10.04.2-server-amd64.iso'
object_key = object_to_upload[1..-1]
# Area to place the split file into
workdir = "/tmp/work"
FileUtils.mkdir_p(workdir)
# Split the file into chunks inside workdir; the chunks are named 000, 001, etc
#`split -C 10M -a 3 -d #{object_to_upload} #{workdir}/`
`split -C 100M -a 3 -d #{object_to_upload} #{workdir}/`
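# (A pure-Ruby alternative to shelling out to split(1) could look like the
#  commented-out sketch below; the 100 MB chunk_size and the %03d naming are
#  assumptions chosen to mirror split's -a 3 -d suffixes. It is not used here.)
#
#   chunk_size = 100 * 1024 * 1024
#   File.open(object_to_upload, 'rb') do |src|
#     idx = 0
#     while chunk = src.read(chunk_size)
#       File.open("#{workdir}/#{format('%03d', idx)}", 'wb') { |out| out.write(chunk) }
#       idx += 1
#     end
#   end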
# Map of the file_part => md5
parts = {}
# Get the Base64 encoded MD5 of each file; S3's Content-MD5 header expects the
# base64 of the raw 128-bit digest, not the hexdigest
Dir.entries(workdir).each do |file|
  next if file =~ /\.\./
  next if file =~ /\.$/
  md5 = Base64.encode64(Digest::MD5.file("#{workdir}/#{file}").digest).chomp!
  full_path = "#{workdir}/#{file}"
  parts[full_path] = md5
end
### Now ready to perform the actual upload
# Initiate the upload and get the UploadId
multi_part_up = stor.initiate_multipart_upload(bucket, object_key, { 'x-amz-acl' => 'private' } )
upload_id = multi_part_up.body["UploadId"]
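# If anything goes wrong after this point the parts already uploaded keep costing
# storage until the upload is aborted; fog exposes abort_multipart_upload for that
# (shown commented out as a reminder, it is not wired into the error handling here):
#   stor.abort_multipart_upload(bucket, object_key, upload_id)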
# Lists for the threads and tags
tags = []
threads = []
# Order the parts by their numeric suffix (000, 001, ...) so the part numbers line up
sorted_parts = parts.sort_by do |d|
  File.basename(d[0]).to_i
end
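# sorted_parts is now an array of [path, md5] pairs in part order, e.g.
#   [["/tmp/work/000", "..."], ["/tmp/work/001", "..."], ...]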
sorted_parts.each_with_index do |entry, idx|
  # Part numbers need to start at 1
  part_number = idx + 1
  # Reload to stop the connection timing out, useful when uploading large chunks
  stor.reload
  # Create a new thread for each part we want to upload
  threads << Thread.new(entry) do |e|
    print "DEBUG: Starting on File: #{e[0]} with MD5: #{e[1]} - this is part #{part_number} \n"
    # Pass fog a file object to upload
    File.open(e[0]) do |file_part|
      # The part_number changes each time, as does the file_part; however, as they are set outside of the threads
      # being created I *think* they are safe. Really need to dig into the pickaxe threading section some more...
      part_upload = stor.upload_part(bucket, object_key, upload_id, part_number, file_part, { 'Content-MD5' => e[1] } )
      # You need to make sure the tags array has the ETags in the correct order, else the upload won't complete
      tags[idx] = part_upload.headers["ETag"]
      print "#{part_upload.inspect} \n" # This will return when the part has uploaded
    end
  end
end
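# Note: this starts one thread per part all at once. With many parts you would
# probably want to bound the concurrency (for example by walking sorted_parts with
# each_slice(n) and joining each batch before starting the next); that is left out
# to keep this proof of concept short.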
# Make sure all of our threads have finished before we continue
threads.each do |t|
  begin
    t.join
  rescue Exception => e
    puts "Failed: #{e.message}"
  end
end
# Might want a stor.reload here...
completed_upload = stor.complete_multipart_upload(bucket, object_key, upload_id, tags)
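# The response body should carry the final Location, Bucket, Key and ETag for the
# assembled object, so print it as a last sanity check (a head_object call, shown
# commented out, is another way to confirm the object landed):
print "#{completed_upload.body.inspect} \n"
#puts stor.head_object(bucket, object_key).headers["Content-Length"]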