-
-
Save teasherm/bb73f21ed2f3b46bc1c2ca48ec2c1cf5 to your computer and use it in GitHub Desktop.
import argparse
import os

import boto3
class S3MultipartUpload(object):
    """Upload one local file to S3 through the low-level multipart API.

    Typical use is ``create()`` -> ``upload(mpu_id)`` ->
    ``complete(mpu_id, parts)``. The file is read and sent one part at a
    time, so at most ``part_size`` bytes are held in memory.
    """

    # AWS throws EntityTooSmall error for parts smaller than 5 MiB
    # (the final part of an upload is exempt from this minimum).
    PART_MINIMUM = 5 * 1024 * 1024
    # S3 accepts at most this many parts in a single multipart upload.
    PART_COUNT_MAXIMUM = 10000

    def __init__(self,
                 bucket,
                 key,
                 local_path,
                 part_size=int(15e6),
                 profile_name=None,
                 region_name="eu-west-1",
                 verbose=False):
        """Record the upload parameters and build the S3 client.

        Args:
            bucket: Name of the destination bucket.
            key: Object key to create in the bucket.
            local_path: Path of the local file to upload.
            part_size: Number of bytes sent per part.
            profile_name: boto3 profile name passed to the session.
            region_name: Region passed to the session.
            verbose: When true, enable boto3's "botocore" stream logger.

        Raises:
            OSError: If ``local_path`` cannot be stat'ed.
            ValueError: If ``part_size`` is below ``PART_MINIMUM`` or would
                split the file into more than ``PART_COUNT_MAXIMUM`` parts.
        """
        self.bucket = bucket
        self.key = key
        self.path = local_path
        self.total_bytes = os.stat(local_path).st_size
        self.part_bytes = part_size
        # Validate before creating the client so bad input fails fast.
        self._validate_part_size(part_size, self.total_bytes)
        self.s3 = boto3.session.Session(
            profile_name=profile_name, region_name=region_name).client("s3")
        if verbose:
            boto3.set_stream_logger(name="botocore")

    @classmethod
    def _validate_part_size(cls, part_size, total_bytes):
        """Raise ValueError if ``part_size`` cannot be used for this file."""
        if part_size < cls.PART_MINIMUM:
            raise ValueError(
                "part_size must be at least {0} bytes, got {1}".format(
                    cls.PART_MINIMUM, part_size))
        # Ceiling division; a short final part is fine, so the remainder
        # is deliberately not checked against PART_MINIMUM.
        part_count = -(-total_bytes // part_size)
        if part_count > cls.PART_COUNT_MAXIMUM:
            raise ValueError(
                "part_size {0} splits {1} bytes into {2} parts; "
                "the maximum is {3}".format(
                    part_size, total_bytes, part_count,
                    cls.PART_COUNT_MAXIMUM))

    def abort_all(self):
        """Abort the in-progress multipart uploads listed for the bucket.

        Every listed upload is aborted under its own key, not only uploads
        for ``self.key``.

        Returns:
            A list with one ``abort_multipart_upload`` response per upload.
        """
        # NOTE(review): only a single list_multipart_uploads response is
        # processed; truncated (paginated) listings are not followed.
        response = self.s3.list_multipart_uploads(Bucket=self.bucket)
        uploads = response.get("Uploads", [])
        print("Aborting", len(uploads), "uploads")
        aborted = []
        for upload in uploads:
            aborted.append(
                self.s3.abort_multipart_upload(
                    Bucket=self.bucket,
                    Key=upload["Key"],
                    UploadId=upload["UploadId"]))
        return aborted

    def create(self):
        """Start a multipart upload for ``self.key`` and return its id."""
        mpu = self.s3.create_multipart_upload(Bucket=self.bucket, Key=self.key)
        return mpu["UploadId"]

    def upload(self, mpu_id):
        """Send the file part by part and print progress after each part.

        Args:
            mpu_id: Upload id returned by ``create()``.

        Returns:
            A list of ``{"PartNumber": int, "ETag": str}`` dicts in upload
            order, suitable for passing to ``complete()``.
        """
        parts = []
        uploaded_bytes = 0
        with open(self.path, "rb") as f:
            # iter(callable, sentinel) stops at the first empty read (EOF);
            # part numbers start at 1.
            chunks = iter(lambda: f.read(self.part_bytes), b"")
            for part_number, data in enumerate(chunks, start=1):
                part = self.s3.upload_part(
                    Body=data,
                    Bucket=self.bucket,
                    Key=self.key,
                    UploadId=mpu_id,
                    PartNumber=part_number)
                parts.append({"PartNumber": part_number, "ETag": part["ETag"]})
                uploaded_bytes += len(data)
                print("{0} of {1} uploaded ({2:.3f}%)".format(
                    uploaded_bytes, self.total_bytes,
                    as_percent(uploaded_bytes, self.total_bytes)))
        return parts

    def complete(self, mpu_id, parts):
        """Finish the multipart upload from its uploaded parts.

        Args:
            mpu_id: Upload id returned by ``create()``.
            parts: Part list returned by ``upload()``.

        Returns:
            The ``complete_multipart_upload`` response.
        """
        return self.s3.complete_multipart_upload(
            Bucket=self.bucket,
            Key=self.key,
            UploadId=mpu_id,
            MultipartUpload={"Parts": parts})
# Helper
def as_percent(num, denom):
    """Return ``num`` as a percentage of ``denom``, as a float.

    Both arguments are converted with ``float()`` first, so integer inputs
    do not truncate. Raises ZeroDivisionError when ``denom`` is zero.
    """
    return float(num) / float(denom) * 100.0
def parse_args(argv=None):
    """Parse the command-line options for the upload script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the
            default) argparse reads the process's command line.

    Returns:
        An ``argparse.Namespace`` with ``bucket``, ``key``, ``path``,
        ``region`` (default "eu-west-1") and ``profile`` (default None).
    """
    parser = argparse.ArgumentParser(description='Multipart upload')
    parser.add_argument('--bucket', required=True)
    parser.add_argument('--key', required=True)
    parser.add_argument('--path', required=True)
    parser.add_argument('--region', default="eu-west-1")
    parser.add_argument('--profile', default=None)
    return parser.parse_args(argv)
def main():
    """Upload the file named on the command line as a multipart upload.

    If uploading or completing fails, the newly created multipart upload
    is aborted before the error is re-raised, so no orphaned parts are
    left behind in the bucket.
    """
    args = parse_args()
    mpu = S3MultipartUpload(
        args.bucket,
        args.key,
        args.path,
        profile_name=args.profile,
        region_name=args.region)
    # abort all multipart uploads for this bucket (optional, for starting over)
    mpu.abort_all()
    # create new multipart upload
    mpu_id = mpu.create()
    try:
        # upload parts
        parts = mpu.upload(mpu_id)
        # complete multipart upload
        result = mpu.complete(mpu_id, parts)
    except BaseException:
        # Also covers Ctrl-C: clean up the unfinished upload, then
        # propagate the original failure.
        mpu.s3.abort_multipart_upload(
            Bucket=mpu.bucket, Key=mpu.key, UploadId=mpu_id)
        raise
    print(result)
# Run the upload only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Thanks a lot, this has been most useful!
I have created a modified version able to resume the upload after a failure, useful if the network fails or your session credentials expire.
This is super helpful and very clean, thanks.
However, while searching for this, I also found a dead simple way of doing it where you can force multipart by setting a size threshold, in the AWS docs, just 2 lines of code:
https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3.html
How would this be modified to generate a presigned URL? I'm able to generate one, but it has a signature verification error, so I'm thinking that I'm missing something that sets the algorithm / version. Here are details if anyone can help! I'm trying to use the s3 boto3 client for a minio server for multipart upload with a presigned url because the minio-py doesn't support that.
Update - I think I figured out how to add the key - the config parameter below is newly added
from botocore.client import Config
...
s3_external = session.client(
"s3",
use_ssl=MINIO_SSL,
region_name=MINIO_REGION,
endpoint_url=MINIO_HTTP_PREFIX + MINIO_EXTERNAL_SERVER,
verify=False,
config=Config(signature_version='s3v4'),
)
The signed url generated now has the (previously missing) algorithm, etc. headers, however the signature doesn't match, so I'm wondering if the key generated by the client (Singularity / Sylabs scs-library-client) is different than what I am specifying - that almost must be it...
Update: I think the issue is that the signature includes the host, which is different inside (minio:9000) as opposed to outside (127.0.0.1:9000) the container, according to this post: boto/boto3#1982 (comment)
Nice brother, great great job
@balukrishnans are you talking to me or @teasherm? To follow up with my question above for future lurkers, it was a non-trivial thing that wound up needing a PR to the Minio Python client. Details about my particular implementation are here. And if you are referencing @teasherm, I agree, great job and thank you for posting this!
Thank you for writing/posting this. I'm pretty sure this is the only way to nicely do a multipart and also have the ability to have amazon verify the md5-sum(if you add that bit to the upload that is). One point:
assert (self.total_bytes % part_size == 0 or self.total_bytes % part_size > self.PART_MINIMUM)
isn't quite right, though, as the last part can certainly be under the AWS minimum part size; you can verify that the CLI does this often by checking the ETag against the combined MD5 of each part.
This is a gem. It's crazy how there's barely any documentation on this stuff 💎
This is a gem. It's crazy how there's barely any documentation on this stuff 💎
Exactly! I dug tons of explanations and code samples until I found this one: Python S3 Multipart File Upload with Metadata and Progress Indicator
Easy. Thanks for a nice example. Code is cleaner than documentation.
Great work.
For anyone attempting to do this with the AWS CLI, but still using the lower-level aws s3api:
https://gist.github.com/hiAndrewQuinn/1935fdaf29ae2f40f90ef82341866a35
Thank you for writing/posting this. I'm pretty sure this is the only way to nicely do a multipart and also have the ability to have amazon verify the md5-sum(if you add that bit to the upload that is). One point:
assert (self.total_bytes % part_size == 0 or self.total_bytes % part_size > self.PART_MINIMUM)
isn't quite right, though, as the last part can certainly be under the AWS minimum part size; you can verify that the CLI does this often by checking the ETag against the combined MD5 of each part.
I am facing this issue. Can you help me with how to tackle it?
super, A+++, 👍
used this script to upload 50GB files from a kube pod to S3.