Last active
July 7, 2020 16:31
-
-
Save h5rdly/30e3dbb28f46ca5fdc404155438017c9 to your computer and use it in GitHub Desktop.
Download a file from S3 using "vanilla" standard library Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import hashlib, hmac, socket, ssl | |
from datetime import datetime | |
try: | |
from urlparse import urlsplit | |
except: | |
from urllib.parse import urlsplit | |
# Algorithm identifier used in the string-to-sign and the Authorization header.
ALGORITHM = 'AWS4-HMAC-SHA256'
# Original (misspelled) name, kept so existing references keep working.
ALGORTHM = ALGORITHM


def sign(key, msg):
    ''' Return the raw HMAC-SHA256 digest of msg (text, UTF-8 encoded) keyed with key (bytes) '''
    return hmac.new(key, msg.encode('utf-8'), hashlib.sha256).digest()


def sign_headers(headers, url, access_key, secret_key, region='us-east-1'):
    ''' Add AWS Signature Version 4 headers for an S3 GET request to headers.

    headers    -- dict of request headers to sign. It is updated IN PLACE with
                  'Host', 'X-Amz-Date', 'X-Amz-Content-Sha256' and
                  'Authorization', and the same dict is returned.
    url        -- full URL of the request. Its path and query string are used
                  verbatim, so they must already be URI-encoded (and the query
                  parameters sorted) the way they will be sent.
    access_key -- AWS access key id, placed in the credential string.
    secret_key -- AWS secret key, used only to derive the signing key.
    region     -- AWS region placed in the credential scope.

    Every header present in the dict is signed, so the request that is
    eventually sent must carry each of them with the same value. The payload
    is declared as 'UNSIGNED-PAYLOAD' rather than hashed.
    '''
    method = 'GET'

    # Get host and parsed datetime and date used by AWS
    parsed_url = urlsplit(url)
    host = parsed_url.netloc
    # The canonical URI of an empty path is '/', never the empty string.
    canonical_uri = parsed_url.path or '/'
    date = datetime.utcnow()
    aws_datetime = date.strftime("%Y%m%dT%H%M%SZ")
    aws_date = date.strftime("%Y%m%d")

    # Generate scope and scoped credential strings, and the signing key
    scope_parts = [aws_date, region, 's3', 'aws4_request']
    scope = '/'.join(scope_parts)
    credential = '/'.join([access_key, scope])
    signing_key = ('AWS4' + secret_key).encode('utf-8')
    for part in scope_parts:
        # Each step keys the next HMAC with the previous digest.
        signing_key = sign(signing_key, part)

    # Fill up all headers except 'Authorization'
    headers['Host'] = host
    headers['X-Amz-Date'] = aws_datetime
    headers['X-Amz-Content-Sha256'] = u'UNSIGNED-PAYLOAD'

    # Normalise header names/values for the AWS strings. Sorting is done on
    # the LOWERCASED names: sorting the original keys would put every
    # capitalised name before any lowercase one and break the signature.
    normalised = sorted((name.lower().strip(), str(value).strip())
                        for name, value in headers.items())
    sorted_headers_string = ';'.join(name for name, _ in normalised)
    canonical_header_list = [name + ':' + value for name, value in normalised]

    # Generate canonical request and string to be signed
    prefix = [method, canonical_uri, parsed_url.query]
    # The '' yields the blank line that terminates the canonical headers.
    suffix = ['', sorted_headers_string, u'UNSIGNED-PAYLOAD']
    canonical_req = '\n'.join(prefix + canonical_header_list + suffix)
    hashed_req = hashlib.sha256(canonical_req.encode('utf-8')).hexdigest()
    string_to_sign = '\n'.join([ALGORITHM, aws_datetime, scope, hashed_req])
    signature = hmac.new(signing_key, string_to_sign.encode('utf-8'), hashlib.sha256).hexdigest()

    # Finally generate the Authorization header carrying the signature
    headers['Authorization'] = (ALGORITHM + ' Credential=' + credential + ', '
                                + 'SignedHeaders=' + sorted_headers_string + ', '
                                + 'Signature=' + signature)
    return headers
def download_s3_chunk(bucket, key, start, end, access_key, secret_key, endpoint='https://s3.amazonaws.com', region='us-east-1'):
    ''' Download part of an S3 stored file using vanilla Python.

    bucket, key            -- bucket name and object key; they are joined into
                              the request path as '/<bucket>/<key>' without
                              URI-encoding, so pass an already encoded key if
                              it contains special characters.
    start, end             -- inclusive byte offsets sent in the Range header.
    access_key, secret_key -- AWS credentials used to sign the request.
    endpoint               -- base URL of the S3 endpoint; its host (and port,
                              default 443) is where the TLS connection goes.
    region                 -- AWS region used in the signature scope.

    Returns a (header, body) tuple of bytes: the raw HTTP status line and
    response headers, and everything after the blank line that follows them.
    The body is returned exactly as received (no decoding of any transfer or
    content encoding is attempted).
    '''
    # Sign the URL that is actually requested: the signature covers the path,
    # so signing the bare endpoint would never match '/<bucket>/<key>'.
    signed_url = endpoint.rstrip('/') + '/' + bucket + '/' + key
    parsed_url = urlsplit(signed_url)
    host = parsed_url.hostname
    port = parsed_url.port or 443

    headers = {'Range': 'bytes={}-{}'.format(start, end), 'User-Agent': 'ssup'}
    headers = sign_headers(headers, signed_url, access_key, secret_key, region)

    # Raw message to send via socket. Every signed header is emitted straight
    # from the dict so what is sent cannot drift from what was signed; the
    # remaining headers are unsigned extras.
    request_lines = ['GET {} HTTP/1.1'.format(parsed_url.path)]
    request_lines.extend('{}: {}'.format(name, value) for name, value in headers.items())
    request_lines.extend([
        # 'close' makes the server end the connection after the response,
        # which is what lets the read loop below stop at end-of-stream.
        'Connection: close',
        'Accept-Encoding: gzip, deflate',
        'Accept: */*',
        '',
        '',  # two empty entries -> the blank line that ends the header section
    ])
    s3_download_message = '\r\n'.join(request_lines).encode('utf-8')

    # ssl.wrap_socket() is gone from recent Python versions and verified
    # nothing; a default context checks the certificate and the host name.
    context = ssl.create_default_context()
    s = socket.create_connection((host, port))
    try:
        s = context.wrap_socket(s, server_hostname=host)
        s.sendall(s3_download_message)

        received = []
        while True:
            data = s.recv(65536)
            if not data:
                break
            received.append(data)
    finally:
        s.close()

    header, _, body = b''.join(received).partition(b'\r\n\r\n')
    return header, body
if __name__ == '__main__':
    import os
    from sys import argv as args

    # Credentials: read from the standard AWS environment variables when they
    # are set, otherwise fall back to the placeholder values.
    access_key = os.environ.get('AWS_ACCESS_KEY_ID', 'access')
    secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY', 'secret')

    # Bucket, key and chunk of key to download, taken positionally from the
    # command prompt:  [BUCKET [KEY [START [END]]]]
    # Anything not supplied keeps its placeholder default.
    defaults = ['my_bucket', 'my_key', '20', '100']
    supplied = args[1:len(defaults) + 1]
    bucket, key, start, end = supplied + defaults[len(supplied):]
    start, end = int(start), int(end)

    header, chunk = download_s3_chunk(bucket, key, start, end, access_key, secret_key)
Not sure I understand — are you talking about using `boto3`?
I wanted to understand how the communication with S3 works conceptually, and `boto` was a bit heavy.
Sorry, I understand now that you were referring to using `socket`. I wanted to go vanilla both to understand the exact mechanics and to make sure it runs anywhere with Python. Perhaps `urllib` can be used here to save some code while not requiring anything outside of Python.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Why don't you use `request` instead?