@uhho
Last active December 2, 2022 18:57
Streaming pandas DataFrame to/from S3 with on-the-fly processing and GZIP compression
import gzip
import io
import pandas as pd

def s3_to_pandas(client, bucket, key, header=None):
    # fetch the object with the boto3 client
    obj = client.get_object(Bucket=bucket, Key=key)
    # wrap the streaming body so it is decompressed on the fly
    gz = gzip.GzipFile(fileobj=obj['Body'])
    # load the decompressed stream directly into a DataFrame
    return pd.read_csv(gz, header=header, dtype=str)
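
For reference, a minimal call might look like the following, assuming default AWS credentials; the bucket and key names here are placeholders:

import boto3

s3 = boto3.client('s3')
df = s3_to_pandas(s3, 'my-bucket', 'data/example.csv.gz')
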
def s3_to_pandas_with_processing(client, bucket, key, header=None):
    # fetch the object with the boto3 client
    obj = client.get_object(Bucket=bucket, Key=key)
    gz = gzip.GzipFile(fileobj=obj['Body'])
    # replace some characters in the incoming stream and load it into a DataFrame
    lines = "\n".join(line.replace('?', ' ') for line in gz.read().decode('utf-8').split('\n'))
    return pd.read_csv(io.StringIO(lines), header=header, dtype=str)
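
Usage mirrors s3_to_pandas, though note that gz.read() pulls the whole decompressed object into memory before the per-line replacement runs, so this variant trades streaming for processing. Reusing the s3 client from the sketch above (bucket and key are again placeholders):

df = s3_to_pandas_with_processing(s3, 'my-bucket', 'data/messy.csv.gz')
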
def pandas_to_s3(df, client, bucket, key):
    # write the DataFrame to an in-memory text buffer
    csv_buffer = io.StringIO()
    df.to_csv(csv_buffer, index=False)
    # gzip-compress the CSV text into an in-memory binary buffer
    gz_buffer = io.BytesIO()
    with gzip.GzipFile(mode='w', fileobj=gz_buffer) as gz_file:
        gz_file.write(csv_buffer.getvalue().encode('utf-8'))
    # upload the compressed bytes to S3
    return client.put_object(Bucket=bucket, Key=key, Body=gz_buffer.getvalue())
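
A minimal round trip under the same assumptions (placeholder bucket and key; header=0 on the read side because to_csv writes a header row):

df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
pandas_to_s3(df, s3, 'my-bucket', 'data/roundtrip.csv.gz')
df_back = s3_to_pandas(s3, 'my-bucket', 'data/roundtrip.csv.gz', header=0)
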
@gudata commented Oct 26, 2021:

This gist is awesome! Thank you!