@mr-easy
Last active July 8, 2023 07:20
Comparison of read/write times and file sizes for a Pandas DataFrame stored in different formats (CSV, JSON, Parquet, Feather) on an S3 bucket, running on AWS Lambda. Times are in seconds, sizes in bytes.
Format    Write (s)             Read (s)              Size (bytes)
CSV       1.758180856704712     0.8764402866363525     5998538
JSON      0.9272201061248779    2.5027835369110107    12003799
Parquet   0.5380315780639648    0.42629337310791016    1171252
Feather   0.2664041519165039    0.22553682327270508    7569978
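In this run, Feather was the fastest format for both writing and reading, while Parquet produced by far the smallest object (~1.2 MB versus ~6 MB for CSV and ~12 MB for JSON); JSON was both the largest and the slowest to read.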
import time
import boto3
import pandas as pd
from io import BytesIO
import awswrangler as wr
from pyarrow.feather import write_feather

s3_client = boto3.client('s3')
BUCKET_NAME = "<bucket-name>"
FILE_NAME = "df"


def lambda_handler(event, context):
    s3_url = f"s3://{BUCKET_NAME}/{FILE_NAME}"

    # CSV: read first (the .csv object must already exist in the bucket), then write back.
    start_time = time.time()
    df = wr.s3.read_csv([s3_url + ".csv"])
    end_time = time.time()
    print(f"S3 CSV read: {end_time - start_time}")

    start_time = time.time()
    wr.s3.to_csv(df, s3_url + ".csv", index=False)
    end_time = time.time()
    print(f"S3 CSV write: {end_time - start_time}")

    resp = s3_client.head_object(Bucket=BUCKET_NAME, Key=FILE_NAME + '.csv')
    print(f"S3 CSV size: {resp['ContentLength']}")

    # JSON
    start_time = time.time()
    wr.s3.to_json(df, s3_url + '.json')
    end_time = time.time()
    print(f"S3 JSON write: {end_time - start_time}")

    start_time = time.time()
    df_loaded = wr.s3.read_json([s3_url + '.json'])
    end_time = time.time()
    print(f"S3 JSON read: {end_time - start_time}")

    resp = s3_client.head_object(Bucket=BUCKET_NAME, Key=FILE_NAME + '.json')
    print(f"S3 JSON size: {resp['ContentLength']}")

    # Parquet
    start_time = time.time()
    wr.s3.to_parquet(df, s3_url + '.parquet')
    end_time = time.time()
    print(f"S3 Parquet write: {end_time - start_time}")

    start_time = time.time()
    df_loaded = wr.s3.read_parquet([s3_url + '.parquet'])
    end_time = time.time()
    print(f"S3 Parquet read: {end_time - start_time}")

    resp = s3_client.head_object(Bucket=BUCKET_NAME, Key=FILE_NAME + '.parquet')
    print(f"S3 Parquet size: {resp['ContentLength']}")

    # Feather: awswrangler has no Feather helper, so serialize into an
    # in-memory buffer with pyarrow and transfer it with boto3 directly.
    start_time = time.time()
    with BytesIO() as f:
        write_feather(df, f)
        s3_client.put_object(
            Body=f.getvalue(),
            Bucket=BUCKET_NAME,
            Key=FILE_NAME + '.feather'
        )
    end_time = time.time()
    print(f"S3 Feather write: {end_time - start_time}")

    start_time = time.time()
    retr = s3_client.get_object(
        Bucket=BUCKET_NAME,
        Key=FILE_NAME + '.feather'
    )
    df_loaded = pd.read_feather(BytesIO(retr['Body'].read()))
    end_time = time.time()
    print(f"S3 Feather read: {end_time - start_time}")

    resp = s3_client.head_object(Bucket=BUCKET_NAME, Key=FILE_NAME + '.feather')
    print(f"S3 Feather size: {resp['ContentLength']}")

    return
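Note that the handler reads df.csv before it ever writes it, so the CSV object must be seeded into the bucket first. A minimal seeding sketch, run locally (the random 100,000 x 5 DataFrame below is an illustrative assumption, not the dataset behind the numbers above):

import numpy as np
import pandas as pd
import awswrangler as wr

BUCKET_NAME = "<bucket-name>"  # same placeholder as in the handler

# Hypothetical test data; substitute your own DataFrame.
df = pd.DataFrame(np.random.rand(100_000, 5), columns=list("abcde"))
wr.s3.to_csv(df, f"s3://{BUCKET_NAME}/df.csv", index=False)

Running the handler on Lambda also assumes pandas, pyarrow, and awswrangler are available in the runtime, e.g. via the AWS-managed AWSSDKPandas (formerly AWS Data Wrangler) Lambda layer.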