A comparison of read time, write time, and on-disk size for a Pandas DataFrame stored on an S3 bucket in four formats (CSV, JSON, Parquet, Feather), running on AWS Lambda.
S3 CSV read: 0.8764402866363525
S3 CSV write: 1.758180856704712
S3 CSV size: 5998538
S3 JSON write: 0.9272201061248779
S3 JSON read: 2.5027835369110107
S3 JSON size: 12003799
S3 Parquet write: 0.5380315780639648
S3 Parquet read: 0.42629337310791016
S3 Parquet size: 1171252
S3 Feather write: 0.2664041519165039
S3 Feather read: 0.22553682327270508
S3 Feather size: 7569978
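Takeaway from this run (times in seconds, sizes in bytes): Feather was the fastest for both read and write, Parquet produced by far the smallest file (~1.2 MB vs ~12 MB for JSON), JSON was the slowest to read and the largest on disk, and CSV was the slowest to write.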
# Benchmark: read/write times and object size for a Pandas DataFrame
# stored on S3 in four formats, measured from inside a Lambda handler.
import time
from io import BytesIO

import boto3
import pandas as pd
import awswrangler as wr
from pyarrow.feather import write_feather

s3_client = boto3.client('s3')
BUCKET_NAME = "<bucket-name>"
FILE_NAME = "df"

def lambda_handler(event, context):
    s3_url = f"s3://{BUCKET_NAME}/{FILE_NAME}"

    # CSV: read the existing object first to load the test DataFrame,
    # then write it back, timing each direction separately.
    start_time = time.time()
    df = wr.s3.read_csv([s3_url + ".csv"])
    end_time = time.time()
    print(f"S3 CSV read: {end_time - start_time}")

    start_time = time.time()
    wr.s3.to_csv(df, s3_url + ".csv", index=False)
    end_time = time.time()
    print(f"S3 CSV write: {end_time - start_time}")

    resp = s3_client.head_object(Bucket=BUCKET_NAME, Key=FILE_NAME + '.csv')
    print(f"S3 CSV size: {resp['ContentLength']}")

    # JSON
    start_time = time.time()
    wr.s3.to_json(df, s3_url + '.json')
    end_time = time.time()
    print(f"S3 JSON write: {end_time - start_time}")

    start_time = time.time()
    df_loaded = wr.s3.read_json([s3_url + '.json'])
    end_time = time.time()
    print(f"S3 JSON read: {end_time - start_time}")

    resp = s3_client.head_object(Bucket=BUCKET_NAME, Key=FILE_NAME + '.json')
    print(f"S3 JSON size: {resp['ContentLength']}")

    # Parquet
    start_time = time.time()
    wr.s3.to_parquet(df, s3_url + '.parquet')
    end_time = time.time()
    print(f"S3 Parquet write: {end_time - start_time}")

    start_time = time.time()
    df_loaded = wr.s3.read_parquet([s3_url + '.parquet'])
    end_time = time.time()
    print(f"S3 Parquet read: {end_time - start_time}")

    resp = s3_client.head_object(Bucket=BUCKET_NAME, Key=FILE_NAME + '.parquet')
    print(f"S3 Parquet size: {resp['ContentLength']}")

    # Feather: awswrangler has no Feather helper, so serialize with pyarrow
    # into an in-memory buffer and upload/download directly with boto3.
    start_time = time.time()
    with BytesIO() as f:
        write_feather(df, f)
        s3_client.put_object(
            Body=f.getvalue(),
            Bucket=BUCKET_NAME,
            Key=FILE_NAME + '.feather'
        )
    end_time = time.time()
    print(f"S3 Feather write: {end_time - start_time}")

    start_time = time.time()
    retr = s3_client.get_object(
        Bucket=BUCKET_NAME,
        Key=FILE_NAME + '.feather'
    )
    df_loaded = pd.read_feather(BytesIO(retr['Body'].read()))
    end_time = time.time()
    print(f"S3 Feather read: {end_time - start_time}")

    resp = s3_client.head_object(Bucket=BUCKET_NAME, Key=FILE_NAME + '.feather')
    print(f"S3 Feather size: {resp['ContentLength']}")

    return
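If you want to try the comparison without an AWS account, here is a minimal local sketch that times the same four formats against temporary files instead of S3. The synthetic DataFrame (one million rows of mixed dtypes) and the paths are assumptions, not what the Lambda above used; Parquet and Feather both require pyarrow to be installed.

import os
import time
import tempfile

import numpy as np
import pandas as pd

# Assumption: a synthetic 1M-row DataFrame with mixed dtypes,
# not the dataset used in the S3 benchmark above.
df = pd.DataFrame({
    "a": np.random.rand(1_000_000),
    "b": np.random.randint(0, 100, 1_000_000),
    "c": np.random.choice(["x", "y", "z"], 1_000_000),
})

with tempfile.TemporaryDirectory() as tmp:
    for fmt, writer, reader in [
        ("csv", lambda p: df.to_csv(p, index=False), pd.read_csv),
        ("json", lambda p: df.to_json(p), pd.read_json),
        ("parquet", lambda p: df.to_parquet(p), pd.read_parquet),  # needs pyarrow
        ("feather", lambda p: df.to_feather(p), pd.read_feather),  # needs pyarrow
    ]:
        path = os.path.join(tmp, f"df.{fmt}")
        start = time.time()
        writer(path)
        print(f"local {fmt} write: {time.time() - start}")
        start = time.time()
        reader(path)
        print(f"local {fmt} read: {time.time() - start}")
        print(f"local {fmt} size: {os.path.getsize(path)}")

Note that the local numbers leave out S3 transfer time, which the Lambda version includes. To run the handler itself, the function needs awswrangler and pyarrow available, typically via the AWS-managed "AWS SDK for pandas" Lambda layer.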