A quick and easy CSV-to-Parquet converter from one bucket to another. This can be attached to a Lambda function that is triggered whenever a new s3:PutObject event fires (see the handler sketch after the code).
from io import BytesIO
from os import environ

import boto3
import pandas as pd


def convert(bucket, key):
    # Read the source CSV straight from S3 into a DataFrame.
    s3_client = boto3.client('s3', region_name=environ['REGION'])
    s3_object = s3_client.get_object(Bucket=bucket, Key=key)
    df = pd.read_csv(s3_object['Body'])

    # Parquet requires string column names; astype() returns a new
    # Index, so the result must be assigned back.
    df.columns = df.columns.astype(str)

    # Write to a sibling "-parquet" bucket; the target key keeps only
    # the part of the source key before the first '-'.
    target_bucket = f"{bucket}-parquet"
    target_key = f"{key.split('-')[0]}.parquet"

    # Serialize to an in-memory buffer rather than a temp file.
    parquet_out_buffer = BytesIO()
    df.to_parquet(parquet_out_buffer, index=False, engine='fastparquet')

    s3_res = boto3.resource('s3')
    s3_res.Object(target_bucket, target_key).put(Body=parquet_out_buffer.getvalue())

    return {
        'bucket': target_bucket,
        'key': target_key
    }
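For reference, a minimal handler sketch wiring convert() to the s3:PutObject trigger mentioned in the description. The lambda_handler name is an assumption, and the event parsing follows the standard S3 notification format; everything here other than convert() itself is illustrative, not part of the original gist.

from urllib.parse import unquote_plus

def lambda_handler(event, context):
    # One S3 notification can carry several records; convert each one.
    results = []
    for record in event['Records']:
        bucket = record['s3']['bucket']['name']
        # Object keys arrive URL-encoded in S3 events (spaces become '+').
        key = unquote_plus(record['s3']['object']['key'])
        results.append(convert(bucket, key))
    return results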
Also using fastparquet instead of pyarrow, as pyarrow pushes the deployment package over Lambda's 50 MB limit even with layers enabled.
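To stay inside that limit, the dependency list has to be small. A sketch of what the requirements.txt for this function might look like, assuming only what the gist imports (nothing here is pinned or confirmed by the original):

boto3        # usually bundled with the Lambda Python runtime; can be omitted
pandas
fastparquet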