Last active: November 22, 2022 14:45
Read CSV files from a tar.gz archive in S3 into pandas DataFrames without untarring or downloading (using s3fs, tarfile, io, and pandas)
# -- read csv files from tar.gz in S3 with S3FS and tarfile (https://s3fs.readthedocs.io/en/latest/)
import io
import tarfile

import pandas as pd
import s3fs

bucket = 'mybucket'
key = 'mycompressed_csv_files.tar.gz'

fs = s3fs.S3FileSystem()
f = fs.open(f'{bucket}/{key}', 'rb')
# pass the S3 file object via fileobj=; tarfile.open's first positional
# argument is a filesystem path and raises TypeError for file objects
tar = tarfile.open(fileobj=f, mode='r:gz')
csv_files = [m.name for m in tar.getmembers() if m.name.endswith('.csv')]
csv_file = csv_files[0]  # here we read the first csv file only
csv_contents = tar.extractfile(csv_file).read()
df = pd.read_csv(io.BytesIO(csv_contents), encoding='utf8')
f.close()
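The snippet above reads only the first CSV member. A sketch of extending it to read every member into one concatenated DataFrame, exercised here against an in-memory tar.gz instead of S3 (the helper name `read_all_csvs` and the sample file names are mine, not part of s3fs or pandas):

```python
import io
import tarfile

import pandas as pd

def read_all_csvs(fileobj):
    """Read every .csv member of a tar.gz into one concatenated DataFrame."""
    frames = []
    with tarfile.open(fileobj=fileobj, mode='r:gz') as tar:
        for member in tar.getmembers():
            if member.name.endswith('.csv'):
                data = tar.extractfile(member).read()
                frames.append(pd.read_csv(io.BytesIO(data), encoding='utf8'))
    return pd.concat(frames, ignore_index=True)

# Build a small tar.gz in memory so the helper can be tried without S3;
# with S3 you would pass fs.open(f'{bucket}/{key}', 'rb') instead.
buf = io.BytesIO()
with tarfile.open(fileobj=buf, mode='w:gz') as tar:
    for name, body in [('a.csv', 'x,y\n1,2\n'), ('b.csv', 'x,y\n3,4\n')]:
        info = tarfile.TarInfo(name=name)
        payload = body.encode('utf8')
        info.size = len(payload)
        tar.addfile(info, io.BytesIO(payload))
buf.seek(0)

df = read_all_csvs(buf)
print(len(df))  # one row per member file: 2
```

The same `fileobj=` pattern keeps everything streaming: nothing is written to local disk at any point.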
Hey @rrpelgrim, it's been a while since I've used this, but it was working. The new awswrangler package by AWS might be a better option: https://github.com/awslabs/aws-data-wrangler
Thanks for the tip, taking a look now.
@iamaziz - any chance you could point me in the right direction within awswrangler? The wr.s3.read_csv doesn't read the .tgz compressed file... would really appreciate it 🙏
Thank you very much for sharing. Much appreciated!
Thanks for sharing this gist.
I'm getting a TypeError: expected str, bytes or os.PathLike object, not S3File. Does this work for you?
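That TypeError usually means the S3 file object was passed as tarfile.open's first positional argument, which must be a filesystem path; passing it via the fileobj= keyword avoids it. A minimal local reproduction, with an in-memory archive standing in for the S3File (no S3 required):

```python
import io
import tarfile

# Build a tiny tar.gz in memory to stand in for the file opened from S3.
buf = io.BytesIO()
with tarfile.open(fileobj=buf, mode='w:gz') as archive:
    info = tarfile.TarInfo(name='data.csv')
    payload = b'a,b\n1,2\n'
    info.size = len(payload)
    archive.addfile(info, io.BytesIO(payload))

# Passing the file object positionally fails: tarfile.open's first
# parameter is `name`, a path on disk.
buf.seek(0)
try:
    tarfile.open(buf, 'r:gz')
    error = None
except TypeError as exc:
    error = str(exc)
print(error)

# Passing it as fileobj= works.
buf.seek(0)
with tarfile.open(fileobj=buf, mode='r:gz') as tar:
    names = tar.getnames()
print(names)  # ['data.csv']
```

The same fix applies to the gist: tarfile.open(fileobj=f, mode='r:gz') instead of tarfile.open(f, 'r:gz').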