Skip to content

Instantly share code, notes, and snippets.

@jaklinger
Created November 7, 2018 14:01
Show Gist options
  • Save jaklinger/1985c2676be31403f9f5da5f55b29741 to your computer and use it in GitHub Desktop.
Save jaklinger/1985c2676be31403f9f5da5f55b29741 to your computer and use it in GitHub Desktop.
Read the entire content of an S3 bucket into a pandas DataFrame
import boto3
import pandas as pd
from io import BytesIO
# Load JSON objects stored under one "directory" (key prefix) of an S3 bucket
# and combine them into a single pandas DataFrame named `df`.
bucket = "innovation-mapping-general"
directory = "nih_all_processed_data/"
# Upper bound on how many objects are downloaded and parsed. The value 4
# reproduces the original hard-coded cut-off (`len(dfs) > 3`); set it to None
# to read every object under `directory`.
max_files = 4

s3 = boto3.resource('s3')
dfs = []
# Filtering by Prefix makes S3 list only the keys under `directory`, instead
# of listing the whole bucket and discarding non-matching keys client-side.
for key in s3.Bucket(bucket).objects.filter(Prefix=directory):
    # A key exactly equal to the prefix is skipped — presumably the empty
    # "folder" placeholder object, which holds no JSON (TODO confirm).
    if key.key == directory:
        continue
    obj = s3.Object(bucket, key.key)
    # Read the whole object body into memory and let pandas parse it as JSON.
    with BytesIO(obj.get()['Body'].read()) as bio:
        dfs.append(pd.read_json(bio))
    if max_files is not None and len(dfs) >= max_files:
        break

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no object matched the prefix.
df = pd.concat(dfs) if dfs else pd.DataFrame()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment