Skip to content

Instantly share code, notes, and snippets.

@johntbush
Last active November 7, 2020 07:25
Show Gist options
  • Save johntbush/5c3f10263d065acd71c6cf03ddc8e9a8 to your computer and use it in GitHub Desktop.
Save johntbush/5c3f10263d065acd71c6cf03ddc8e9a8 to your computer and use it in GitHub Desktop.
s3_inventory
import json
import gzip
import pandas as pd
import dateutil.parser
def load_df(data_file, names=None):
    """Read a header-less inventory CSV into a DataFrame.

    Args:
        data_file: Path or file-like object accepted by ``pd.read_csv``.
        names: Optional list of column names to assign, in file order.
            Defaults to the seven-column layout this script was written
            for: Bucket, Key, Size, LastModifiedDate, ETag, StorageClass,
            IsMultipartUploaded.

    Returns:
        A ``pandas.DataFrame`` with one row per CSV line and the given
        column names.
    """
    if names is None:
        names = [
            'Bucket',
            'Key',
            'Size',
            'LastModifiedDate',
            'ETag',
            'StorageClass',
            'IsMultipartUploaded',
        ]
    # Passing ``names`` makes pandas treat the first line as data rather
    # than as a header row.
    return pd.read_csv(data_file, names=names)
def old_files(df, year):
    """Count rows last modified before 1 January of ``year`` and sum their sizes.

    Args:
        df: DataFrame with a ``LastModifiedDate`` column that
            ``pd.to_datetime`` can parse and a numeric ``Size`` column.
            The frame is not modified.
        year: Cut-off year, either a string such as ``'2015'`` or an int.

    Returns:
        A ``(num, total_size)`` tuple: the number of rows whose
        ``LastModifiedDate`` is strictly earlier than ``<year>-01-01`` and
        the sum of their ``Size`` values.
    """
    # Parse into a local Series instead of overwriting the caller's column,
    # so repeated calls on the same frame do not depend on earlier calls.
    # utc=True yields timezone-aware values whether or not the input carries
    # an offset, which keeps the comparison below well-defined.
    modified = pd.to_datetime(df['LastModifiedDate'], utc=True)
    # The cut-off must be timezone-aware too; pandas refuses to compare
    # aware timestamps with a naive datetime.
    cutoff = pd.Timestamp('{}-01-01'.format(year), tz='UTC')
    old_df = df.loc[modified < cutoff]
    return len(old_df.index), old_df['Size'].sum()
# Summarise an inventory listing: read manifest.json, load every gzipped CSV
# it references from the local data/ directory, and print running totals
# (file count, byte count, files older than 2015/2016, oversized files).

# The context manager closes the manifest even if json.load raises.
with open('manifest.json', 'r') as f:
    j = json.load(f)

base_url = 'https://s3.amazonaws.com/your_bucket/'  # not referenced below
count = 0  # number of objects larger than ``size``
size = 10737418240  # "large file" threshold in bytes (10 * 1024**3)
total_bytes = 0
number_files = 0
old_files_2015 = 0
old_files_2015_size = 0
old_files_2016 = 0
old_files_2016_size = 0
out = {}

for x in j['files']:
    # Only the final path component of the manifest key is used; the data
    # file is expected to exist under ./data/ already.
    file_name = x['key'].split('/')[-1]
    full_file_name = 'data/' + file_name
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('ingesting: ' + str(full_file_name))

    # The with-block closes the gzip handle; no explicit close() is needed.
    with gzip.open(full_file_name, 'rb') as data_file:
        df = load_df(data_file)

    number_files += len(df.index)

    ret1 = old_files(df, '2015')
    old_files_2015 += ret1[0]
    old_files_2015_size += ret1[1]

    ret2 = old_files(df, '2016')
    old_files_2016 += ret2[0]
    old_files_2016_size += ret2[1]

    total_bytes += df['Size'].sum()
    count += len(df[df['Size'] > size].index)

    # Running totals after each file; the last line printed is the final
    # summary for the whole manifest.
    out = {
        'total_bytes': total_bytes,
        'total_file_count': number_files,
        'old_files_2015': old_files_2015,
        'old_file_size_2015': old_files_2015_size,
        'old_files_2016': old_files_2016,
        'old_file_size_2016': old_files_2016_size,
        'large_files': count,
    }
    print(str(out))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment