Last active
May 18, 2020 13:50
-
-
Save roehst/f8aaa815c03ee7a628a10c20c69c2a23 to your computer and use it in GitHub Desktop.
Reducing memory usage in Pandas
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    For every column whose dtype is one of int16/int32/int64 or
    float16/float32/float64, the column's min and max are compared against
    the representable range of each candidate dtype (narrowest first) and
    the column is cast to the first dtype whose range contains both.

    Floats are selected by *range only*: a float64 column whose values fit
    the float16 range is cast to float16 even if that rounds the values.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the resulting memory usage and the
            percentage saved.

    Returns:
        The same ``df`` object, so the call can be used as
        ``df = reduce_mem_usage(df)``.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32, np.float64)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            candidates, info = int_candidates, np.iinfo
        else:
            candidates, info = float_candidates, np.finfo

        for candidate in candidates:
            limits = info(candidate)
            # Inclusive bounds: a value equal to the dtype's own min/max is
            # still representable. NaN or infinite extrema fail every
            # comparison, so such columns keep their current dtype.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero starting size to avoid ZeroDivisionError.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Candidate target dtypes, ordered narrowest to widest.
INT_TYPES_NP = [np.int8, np.int16, np.int32, np.int64]
# Source dtype names treated as integer columns.
INT_TYPES = ['int16', 'int32', 'int64']
FLOAT_TYPES_NP = [np.float16, np.float32, np.float64]
# Source dtype names treated as float columns.
FLOAT_TYPES = ['float16', 'float32', 'float64']
# Alias kept so code that used the original singular name keeps working.
FLOAT_TYPE = FLOAT_TYPES


def reduce_mem_usage(df, verbose=True):
    """Downcast int and float columns of ``df`` in place to narrower dtypes.

    Columns whose dtype name is listed in ``INT_TYPES`` or ``FLOAT_TYPES``
    are cast to the first dtype in ``INT_TYPES_NP`` / ``FLOAT_TYPES_NP``
    whose representable range contains the column's min and max. All other
    columns are left untouched.

    Floats are selected by range only, so narrowing may round values.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the resulting memory usage and the
            percentage saved.

    Returns:
        The same ``df`` object.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    # Try to use the narrowest type possible
    # for int and float columns.
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in INT_TYPES:
            candidates, info = INT_TYPES_NP, np.iinfo
        elif col_type in FLOAT_TYPES:
            # Float limits come from np.finfo; np.iinfo rejects float types.
            candidates, info = FLOAT_TYPES_NP, np.finfo
        else:
            # Can not reduce usage here.
            continue

        # Only compute the extrema for columns that can actually be reduced.
        c_min, c_max = df[col].min(), df[col].max()
        for candidate in candidates:
            limits = info(candidate)
            # Can we fit all values in col in this bit-width?
            if limits.min <= c_min <= c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero starting size to avoid ZeroDivisionError.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, reduction))
    return df
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def att_dtype(column, types, info):
    """Cast ``column`` to the last dtype in ``types`` whose range holds it.

    The column's min and max are computed once and compared (inclusively)
    against ``info(np_type).min`` / ``info(np_type).max`` for each candidate.
    The last candidate that fits wins, so pass ``types`` ordered from widest
    to narrowest to end up with the narrowest fitting dtype.

    Args:
        column: Series to convert.
        types: Iterable of numpy dtypes to try, in order.
        info: ``np.iinfo`` for integer dtypes or ``np.finfo`` for float
            dtypes; called with each candidate to obtain its limits.

    Returns:
        ``column`` cast to the chosen dtype, or ``column`` itself when no
        candidate fits (for example when the extrema are NaN).
    """
    col_min, col_max = column.min(), column.max()
    chosen = None
    for np_type in types:
        limits = info(np_type)
        # Inclusive bounds: the dtype's own min/max are representable.
        if limits.min <= col_min and col_max <= limits.max:
            chosen = np_type
    # Cast once at the end instead of once per fitting candidate.
    return column if chosen is None else column.astype(chosen)
def reduce_mem_usage(column, verbose=True):
    """Return ``column`` downcast to a narrower int/float dtype when possible.

    The dtype name decides the path: names containing ``'int'`` (this also
    matches unsigned names such as ``'uint8'``) are tried against the signed
    integer dtypes, names containing ``'float'`` against the float dtypes.
    Any other column is returned unchanged.

    Args:
        column: Series to convert.
        verbose: When true, print the column name, resulting dtype and the
            percentage of memory saved.

    Returns:
        The converted Series, or ``column`` itself when its dtype is neither
        int nor float.
    """
    col_type = str(column.dtype)

    # Default to the input so non-numeric columns pass through instead of
    # leaving ``att_column`` unbound.
    att_column = column
    if 'int' in col_type:
        # Widest first: att_dtype keeps the last (narrowest) type that fits.
        np_int = [np.int64, np.int32, np.int16, np.int8]
        att_column = att_dtype(column, np_int, np.iinfo)
    elif 'float' in col_type:
        np_float = [np.float64, np.float32, np.float16]
        att_column = att_dtype(column, np_float, np.finfo)

    if verbose:
        start_mem = column.memory_usage() / 1024**2
        end_mem = att_column.memory_usage() / 1024**2
        # Guard against a zero starting size to avoid ZeroDivisionError.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f'Column {column.name} to type {str(att_column.dtype)} with {reduction}% reduction')
    return att_column
# Apply the per-column reducer to each column (axis=0). With axis=1 the
# function would receive rows instead of columns.
df = df.apply(reduce_mem_usage, axis=0)
# Show the resulting per-column dtypes.
df.dtypes
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment