Last active
August 1, 2017 05:51
-
-
Save MaxPowerWasTaken/1f4bdb6e3ff3d5491221e3a7c748cf1f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
def check_var_size_requirements(df):
    '''For integer columns, display the smallest int type which could safely store the values.

    For every integer-typed column of `df`, the column's min and max are
    compared against the representable range of each numpy integer type,
    from narrowest to widest, and a short report is printed giving the
    column name, its current dtype and the narrowest type that fits.

    Columns with no negative values are matched against the unsigned types
    (uint8 .. uint64); columns with any negative value are matched against
    the signed types (int8 .. int64).

    The range checks are inclusive: a value exactly equal to a type's
    min or max (e.g. 255 for uint8, -128 for int8) is representable in
    that type and is reported as fitting.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. Non-integer columns are ignored. The frame is
        not modified; this function is informational only.

    Returns
    -------
    None
        The report is written to stdout via print().
    '''
    # Iterate through int columns
    # (pandas' read_* is good about assigning some int type if all vals are ints and no NaNs)
    int_types = ['int', np.int8, np.int16, np.int32, np.int64,
                 np.uint8, np.uint16, np.uint32, np.uint64]
    df = df.select_dtypes(include=int_types)

    # Candidate types, narrowest first; the widest type of each family is
    # the fallback when none of the narrower ones fit.
    unsigned_candidates = (('uint8', np.uint8),
                           ('uint16', np.uint16),
                           ('uint32', np.uint32))
    signed_candidates = (('int8', np.int8),
                         ('int16', np.int16),
                         ('int32', np.int32))

    for col_name in df.columns:
        # find max / min col values
        col_min = df[col_name].min()
        col_max = df[col_name].max()

        # decide smallest int datatype which can hold max/min values above
        if col_min >= 0:
            # (can use unsigned ints)
            candidates, use_type = unsigned_candidates, 'uint64'
        else:
            # (have negative numbers, need signed integer types)
            candidates, use_type = signed_candidates, 'int64'

        for type_name, int_type in candidates:
            limits = np.iinfo(int_type)
            # Inclusive bounds: the limits themselves are valid values.
            if limits.min <= col_min and col_max <= limits.max:
                use_type = type_name
                break

        # (For now I prefer this to be informational, instead of automatically recasting types.)
        print("{}\ncurrent type {}, could fit safely in type {} \n".format(col_name,
                                                                            df[col_name].dtype,
                                                                            use_type))
######################## | |
# EXAMPLE / TEST | |
######################## | |
#test_df = pd.DataFrame({'a': [100], | |
# 'b': [1000], | |
# 'c': [10000], | |
# 'd': [100000], | |
# 'e': [-150], | |
# 'f': [100]}) | |
# | |
#test_df['f'] = test_df['f'].astype(np.int32) | |
# | |
#check_var_size_requirements(test_df) | |
# OUTPUT: | |
#a | |
#current type int64, could fit safely in type uint8 | |
# | |
#b | |
#current type int64, could fit safely in type uint16 | |
# | |
#c | |
#current type int64, could fit safely in type uint16 | |
# | |
#d | |
#current type int64, could fit safely in type uint32 | |
# | |
#e | |
#current type int64, could fit safely in type int16 | |
# | |
#f | |
#current type int32, could fit safely in type uint8 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment