Created
January 18, 2021 22:11
-
-
Save MonksterFX/e2cc635e62dbec496edf044e06243633 to your computer and use it in GitHub Desktop.
Infer Pandas DataFrame DataTypes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _is_integral_float(values):
    """Return True when every value in the float array can be cast to int64 losslessly."""
    if values.size == 0:
        return True
    # NaN / +-inf have no integer representation.
    if not np.isfinite(values).all():
        return False
    # Values outside the int64 range would overflow on the cast.
    if values.min() < -2.0 ** 63 or values.max() >= 2.0 ** 63:
        return False
    return not np.remainder(values, 1).any()


def infer_df(df, hard_mode=False, float_to_int=False, mf=None):
    """Suggest a smaller dtype for each column of a pandas DataFrame.

    Integer columns are matched against int8/int16/int32/int64 and float
    columns against float16/float32/float64; the first candidate whose
    (scaled) range contains the column's min and max is chosen.  Only the
    value range is checked, so a suggested float type may lose precision.
    Unsigned integer columns are matched against the signed candidates.
    Object columns whose ratio of unique values to rows is below 0.1 are
    suggested as ``'category'``.

    Args:
        df: The DataFrame to inspect.  When ``float_to_int`` is true, float
            columns holding only whole numbers are converted to ``int64``
            in place on ``df`` before the range check.
        hard_mode: Selects the default multiplication factor: ``1`` (use
            the full range of each type) when true, ``0.5`` otherwise.
        float_to_int: Treat float columns containing only finite whole
            numbers as integer columns.
        mf: Multiplication factor applied to each type's min/max bounds.
            When ``None`` it is derived from ``hard_mode``.

    Returns:
        dict mapping column name to the suggested dtype name.  Columns for
        which no suggestion applies are omitted.
    """
    ret = {}

    # Honour an explicit factor; fall back to the hard_mode default otherwise.
    if mf is None:
        mf = 1 if hard_mode else 0.5

    # Supported candidate dtypes, ordered smallest first.
    # TODO: unsigned integers
    integers = ['int8', 'int16', 'int32', 'int64']
    floats = ['float16', 'float32', 'float64']

    # (lower bound, upper bound, dtype name) for every candidate.  Float
    # bounds are converted to Python floats so scaling by mf is not done in
    # reduced precision.
    b_integers = [(np.iinfo(i).min, np.iinfo(i).max, i) for i in integers]
    b_floats = [
        (float(np.finfo(f).min), float(np.finfo(f).max), f) for f in floats
    ]

    n_rows = len(df)

    for c in df.columns:
        _type = df[c].dtype
        # np.issubdtype cannot interpret pandas extension dtypes (e.g.
        # categorical), so only plain numpy dtypes enter the numeric paths.
        is_numpy = isinstance(_type, np.dtype)

        # A column stored as float that only holds whole numbers.
        if float_to_int and is_numpy and np.issubdtype(_type, np.floating):
            if _is_integral_float(df[c].to_numpy()):
                df[c] = df[c].astype('int64')
                _type = df[c].dtype

        # Pick the smallest candidate whose scaled range holds the column.
        if is_numpy and (
            np.issubdtype(_type, np.integer) or np.issubdtype(_type, np.floating)
        ):
            borders = b_integers if np.issubdtype(_type, np.integer) else b_floats
            _min = df[c].min()
            _max = df[c].max()
            for low, high, name in borders:
                # Inclusive bounds: a value equal to the limit still fits.
                # Empty / all-NaN columns yield NaN here and never match.
                if low * mf <= _min and _max <= high * mf:
                    ret[c] = name
                    break

        # Low-cardinality object columns; n_rows guards the division for
        # empty frames.
        if (
            _type == 'object'
            and n_rows
            and df[c].nunique(dropna=False) / n_rows < 0.1
        ):
            ret[c] = 'category'

    return ret
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment