Skip to content

Instantly share code, notes, and snippets.

@MonksterFX
Created January 18, 2021 22:11
Show Gist options
  • Save MonksterFX/e2cc635e62dbec496edf044e06243633 to your computer and use it in GitHub Desktop.
Save MonksterFX/e2cc635e62dbec496edf044e06243633 to your computer and use it in GitHub Desktop.
Infer Pandas DataFrame DataTypes
def _smallest_fitting_dtype(borders, low, high, mf):
    """Return the first dtype name whose scaled range contains [low, high], else None."""
    for b_min, b_max, name in borders:
        if b_min * mf <= low and high <= b_max * mf:
            return name
    return None


def _is_whole_valued(col):
    """Return True if *col* is non-empty, entirely finite and has no fractional part."""
    values = col.to_numpy()
    if values.size == 0:
        return False
    # NaN/inf cannot be represented by an integer dtype, so they disqualify
    # the column outright instead of being skipped by a NaN-ignoring sum.
    if not np.isfinite(values).all():
        return False
    return not np.remainder(values, 1).any()


def infer_df(df, hard_mode=False, float_to_int=False, mf=None):
    """Suggest a smaller dtype for each column of a DataFrame.

    The suggestion is based purely on the value range (min/max) of each
    column; float precision is not taken into account. The input frame is
    not modified.

    Args:
        df: DataFrame whose columns are inspected.
        hard_mode: When ``mf`` is not given, use the full range of each
            dtype (factor 1) instead of half of it (factor 0.5).
        float_to_int: Suggest an integer dtype for float columns that only
            hold finite whole numbers.
        mf: Multiplication factor applied to the dtype limits before
            comparing. When ``None`` it is derived from ``hard_mode``.

    Returns:
        dict mapping column label to a dtype name (``'int8'`` ... ``'int64'``,
        ``'float16'`` ... ``'float64'`` or ``'category'``). Columns for which
        no suggestion is made are absent from the dict.
    """
    # ToDo: How much does auto conversion cost
    # Only fall back to the hard_mode-derived factor when the caller did not
    # supply one explicitly.
    if mf is None:
        mf = 1 if hard_mode else 0.5

    # supported datatypes, ordered from smallest to largest
    integers = ['int8', 'int16', 'int32', 'int64']
    floats = ['float16', 'float32', 'float64']
    # ToDo: Unsigned Integer

    # Limits are converted to plain Python numbers so that scaling by ``mf``
    # is not carried out in float16/float32 precision.
    b_integers = [(int(np.iinfo(i).min), int(np.iinfo(i).max), i) for i in integers]
    b_floats = [(float(np.finfo(f).min), float(np.finfo(f).max), f) for f in floats]

    ret = {}
    n_rows = len(df)
    for c in df.columns:
        col = df[c]
        _type = col.dtype

        # np.issubdtype raises TypeError for pandas extension dtypes
        # (e.g. categorical), so those columns are skipped.
        if not isinstance(_type, np.dtype):
            continue

        is_int = np.issubdtype(_type, np.integer)
        is_float = np.issubdtype(_type, np.floating)

        if is_int or is_float:
            _min = col.min()
            _max = col.max()
            found = None
            if is_int:
                found = _smallest_fitting_dtype(b_integers, _min, _max, mf)
            else:
                # a float column that could be int
                if float_to_int and _is_whole_valued(col):
                    # Exact Python ints avoid float rounding at the int64 edge.
                    found = _smallest_fitting_dtype(
                        b_integers, int(_min), int(_max), mf
                    )
                if found is None:
                    found = _smallest_fitting_dtype(b_floats, _min, _max, mf)
            if found is not None:
                ret[c] = found
        elif _type == 'object' and n_rows and len(col.unique()) / n_rows < 0.1:
            # Few distinct values relative to the row count.
            ret[c] = 'category'
    return ret
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment