MonksterFX · January 18, 2021 22:11
diff --git a/pandas_dataframe_infer_dtypes.py b/pandas_dataframe_infer_dtypes.py
 def infer_df(df, hard_mode=False, float_to_int=False, mf=None):
     """Suggest a smaller dtype for each column of a DataFrame.

     The inference is purely range based: for every integer or floating
     column the column minimum and maximum are compared against the limits
     of each candidate dtype (scaled by ``mf``) and the first candidate that
     contains both is chosen.  Precision is not considered, so a float
     column may be mapped to a dtype that cannot represent its values
     exactly.

     Args:
         df: DataFrame to inspect.  When ``float_to_int`` is true, float
             columns holding only whole numbers are converted to ``int64``
             in place on this object.
         hard_mode: When ``mf`` is not given, use the full range of each
             candidate dtype (factor 1) instead of half of it (factor 0.5).
         float_to_int: Treat float columns that contain only finite whole
             numbers as integer columns.
         mf: Optional multiplication factor applied to the dtype limits.
             If ``None`` it is derived from ``hard_mode``.

     Returns:
         dict mapping column name to the suggested dtype name.  Columns for
         which no suggestion could be made are omitted.
     """
     ret = {}

     # ToDo: How much does auto conversion cost
     # Honor an explicitly passed multiplication factor; otherwise derive it.
     if mf is None:
         mf = 1 if hard_mode else 0.5

     # supported datatypes, ordered from smallest to largest
     integers = ['int8', 'int16', 'int32', 'int64']
     floats = ['float16', 'float32', 'float64']

     # ToDo: Unsigned Integer

     # (min, max, name) borders for each datatype
     b_integers = [(np.iinfo(i).min, np.iinfo(i).max, i) for i in integers]
     b_floats = [(np.finfo(f).min, np.finfo(f).max, f) for f in floats]

     n_rows = len(df)

     for c in df.columns:
         _type = df[c].dtype

         # pandas extension dtypes (categorical, nullable ints, ...) are not
         # numpy dtypes and make np.issubdtype raise, so leave them alone.
         if not isinstance(_type, np.dtype):
             continue

         # if a column is set to float, but could be int
         if float_to_int and np.issubdtype(_type, np.floating):
             values = df[c].to_numpy(dtype='float64')
             # NaN and +/-inf fail both comparisons, so this also rejects
             # values that cannot be cast to an integer at all.
             fits_int64 = ((values >= -2.0 ** 63) & (values < 2.0 ** 63)).all()
             if fits_int64 and not np.remainder(values, 1).any():
                 df[c] = df[c].astype('int64')
                 _type = df[c].dtype

         # convert type of column to smallest possible
         is_integer = np.issubdtype(_type, np.integer)
         if is_integer or np.issubdtype(_type, np.floating):
             borders = b_integers if is_integer else b_floats

             _min = df[c].min()
             _max = df[c].max()

             for low, high, name in borders:
                 # Inclusive bounds: a value equal to the limit still fits.
                 # NaN extrema (empty / all-NaN column) never match.
                 if low * mf <= _min and _max <= high * mf:
                     ret[c] = name
                     break

         # Few distinct values relative to the row count -> categorical.
         # The n_rows check avoids dividing by zero on an empty frame.
         if _type == 'object' and n_rows and len(df[c].unique()) / n_rows < 0.1:
             ret[c] = 'category'

     return ret
	def infer_df(df, hard_mode=False, float_to_int=False, mf=None):
	    """Suggest a smaller dtype for each column of a DataFrame.

	    The inference is purely range based: for every integer or floating
	    column the column minimum and maximum are compared against the limits
	    of each candidate dtype (scaled by ``mf``) and the first candidate that
	    contains both is chosen.  Precision is not considered, so a float
	    column may be mapped to a dtype that cannot represent its values
	    exactly.

	    Args:
	        df: DataFrame to inspect.  When ``float_to_int`` is true, float
	            columns holding only whole numbers are converted to ``int64``
	            in place on this object.
	        hard_mode: When ``mf`` is not given, use the full range of each
	            candidate dtype (factor 1) instead of half of it (factor 0.5).
	        float_to_int: Treat float columns that contain only finite whole
	            numbers as integer columns.
	        mf: Optional multiplication factor applied to the dtype limits.
	            If ``None`` it is derived from ``hard_mode``.

	    Returns:
	        dict mapping column name to the suggested dtype name.  Columns for
	        which no suggestion could be made are omitted.
	    """
	    ret = {}

	    # ToDo: How much does auto conversion cost
	    # Honor an explicitly passed multiplication factor; otherwise derive it.
	    if mf is None:
	        mf = 1 if hard_mode else 0.5

	    # supported datatypes, ordered from smallest to largest
	    integers = ['int8', 'int16', 'int32', 'int64']
	    floats = ['float16', 'float32', 'float64']

	    # ToDo: Unsigned Integer

	    # (min, max, name) borders for each datatype
	    b_integers = [(np.iinfo(i).min, np.iinfo(i).max, i) for i in integers]
	    b_floats = [(np.finfo(f).min, np.finfo(f).max, f) for f in floats]

	    n_rows = len(df)

	    for c in df.columns:
	        _type = df[c].dtype

	        # pandas extension dtypes (categorical, nullable ints, ...) are not
	        # numpy dtypes and make np.issubdtype raise, so leave them alone.
	        if not isinstance(_type, np.dtype):
	            continue

	        # if a column is set to float, but could be int
	        if float_to_int and np.issubdtype(_type, np.floating):
	            values = df[c].to_numpy(dtype='float64')
	            # NaN and +/-inf fail both comparisons, so this also rejects
	            # values that cannot be cast to an integer at all.
	            fits_int64 = ((values >= -2.0 ** 63) & (values < 2.0 ** 63)).all()
	            if fits_int64 and not np.remainder(values, 1).any():
	                df[c] = df[c].astype('int64')
	                _type = df[c].dtype

	        # convert type of column to smallest possible
	        is_integer = np.issubdtype(_type, np.integer)
	        if is_integer or np.issubdtype(_type, np.floating):
	            borders = b_integers if is_integer else b_floats

	            _min = df[c].min()
	            _max = df[c].max()

	            for low, high, name in borders:
	                # Inclusive bounds: a value equal to the limit still fits.
	                # NaN extrema (empty / all-NaN column) never match.
	                if low * mf <= _min and _max <= high * mf:
	                    ret[c] = name
	                    break

	        # Few distinct values relative to the row count -> categorical.
	        # The n_rows check avoids dividing by zero on an empty frame.
	        if _type == 'object' and n_rows and len(df[c].unique()) / n_rows < 0.1:
	            ret[c] = 'category'

	    return ret