deedy5 · August 28, 2022 11:34
diff --git a/df_reduce_memory.py b/df_reduce_memory.py
 def df_reduce_memory(df):
     """Reduce pandas dataframe memory size by downcasting its columns.

     Columns are converted in place according to their dtype:

     * signed / unsigned integer columns are cast to the narrowest NumPy
       integer type of the same signedness whose range (bounds inclusive)
       holds the column's min and max;
     * float columns are cast to the narrowest of float16 / float32 /
       float64 whose range holds the min and max. Only the range is
       checked, so this cast can lose precision;
     * object columns are converted to ``category`` when their number of
       unique values is at most 20% of the number of rows.

     Empty columns and columns of any other dtype (bool, datetime,
     timedelta, category, pandas extension dtypes, ...) are left untouched.
     Memory usage before and after is printed to stdout.

     Args:
         df (pd.DataFrame): pandas dataframe; it is modified in place.

     Returns:
         pd.DataFrame: the same dataframe object with reduced dtypes.
     """
     # Example: df = pd.read_csv(data_dir, parse_dates=True, keep_date_col=True)

     start_mem = df.memory_usage(deep=True).sum() / 1024**2
     print(f"Memory usage of dataframe is {start_mem:.2f} MB")

     # Candidate target types, narrowest first, keyed by NumPy dtype kind.
     # Dispatching on ``kind`` keeps unsigned ints and bools out of the
     # float branch and keeps datetime-like columns away from finfo checks.
     candidates = {
         "i": (np.iinfo, (np.int8, np.int16, np.int32, np.int64)),
         "u": (np.iinfo, (np.uint8, np.uint16, np.uint32, np.uint64)),
         "f": (np.finfo, (np.float16, np.float32, np.float64)),
     }

     for col in df.columns:
         col_type = df[col].dtype
         # Empty columns have no min/max (and a zero row count); pandas
         # extension dtypes are not NumPy dtypes and are not downcast here.
         if df[col].size == 0 or not isinstance(col_type, np.dtype):
             continue

         if col_type.kind in candidates:
             limits, target_types = candidates[col_type.kind]
             c_min = df[col].min()
             c_max = df[col].max()
             for target in target_types:
                 bounds = limits(target)
                 # Inclusive bounds: a value equal to a type's limit still fits.
                 # NaN / inf extrema fail every comparison, so no cast happens.
                 if bounds.min <= c_min and c_max <= bounds.max:
                     df[col] = df[col].astype(target)
                     break
         elif col_type.kind == "O":
             # if column type = object. Convert to category if unique rows <= 20%.
             if df[col].nunique() / df[col].size * 100 <= 20:
                 df[col] = df[col].astype("category")

     end_mem = df.memory_usage(deep=True).sum() / 1024**2
     # Guard the percentage against a zero-byte starting size.
     decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
     print(f"Memory usage after optimization is: {end_mem:.2f} MB")
     print(f"Decreased by {decrease:.1f}%")
     return df
	def df_reduce_memory(df):
	    """Reduce pandas dataframe memory size by downcasting its columns.

	    Columns are converted in place according to their dtype:

	    * signed / unsigned integer columns are cast to the narrowest NumPy
	      integer type of the same signedness whose range (bounds inclusive)
	      holds the column's min and max;
	    * float columns are cast to the narrowest of float16 / float32 /
	      float64 whose range holds the min and max. Only the range is
	      checked, so this cast can lose precision;
	    * object columns are converted to ``category`` when their number of
	      unique values is at most 20% of the number of rows.

	    Empty columns and columns of any other dtype (bool, datetime,
	    timedelta, category, pandas extension dtypes, ...) are left untouched.
	    Memory usage before and after is printed to stdout.

	    Args:
	        df (pd.DataFrame): pandas dataframe; it is modified in place.

	    Returns:
	        pd.DataFrame: the same dataframe object with reduced dtypes.
	    """
	    # Example: df = pd.read_csv(data_dir, parse_dates=True, keep_date_col=True)

	    start_mem = df.memory_usage(deep=True).sum() / 1024**2
	    print(f"Memory usage of dataframe is {start_mem:.2f} MB")

	    # Candidate target types, narrowest first, keyed by NumPy dtype kind.
	    # Dispatching on ``kind`` keeps unsigned ints and bools out of the
	    # float branch and keeps datetime-like columns away from finfo checks.
	    candidates = {
	        "i": (np.iinfo, (np.int8, np.int16, np.int32, np.int64)),
	        "u": (np.iinfo, (np.uint8, np.uint16, np.uint32, np.uint64)),
	        "f": (np.finfo, (np.float16, np.float32, np.float64)),
	    }

	    for col in df.columns:
	        col_type = df[col].dtype
	        # Empty columns have no min/max (and a zero row count); pandas
	        # extension dtypes are not NumPy dtypes and are not downcast here.
	        if df[col].size == 0 or not isinstance(col_type, np.dtype):
	            continue

	        if col_type.kind in candidates:
	            limits, target_types = candidates[col_type.kind]
	            c_min = df[col].min()
	            c_max = df[col].max()
	            for target in target_types:
	                bounds = limits(target)
	                # Inclusive bounds: a value equal to a type's limit still fits.
	                # NaN / inf extrema fail every comparison, so no cast happens.
	                if bounds.min <= c_min and c_max <= bounds.max:
	                    df[col] = df[col].astype(target)
	                    break
	        elif col_type.kind == "O":
	            # if column type = object. Convert to category if unique rows <= 20%.
	            if df[col].nunique() / df[col].size * 100 <= 20:
	                df[col] = df[col].astype("category")

	    end_mem = df.memory_usage(deep=True).sum() / 1024**2
	    # Guard the percentage against a zero-byte starting size.
	    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
	    print(f"Memory usage after optimization is: {end_mem:.2f} MB")
	    print(f"Decreased by {decrease:.1f}%")
	    return df