# Check memory usage and cast to different data types in pandas and polars
import pandas as pd  # astype(), info(), memory_usage(), to_numeric()
import polars as pl  # cast(), estimated_size()
import numpy as np
import pdcast as pdc  # install via `pip install pandas-downcast`
pandas_df = pd.DataFrame(
    {
        "a": np.linspace(1, 10_000_000, 10_000_000),
        "b": np.linspace(1.12, 10_000_000.12, 10_000_000),
        "c": np.random.choice([1, 0], 10_000_000),
        "d": np.random.choice(["foo", "bar", "foobar"], 10_000_000),
    }
)
polars_df = pl.DataFrame(pandas_df)
# Optional function to convert output of .memory_usage() to a more human-readable format
def make_human_readable(nbytes):
    suffixes = ["B", "KB", "MB", "GB", "TB", "PB"]
    i = 0
    while nbytes >= 1024 and i < len(suffixes) - 1:
        nbytes /= 1024.0
        i += 1
    f = ("%.2f" % nbytes).rstrip("0").rstrip(".")
    return "%s %s" % (f, suffixes[i])
# Check memory usage
pandas_df.info() # Memory use of df overall, plus column dtypes
polars_df.estimated_size(unit="mb") # Size of polars df
pandas_df.memory_usage(deep=True).apply(make_human_readable) # Memory usage per column
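# The helper can also be reused on the polars size, which estimated_size() returns in bytes by default
make_human_readable(polars_df.estimated_size())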
# Check whether float columns contain only whole numbers (i.e. can be cast to int without losing info)
(pandas_df["a"] % 1 == 0).all() # True
(pandas_df["b"] % 1 == 0).all() # False
# Using pandas-downcast (pdc)
# Floats will remain equal within np.allclose() comparison tolerance
pandas_df_pdc = pdc.downcast(pandas_df)
pandas_df_pdc.info() # --> 95.4 MB (down from 267 MB)
pandas_df_pdc.memory_usage(deep=True).apply(make_human_readable)
"""
pandas_df
# Column Dtype
--- ------ -----
0 a float64
1 b float64
2 c int32
3 d object
dtypes: float64(2), int32(1), object(1)
memory usage: 267.0+ MB
pandas_df_pdc
# Column Dtype
--- ------ -----
0 a uint32
1 b float32
2 c bool
3 d category
dtypes: bool(1), category(1), float32(1), uint32(1)
memory usage: 95.4 MB
"""
# Using pd.to_numeric() to downcast individual columns
pandas_df["c"] = pd.to_numeric(pandas_df["c"], downcast="unsigned")
pandas_df[["a", "b"]] = pandas_df[["a", "b"]].apply(pd.to_numeric, downcast="float")
# Using .astype() to explicitly choose dtypes
pandas_df_astyped = pandas_df.astype(
    {"a": "uint32", "b": "float32", "c": "bool", "d": "category"}
)
polars_df_astyped = polars_df.cast(
    {"a": pl.UInt32, "b": pl.Float32, "c": pl.Boolean, "d": pl.Categorical}
)
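# Confirm the resulting dtypes / schema
pandas_df_astyped.dtypes
polars_df_astyped.schema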
# Cast during import: pass a single dtype for all columns, or a dict mapping columns to dtypes
# (geo_lookups is assumed to be a path variable defined elsewhere)
pd.read_excel(
    f"{geo_lookups}/2021/OA11_WD21_LAD21_EW_LU.xlsx",
    usecols=["OA11CD", "WD21CD", "WD21NM"],
    dtype="category",
)
pd.read_excel(
    f"{geo_lookups}/2021/OA11_WD21_LAD21_EW_LU.xlsx",
    usecols=["OA11CD", "WD21CD", "WD21NM"],
    dtype={"OA11CD": "category", "WD21CD": "string"},
)
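# The same pattern works for CSVs; the file name below is hypothetical, for illustration only
pd.read_csv(
    "lookups.csv",
    usecols=["OA11CD", "WD21CD"],
    dtype={"OA11CD": "category", "WD21CD": "string"},
)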