# Check memory usage and cast to different data types in pandas and polars
import pandas as pd  # astype(), info(), memory_usage(), to_numeric()
import polars as pl  # cast(), estimated_size()
import numpy as np
import pdcast as pdc  # install via `pip install pandas-downcast`
pandas_df = pd.DataFrame(
    {
        "a": np.linspace(1, 10_000_000, 10_000_000),
        "b": np.linspace(1.12, 10_000_000.12, 10_000_000),
        "c": np.random.choice([1, 0], 10_000_000),
        "d": np.random.choice(["foo", "bar", "foobar"], 10_000_000),
    }
)
polars_df = pl.DataFrame(pandas_df)
# Optional function to convert output of .memory_usage() to a more human-readable format
def make_human_readable(nbytes):
    suffixes = ["B", "KB", "MB", "GB", "TB", "PB"]
    i = 0
    while nbytes >= 1024 and i < len(suffixes) - 1:
        nbytes /= 1024.0
        i += 1
    f = ("%.2f" % nbytes).rstrip("0").rstrip(".")
    return "%s %s" % (f, suffixes[i])
# Check memory usage
pandas_df.info() # Memory use of df overall, plus column dtypes
polars_df.estimated_size(unit="mb") # Size of polars df
pandas_df.memory_usage(deep=True).apply(make_human_readable) # Memory usage per column
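# The helper can also be reused on the polars size, which estimated_size() returns in bytes by default
make_human_readable(polars_df.estimated_size())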
# Check whether float columns contain only whole numbers (i.e. can be cast to int without losing info)
(pandas_df["a"] % 1 == 0).all() # True
(pandas_df["b"] % 1 == 0).all() # False
# Using pandas-downcast (pdc)
# Floats will remain equal within np.allclose() comparison tolerance
pandas_df_pdc = pdc.downcast(pandas_df)
pandas_df_pdc.info() # --> 95.4 MB (down from 267 MB)
pandas_df_pdc.memory_usage(deep=True).apply(make_human_readable)
"""
pandas_df
# Column Dtype
--- ------ -----
0 a float64
1 b float64
2 c int32
3 d object
dtypes: float64(2), int32(1), object(1)
memory usage: 267.0+ MB
pandas_df_pdc
# Column Dtype
--- ------ -----
0 a uint32
1 b float32
2 c bool
3 d category
dtypes: bool(1), category(1), float32(1), uint32(1)
memory usage: 95.4 MB
"""
# Using pd.to_numeric() to downcast individual columns
pandas_df["c"] = pd.to_numeric(pandas_df["c"], downcast="unsigned")
pandas_df[["a", "b"]] = pandas_df[["a", "b"]].apply(pd.to_numeric, downcast="float")
# Using .astype() to explicitly choose dtypes
pandas_df_astyped = pandas_df.astype(
    {"a": "uint32", "b": "float32", "c": "bool", "d": "category"}
)
polars_df_astyped = polars_df.cast(
    {"a": pl.UInt32, "b": pl.Float32, "c": pl.Boolean, "d": pl.Categorical}
)
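# Confirm the resulting dtypes / schema
pandas_df_astyped.dtypes
polars_df_astyped.schema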
# Cast during import: pass a single dtype for all columns, or a dict mapping columns to dtypes
# (geo_lookups is assumed to be a path variable defined elsewhere)
pd.read_excel(
    f"{geo_lookups}/2021/OA11_WD21_LAD21_EW_LU.xlsx",
    usecols=["OA11CD", "WD21CD", "WD21NM"],
    dtype="category",
)
pd.read_excel(
    f"{geo_lookups}/2021/OA11_WD21_LAD21_EW_LU.xlsx",
    usecols=["OA11CD", "WD21CD", "WD21NM"],
    dtype={"OA11CD": "category", "WD21CD": "string"},
)
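# The same pattern works for CSVs; the file name below is hypothetical, for illustration only
pd.read_csv(
    "lookups.csv",
    usecols=["OA11CD", "WD21CD"],
    dtype={"OA11CD": "category", "WD21CD": "string"},
)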