Skip to content

Instantly share code, notes, and snippets.

@DeflateAwning
Created March 25, 2025 21:05
Show Gist options
  • Save DeflateAwning/dd19fd9089e7529b6d26322c4aed042d to your computer and use it in GitHub Desktop.
Save DeflateAwning/dd19fd9089e7529b6d26322c4aed042d to your computer and use it in GitHub Desktop.
Benchmark the conversion of NumPy to Polars vs. NumPy to Pandas to Polars
"""
Related to https://github.com/pydata/xarray/issues/10135

Results when run under the VS Code debugger (mean seconds per conversion):
Shape | NumPy → Polars | NumPy → Pandas → Polars
-----------------------------------------------------------------
10,000 x 10 | 0.003997 s | 0.033097 s
10,000 x 200 | 0.002424 s | 0.050915 s
100,000 x 10 | 0.000278 s | 0.021541 s
100,000 x 200 | 0.002266 s | 0.090521 s
1,000,000 x 10 | 0.000154 s | 0.045166 s
1,000,000 x 200 | 0.002373 s | 0.466811 s
10,000,000 x 10 | 0.000161 s | 0.254206 s

Results from a normal run, without the debugger (mean seconds per conversion):
Shape | NumPy → Polars | NumPy → Pandas → Polars
-----------------------------------------------------------------
10,000 x 10 | 0.000153 s | 0.006164 s
10,000 x 200 | 0.000791 s | 0.021460 s
100,000 x 10 | 0.000071 s | 0.007551 s
100,000 x 200 | 0.000794 s | 0.050238 s
1,000,000 x 10 | 0.000062 s | 0.024121 s
1,000,000 x 200 | 0.000781 s | 0.383526 s
10,000,000 x 10 | 0.000063 s | 0.232983 s
"""
import numpy as np
import pandas as pd
import polars as pl
import timeit
# Benchmark grid: every entry is a (row_count, col_count) pair describing the
# data set generated for one table row.
shapes = [
    (10_000, 10),
    (10_000, 200),
    (100_000, 10),
    (100_000, 200),
    (1_000_000, 10),
    (1_000_000, 200),
    (10_000_000, 10),
]

# Number of timed calls per measurement.
REPEATS = 5
def time_numpy_to_polars(arr: dict) -> float:
    """Return the mean time, in seconds, to build a Polars DataFrame from ``arr``.

    Args:
        arr: Mapping of column name to a 1-D NumPy array; all arrays are
            expected to have the same length.

    Returns:
        The total ``timeit`` time for ``REPEATS`` constructions divided by
        ``REPEATS``.

    Raises:
        AssertionError: If the resulting frame's row or column count does not
            match the input (the checks are skipped when Python runs with
            ``-O``).
    """
    # Derive the expected shape from the input instead of hard-coding the
    # sizes currently listed in the benchmark, so other shapes do not trip the
    # sanity checks. Computed once, outside the timed callable.
    expected_cols = len(arr)
    expected_rows = len(next(iter(arr.values()))) if arr else 0

    def fn():
        df_pl = pl.DataFrame(arr)
        # Cheap sanity checks that the conversion produced the full data set.
        assert df_pl.height == expected_rows
        assert len(df_pl.columns) == expected_cols
        return df_pl

    # timeit returns the total for all calls; divide to get the per-call mean.
    return timeit.timeit(fn, number=REPEATS) / REPEATS
def time_numpy_to_pandas_to_polars(arr: dict) -> float:
    """Return the mean time, in seconds, to convert ``arr`` to Polars via pandas.

    Each timed call builds a ``pandas.DataFrame`` from ``arr`` and then
    converts it with ``pl.from_pandas(..., rechunk=True)``.

    Args:
        arr: Mapping of column name to a 1-D NumPy array; all arrays are
            expected to have the same length.

    Returns:
        The total ``timeit`` time for ``REPEATS`` conversions divided by
        ``REPEATS``.

    Raises:
        AssertionError: If the resulting frame's row or column count does not
            match the input (the checks are skipped when Python runs with
            ``-O``).
    """
    # Derive the expected shape from the input instead of hard-coding the
    # sizes currently listed in the benchmark, so other shapes do not trip the
    # sanity checks. Computed once, outside the timed callable.
    expected_cols = len(arr)
    expected_rows = len(next(iter(arr.values()))) if arr else 0

    def fn():
        df = pd.DataFrame(arr)
        df_pl = pl.from_pandas(df, rechunk=True)
        # Cheap sanity checks that the conversion produced the full data set.
        assert df_pl.height == expected_rows
        assert len(df_pl.columns) == expected_cols
        # Drop the local reference to the intermediate pandas frame before
        # handing the Polars frame back.
        del df
        return df_pl

    # timeit returns the total for all calls; divide to get the per-call mean.
    return timeit.timeit(fn, number=REPEATS) / REPEATS
def benchmark():
    """Time both conversion paths for every shape in ``shapes`` and print a table.

    For each ``(row_count, col_count)`` pair, a dict of ``col_count`` random
    arrays of length ``row_count`` is generated separately for each path, the
    path is timed, and one formatted row is printed. Returns ``None``.
    """
    # Every timing cell is rendered as "<number> s". The header cells must be
    # widened by the unit suffix so the "|" separators line up with the rows.
    unit = " s"
    shape_w, polars_w, pandas_w = 15, 18, 23
    header = (
        f"{'Shape':>{shape_w}} | "
        f"{'NumPy → Polars':>{polars_w + len(unit)}} | "
        f"{'NumPy → Pandas → Polars':>{pandas_w + len(unit)}}"
    )
    print(header)
    # Make the rule exactly as wide as the table.
    print("-" * len(header))

    for row_count, col_count in shapes:
        # The pandas path is measured first; its input is deleted before the
        # second data set is generated, so only one input dict is alive at a
        # time.
        arr1 = {f"col_{i}": np.random.rand(row_count) for i in range(col_count)}
        t_np_pd_polars = time_numpy_to_pandas_to_polars(arr1)
        del arr1

        arr2 = {f"col_{i}": np.random.rand(row_count) for i in range(col_count)}
        t_np_polars = time_numpy_to_polars(arr2)
        del arr2

        shape_str = f"{row_count:,} x {col_count}"
        print(
            f"{shape_str:>{shape_w}} | "
            f"{t_np_polars:>{polars_w}.6f}{unit} | "
            f"{t_np_pd_polars:>{pandas_w}.6f}{unit}"
        )
# Run the whole benchmark five times in a row, but only when this file is
# executed as a script, so that importing it does not start the benchmark.
if __name__ == "__main__":
    for _ in range(5):
        benchmark()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment