Created
June 14, 2023 20:03
-
-
Save lucaspar/dfc4dec68ebb430ce96689b3d827d415 to your computer and use it in GitHub Desktop.
Benchmark | PyArrow conversions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Benchmark the conversion of pyarrow arrays to polars, numpy, and lists.""" | |
import time | |
from typing import Callable | |
import numpy as np | |
import pandas as pd | |
import polars as pl | |
import psutil | |
import pyarrow as pa | |
def get_memory_usage() -> float: | |
"""Returns the memory usage of the current process, in bytes.""" | |
process = psutil.Process() | |
physical_mem = process.memory_info().rss | |
# virtual_mem = process.memory_info().vms | |
return physical_mem | |
def pretty_memory_usage(mem_bytes: float) -> str: | |
"""Pretty prints the memory usage in KB, MB, GB, etc.""" | |
unit = "B" | |
for unit in ["", "K", "M", "G", "T", "P"]: | |
if mem_bytes < 1_024: | |
break | |
mem_bytes /= 1_024 | |
# round to two decimal places | |
mem_bytes = round(mem_bytes, 2) | |
return f"{mem_bytes} {unit}B" | |
def measure_memory(func: Callable) -> Callable: | |
"""Decorator to measure the memory difference of a function.""" | |
def wrapper(*args, **kwargs): | |
current_memory = get_memory_usage() | |
res = func(*args, **kwargs) | |
memory_increase = get_memory_usage() - current_memory | |
# to make sure res is actually in memory | |
assert res is not None and len(res) > 0, "Result is empty" | |
print(f"\tMemory increase: {pretty_memory_usage(memory_increase)}") | |
return res | |
return wrapper | |
@measure_memory | |
def setup(arrow_size: int = 50_000_000, target_type: type = float) -> pa.Array: | |
"""Creates a pyarrow array of the given size.""" | |
if target_type == float: | |
arr = pa.array(np.random.rand(arrow_size)) | |
elif target_type == int: | |
arr = pa.array(np.random.randint(0, 100, arrow_size)) | |
else: | |
raise ValueError(f"Unknown type: {target_type}") | |
return arr | |
@measure_memory | |
def convert_to_polars(arr: pa.Array) -> pl.Series: | |
"""Converts the array to a polars series.""" | |
polar_series = pl.from_arrow(arr) | |
assert len(polar_series) == len(arr), "Lengths do not match" | |
assert isinstance(polar_series, pl.Series), "Not a polars series" | |
return polar_series | |
@measure_memory | |
def convert_to_numpy(arr: pa.Array) -> np.ndarray: | |
"""Converts the array to a numpy array.""" | |
np_array = arr.to_numpy() | |
assert len(np_array) == len(arr), "Lengths do not match" | |
return np_array | |
@measure_memory | |
def convert_to_list(arr: pa.Array) -> list: | |
"""Converts the array to a list.""" | |
list_ = arr.tolist() | |
assert len(list_) == len(arr), "Lengths do not match" | |
return list_ | |
@measure_memory | |
def convert_to_pandas(arr: pa.Array) -> pd.Series: | |
"""Converts the array to a pandas series.""" | |
pandas_series = arr.to_pandas() | |
assert len(pandas_series) == len(arr), "Lengths do not match" | |
return pandas_series | |
def benchmark(name: str, callback: Callable, kwargs: dict): | |
"""Runs the benchmark for the given callback.""" | |
print(f"Testing {name}:") | |
start = time.time() | |
callback(**kwargs) | |
print( | |
f"\tConversion time from pyarrow to {name}:\t{time.time() - start:.4f} seconds" | |
) | |
def main(): | |
"""Run the benchmark for each conversion.""" | |
arr = setup(arrow_size=50_000_000, target_type=float) | |
print(f"Created array with size: {len(arr):,} and type {arr.type}") | |
kwargs = {"arr": arr} | |
tests = [ | |
dict(name="polars", callback=convert_to_polars, kwargs=kwargs), | |
dict(name="numpy", callback=convert_to_numpy, kwargs=kwargs), | |
dict(name="list", callback=convert_to_list, kwargs=kwargs), | |
dict(name="pandas", callback=convert_to_pandas, kwargs=kwargs), | |
] | |
for test in tests: | |
benchmark(**test) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment