Last active
August 30, 2023 07:58
-
-
Save laurent-laporte-pro/ba5c0cd7914fafe30c921f0b80cac1d8 to your computer and use it in GitHub Desktop.
2D-NumPy Matrix Benchmark: Measuring Read/Write Times and Memory Usage
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import multiprocessing | |
import sys | |
import time | |
import typing as t | |
import typing_extensions as te | |
from pathlib import Path | |
import numpy as np | |
import pandas as pd | |
# Directory containing this script; benchmark files and the report are written here.
HERE = Path(__file__).parent.resolve()

# (rows, columns) of a 2D matrix.
Shape = t.Tuple[int, int]
# Serialization formats covered by the benchmark.
FileFormat = te.Literal["csv", "xlsx", "json", "npz"]
# Per-shape, per-format metric table (durations in seconds or sizes in bytes).
StatTable = t.Dict[Shape, t.Dict[FileFormat, float]]
class Stat(t.NamedTuple):
    """Result of benchmarking one file format for one matrix shape."""

    file_format: FileFormat  # serialization format that was benchmarked
    shape: Shape  # (rows, columns) of the benchmarked matrix
    save_duration: float  # seconds spent writing the file
    load_duration: float  # seconds spent reading it back
    size: int  # size of the written file, in bytes
def benchmark_excel(array):
    """Benchmark round-tripping *array* through an Excel (.xlsx) file.

    Args:
        array: 2D NumPy array to serialize.

    Returns:
        A ``Stat`` with save/load durations (seconds) and file size (bytes)
        for the "xlsx" format.  The file is left on disk next to this script.
    """
    xlsx_path = HERE.joinpath(
        f"big_excel_matrix_{array.shape[0]:04d}-{array.shape[1]:04d}.xlsx"
    )
    df = pd.DataFrame(array)
    # perf_counter is monotonic — time.time() can jump under clock adjustments.
    timestamp = time.perf_counter()
    df.to_excel(xlsx_path, index=False, header=False)
    save_duration = time.perf_counter() - timestamp
    timestamp = time.perf_counter()
    # header=None matches the header-less layout written above; the default
    # would consume the first data row as column names and mismeasure the load.
    pd.read_excel(xlsx_path, header=None)
    load_duration = time.perf_counter() - timestamp
    return Stat(
        file_format="xlsx",
        shape=array.shape,
        save_duration=save_duration,
        load_duration=load_duration,
        size=xlsx_path.stat().st_size,
    )
def benchmark_json(array):
    """Benchmark round-tripping *array* through a JSON file.

    Args:
        array: 2D NumPy array to serialize.

    Returns:
        A ``Stat`` with save/load durations (seconds) and file size (bytes)
        for the "json" format.  The file is left on disk next to this script.
    """
    json_path = HERE.joinpath(
        f"big_json_matrix_{array.shape[0]:04d}-{array.shape[1]:04d}.json"
    )
    obj = {
        # Use the actual row count — a hard-coded 8760 was wrong for the
        # (365, N) shapes exercised by the benchmark.
        "index": list(range(array.shape[0])),
        "columns": list(range(array.shape[1])),
        "data": array.tolist(),
    }
    # perf_counter is monotonic — time.time() can jump under clock adjustments.
    timestamp = time.perf_counter()
    with json_path.open(mode="w", encoding="utf-8") as fd:
        # NOTE: indent=True behaves as indent=1 (bool is an int subclass);
        # kept as-is so the measured file sizes stay comparable.
        fd.write(json.dumps(obj, indent=True))
    save_duration = time.perf_counter() - timestamp
    timestamp = time.perf_counter()
    with json_path.open(mode="r", encoding="utf-8") as fd:
        json.loads(fd.read())
    load_duration = time.perf_counter() - timestamp
    return Stat(
        file_format="json",
        shape=array.shape,
        save_duration=save_duration,
        load_duration=load_duration,
        size=json_path.stat().st_size,
    )
def benchmark_csv(array):
    """Benchmark round-tripping *array* through a delimited text file.

    Args:
        array: 2D NumPy array to serialize.

    Returns:
        A ``Stat`` with save/load durations (seconds) and file size (bytes)
        for the "csv" format.  The file is left on disk next to this script.
    """
    csv_path = HERE.joinpath(
        f"big_csv_matrix_{array.shape[0]:04d}-{array.shape[1]:04d}.csv"
    )
    # perf_counter is monotonic — time.time() can jump under clock adjustments.
    timestamp = time.perf_counter()
    # NOTE(review): the file is tab-delimited despite the .csv extension;
    # kept as-is since downstream sizes/durations depend on this format.
    np.savetxt(csv_path, array, delimiter="\t", fmt="%.6f")
    save_duration = time.perf_counter() - timestamp
    timestamp = time.perf_counter()
    np.loadtxt(csv_path, delimiter="\t", dtype=np.float64, ndmin=2)
    load_duration = time.perf_counter() - timestamp
    return Stat(
        file_format="csv",
        shape=array.shape,
        save_duration=save_duration,
        load_duration=load_duration,
        size=csv_path.stat().st_size,
    )
def benchmark_npz(array):
    """Benchmark round-tripping *array* through a compressed NumPy (.npz) file.

    Args:
        array: 2D NumPy array to serialize.

    Returns:
        A ``Stat`` with save/load durations (seconds) and file size (bytes)
        for the "npz" format.  The file is left on disk next to this script.
    """
    npz_path = HERE.joinpath(
        f"big_npz_matrix_{array.shape[0]:04d}-{array.shape[1]:04d}.npz"
    )
    # perf_counter is monotonic — time.time() can jump under clock adjustments.
    timestamp = time.perf_counter()
    np.savez(npz_path, array1=array)
    save_duration = time.perf_counter() - timestamp
    timestamp = time.perf_counter()
    # np.load returns an NpzFile holding an open file handle; use a context
    # manager so the handle is closed (the original leaked it).  Indexing
    # "array1" forces the lazy read so the load is actually measured.
    with np.load(npz_path) as npz_file:
        npz_file["array1"]
    load_duration = time.perf_counter() - timestamp
    return Stat(
        file_format="npz",
        shape=array.shape,
        save_duration=save_duration,
        load_duration=load_duration,
        size=npz_path.stat().st_size,
    )
def _print_table(stat_table: StatTable, *, file=sys.stdout, unit=1):
    """Render *stat_table* as a right-aligned Markdown table.

    One row per shape (sorted), one column per file format (sorted).
    Cell values are divided by *unit* and formatted with 3 decimals.
    """
    # Column headers: the union of every format present in any row.
    columns = sorted({fmt for row in stat_table.values() for fmt in row})
    print("| | " + " | ".join(columns) + " |", file=file)
    separator = ":|-".join("-" * len(col) for col in columns)
    print("|------|-" + separator + ":|", file=file)
    for shape in sorted(stat_table):
        row = stat_table[shape]
        cells = [str(shape)]
        cells.extend(f"{row[col] / unit:0.3f}" for col in columns)
        print("| " + " | ".join(cells) + " |", file=file)
def create_big_matrix(processes: int = 20) -> None:
    """Benchmark saving/loading random 2D matrices in several file formats.

    For each shape a random matrix is generated, and each format
    (csv, json, xlsx, npz) is benchmarked in a worker process.  The
    collected save durations, load durations and file sizes are written
    as Markdown tables to ``report.md`` next to this script.

    Args:
        processes: size of the worker pool (default 20, the historical value).
    """
    shapes = [
        (365, 1),
        (365, 100),
        (365, 1000),
        (8760, 1),
        (8760, 10),
        (8760, 100),
        (8760, 200),
        (8760, 1000),
        (8760, 2000),
    ]
    file_formats = ("xlsx", "json", "csv", "npz")
    # One table per metric, pre-filled with zeros for every shape/format.
    save_durations: StatTable = {s: dict.fromkeys(file_formats, 0) for s in shapes}
    load_durations: StatTable = {s: dict.fromkeys(file_formats, 0) for s in shapes}
    sizes: StatTable = {s: dict.fromkeys(file_formats, 0) for s in shapes}

    benchmarks = (benchmark_csv, benchmark_json, benchmark_excel, benchmark_npz)
    with multiprocessing.Pool(processes=processes) as pool:
        results = []
        for shape in shapes:
            array = np.random.rand(*shape) * 1000
            for benchmark in benchmarks:
                results.append(pool.apply_async(benchmark, (array,)))
        for res in results:
            stat = res.get()
            save_durations[stat.shape][stat.file_format] = stat.save_duration
            load_durations[stat.shape][stat.file_format] = stat.load_duration
            sizes[stat.shape][stat.file_format] = stat.size

    report_path = HERE.joinpath("report.md")  # was a placeholder-less f-string
    with report_path.open(mode="w", encoding="utf-8") as report:
        print("# Big Matrices Serialization Benchmark", file=report)
        print(file=report)
        print("## Saving duration (s)", file=report)
        print(file=report)
        _print_table(save_durations, file=report)
        print(file=report)
        print("## Loading duration (s)", file=report)
        print(file=report)
        _print_table(load_durations, file=report)
        print(file=report)
        print(file=report)
        print("## File size (Mo)", file=report)
        _print_table(sizes, file=report, unit=1024 * 1024)
# Script entry point: run the full benchmark suite when executed directly.
if __name__ == "__main__":
    create_big_matrix()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment