laurent-laporte-pro · August 30, 2023 07:58
diff --git a/big_matrix_benchmark.py b/big_matrix_benchmark.py
 import json
 import multiprocessing
 import sys
 import time
 import typing as t
 import typing_extensions as te
 from pathlib import Path

 import numpy as np
 import pandas as pd

 HERE = Path(__file__).parent.resolve()


 Shape = t.Tuple[int, int]
 FileFormat = te.Literal["csv", "xlsx", "json", "npz"]
 StatTable = t.Dict[Shape, t.Dict[FileFormat, float]]


 class Stat(t.NamedTuple):
    file_format: FileFormat
    shape: Shape
    save_duration: float
    load_duration: float
    size: int


 def benchmark_excel(array):
    xlsx_path = HERE.joinpath(
        f"big_excel_matrix_{array.shape[0]:04d}-{array.shape[1]:04d}.xlsx"
    )
    df = pd.DataFrame(array)
    timestamp = time.time()
    df.to_excel(xlsx_path, index=False, header=False)
    save_duration = time.time() - timestamp
    timestamp = time.time()
    pd.read_excel(xlsx_path)
    load_duration = time.time() - timestamp
    return Stat(
        file_format="xlsx",
        shape=array.shape,
        save_duration=save_duration,
        load_duration=load_duration,
        size=xlsx_path.stat().st_size,
    )


 def benchmark_json(array):
    json_path = HERE.joinpath(
        f"big_json_matrix_{array.shape[0]:04d}-{array.shape[1]:04d}.json"
    )
    obj = {
        "index": list(range(8760)),
        "columns": list(range(array.shape[1])),
        "data": array.tolist(),
    }
    timestamp = time.time()
    with json_path.open(mode="w", encoding="utf-8") as fd:
        fd.write(json.dumps(obj, indent=True))
    save_duration = time.time() - timestamp
    timestamp = time.time()
    with json_path.open(mode="r", encoding="utf-8") as fd:
        json.loads(fd.read())
    load_duration = time.time() - timestamp
    return Stat(
        file_format="json",
        shape=array.shape,
        save_duration=save_duration,
        load_duration=load_duration,
        size=json_path.stat().st_size,
    )


 def benchmark_csv(array):
    csv_path = HERE.joinpath(
        f"big_csv_matrix_{array.shape[0]:04d}-{array.shape[1]:04d}.csv"
    )
    timestamp = time.time()
    np.savetxt(csv_path, array, delimiter="\t", fmt="%.6f")
    save_duration = time.time() - timestamp
    timestamp = time.time()
    np.loadtxt(csv_path, delimiter="\t", dtype=np.float64, ndmin=2)
    load_duration = time.time() - timestamp
    return Stat(
        file_format="csv",
        shape=array.shape,
        save_duration=save_duration,
        load_duration=load_duration,
        size=csv_path.stat().st_size,
    )


 def benchmark_npz(array):
    npz_path = HERE.joinpath(
        f"big_npz_matrix_{array.shape[0]:04d}-{array.shape[1]:04d}.npz"
    )
    timestamp = time.time()
    np.savez(npz_path, array1=array)
    save_duration = time.time() - timestamp
    timestamp = time.time()
    # noinspection PyStatementEffect
    np.load(npz_path)["array1"]
    load_duration = time.time() - timestamp
    return Stat(
        file_format="npz",
        shape=array.shape,
        save_duration=save_duration,
        load_duration=load_duration,
        size=npz_path.stat().st_size,
    )


 def _print_table(stat_table: StatTable, *, file=sys.stdout, unit=1):
    # extract header
    headers = sorted(
        {c for shape, values in stat_table.items() for c in values}
    )
    print("|      | " + " | ".join(headers) + " |", file=file)
    print(
        "|------|-" + ":|-".join("-" * len(h) for h in headers) + ":|",
        file=file,
    )
    for shape, values in sorted(stat_table.items()):
        line = [shape] + [
            f"{values[header] / unit:0.3f}" for header in headers
        ]
        print("| " + " | ".join(map(str, line)) + " |", file=file)


 def create_big_matrix() -> None:
    shapes = [
        (365, 1),
        (365, 100),
        (365, 1000),
        (8760, 1),
        (8760, 10),
        (8760, 100),
        (8760, 200),
        (8760, 1000),
        (8760, 2000),
    ]
    save_durations: StatTable = {
        s: {"xlsx": 0, "json": 0, "csv": 0, "npz": 0} for s in shapes
    }
    load_durations: StatTable = {
        s: {"xlsx": 0, "json": 0, "csv": 0, "npz": 0} for s in shapes
    }
    sizes: StatTable = {
        s: {"xlsx": 0, "json": 0, "csv": 0, "npz": 0} for s in shapes
    }
    with multiprocessing.Pool(processes=20) as pool:
        results = []
        for shape in shapes:
            array = np.random.rand(*shape) * 1000
            results.append(pool.apply_async(benchmark_csv, (array,)))
            results.append(pool.apply_async(benchmark_json, (array,)))
            results.append(pool.apply_async(benchmark_excel, (array,)))
            results.append(pool.apply_async(benchmark_npz, (array,)))
        for res in results:
            stat = res.get()
            save_durations[stat.shape][stat.file_format] = stat.save_duration
            load_durations[stat.shape][stat.file_format] = stat.load_duration
            sizes[stat.shape][stat.file_format] = stat.size

    report_path = HERE.joinpath(f"report.md")
    with report_path.open(mode="w", encoding="utf-8") as report:
        print("# Big Matrices Serialization Benchmark", file=report)
        print(file=report)
        print("## Saving duration (s)", file=report)
        print(file=report)
        _print_table(save_durations, file=report)
        print(file=report)
        print("## Loading duration (s)", file=report)
        print(file=report)
        _print_table(load_durations, file=report)
        print(file=report)
        print(file=report)
        print("## File size (Mo)", file=report)
        _print_table(sizes, file=report, unit=1024 * 1024)


 if __name__ == "__main__":
    create_big_matrix()
	import json
	import multiprocessing
	import sys
	import time
	import typing as t
	import typing_extensions as te
	from pathlib import Path

	import numpy as np
	import pandas as pd

	HERE = Path(__file__).parent.resolve()


	Shape = t.Tuple[int, int]
	FileFormat = te.Literal["csv", "xlsx", "json", "npz"]
	StatTable = t.Dict[Shape, t.Dict[FileFormat, float]]


	class Stat(t.NamedTuple):
	file_format: FileFormat
	shape: Shape
	save_duration: float
	load_duration: float
	size: int


	def benchmark_excel(array):
	xlsx_path = HERE.joinpath(
	f"big_excel_matrix_{array.shape[0]:04d}-{array.shape[1]:04d}.xlsx"
	)
	df = pd.DataFrame(array)
	timestamp = time.time()
	df.to_excel(xlsx_path, index=False, header=False)
	save_duration = time.time() - timestamp
	timestamp = time.time()
	pd.read_excel(xlsx_path)
	load_duration = time.time() - timestamp
	return Stat(
	file_format="xlsx",
	shape=array.shape,
	save_duration=save_duration,
	load_duration=load_duration,
	size=xlsx_path.stat().st_size,
	)


	def benchmark_json(array):
	json_path = HERE.joinpath(
	f"big_json_matrix_{array.shape[0]:04d}-{array.shape[1]:04d}.json"
	)
	obj = {
	"index": list(range(8760)),
	"columns": list(range(array.shape[1])),
	"data": array.tolist(),
	}
	timestamp = time.time()
	with json_path.open(mode="w", encoding="utf-8") as fd:
	fd.write(json.dumps(obj, indent=True))
	save_duration = time.time() - timestamp
	timestamp = time.time()
	with json_path.open(mode="r", encoding="utf-8") as fd:
	json.loads(fd.read())
	load_duration = time.time() - timestamp
	return Stat(
	file_format="json",
	shape=array.shape,
	save_duration=save_duration,
	load_duration=load_duration,
	size=json_path.stat().st_size,
	)


	def benchmark_csv(array):
	csv_path = HERE.joinpath(
	f"big_csv_matrix_{array.shape[0]:04d}-{array.shape[1]:04d}.csv"
	)
	timestamp = time.time()
	np.savetxt(csv_path, array, delimiter="\t", fmt="%.6f")
	save_duration = time.time() - timestamp
	timestamp = time.time()
	np.loadtxt(csv_path, delimiter="\t", dtype=np.float64, ndmin=2)
	load_duration = time.time() - timestamp
	return Stat(
	file_format="csv",
	shape=array.shape,
	save_duration=save_duration,
	load_duration=load_duration,
	size=csv_path.stat().st_size,
	)


	def benchmark_npz(array):
	npz_path = HERE.joinpath(
	f"big_npz_matrix_{array.shape[0]:04d}-{array.shape[1]:04d}.npz"
	)
	timestamp = time.time()
	np.savez(npz_path, array1=array)
	save_duration = time.time() - timestamp
	timestamp = time.time()
	# noinspection PyStatementEffect
	np.load(npz_path)["array1"]
	load_duration = time.time() - timestamp
	return Stat(
	file_format="npz",
	shape=array.shape,
	save_duration=save_duration,
	load_duration=load_duration,
	size=npz_path.stat().st_size,
	)


	def _print_table(stat_table: StatTable, *, file=sys.stdout, unit=1):
	# extract header
	headers = sorted(
	{c for shape, values in stat_table.items() for c in values}
	)
	print("\| \| " + " \| ".join(headers) + " \|", file=file)
	print(
	"\|------\|-" + ":\|-".join("-" * len(h) for h in headers) + ":\|",
	file=file,
	)
	for shape, values in sorted(stat_table.items()):
	line = [shape] + [
	f"{values[header] / unit:0.3f}" for header in headers
	]
	print("\| " + " \| ".join(map(str, line)) + " \|", file=file)


	def create_big_matrix() -> None:
	shapes = [
	(365, 1),
	(365, 100),
	(365, 1000),
	(8760, 1),
	(8760, 10),
	(8760, 100),
	(8760, 200),
	(8760, 1000),
	(8760, 2000),
	]
	save_durations: StatTable = {
	s: {"xlsx": 0, "json": 0, "csv": 0, "npz": 0} for s in shapes
	}
	load_durations: StatTable = {
	s: {"xlsx": 0, "json": 0, "csv": 0, "npz": 0} for s in shapes
	}
	sizes: StatTable = {
	s: {"xlsx": 0, "json": 0, "csv": 0, "npz": 0} for s in shapes
	}
	with multiprocessing.Pool(processes=20) as pool:
	results = []
	for shape in shapes:
	array = np.random.rand(shape) 1000
	results.append(pool.apply_async(benchmark_csv, (array,)))
	results.append(pool.apply_async(benchmark_json, (array,)))
	results.append(pool.apply_async(benchmark_excel, (array,)))
	results.append(pool.apply_async(benchmark_npz, (array,)))
	for res in results:
	stat = res.get()
	save_durations[stat.shape][stat.file_format] = stat.save_duration
	load_durations[stat.shape][stat.file_format] = stat.load_duration
	sizes[stat.shape][stat.file_format] = stat.size

	report_path = HERE.joinpath(f"report.md")
	with report_path.open(mode="w", encoding="utf-8") as report:
	print("# Big Matrices Serialization Benchmark", file=report)
	print(file=report)
	print("## Saving duration (s)", file=report)
	print(file=report)
	_print_table(save_durations, file=report)
	print(file=report)
	print("## Loading duration (s)", file=report)
	print(file=report)
	_print_table(load_durations, file=report)
	print(file=report)
	print(file=report)
	print("## File size (Mo)", file=report)
	_print_table(sizes, file=report, unit=1024 * 1024)


	if __name__ == "__main__":
	create_big_matrix()