Skip to content

Instantly share code, notes, and snippets.

@fschwar4
Created May 1, 2025 22:01
Show Gist options
  • Save fschwar4/768f86cba465fd3a613334516b0192df to your computer and use it in GitHub Desktop.
Save fschwar4/768f86cba465fd3a613334516b0192df to your computer and use it in GitHub Desktop.
NumPy overhead when loading arrays from different file systems?
import gc
import os
import subprocess
import time
import memray
import numpy as np
import numpy.lib.format
from datetime import datetime as dt
def read_npy_header(file_path):
    """Read only the header of a ``.npy`` file, without loading the array data.

    Parameters
    ----------
    file_path : str or path-like
        File to inspect; it is opened in binary read mode.

    Returns
    -------
    tuple
        ``(version, shape, fortran_order, dtype)`` where ``version`` is the
        format version reported by ``np.lib.format.read_magic`` and the other
        three values come from the matching header reader.

    Raises
    ------
    ValueError
        If the file's format version is neither ``(1, 0)`` nor ``(2, 0)``.
    """
    # One header parser per supported format version.
    header_readers = {
        (1, 0): np.lib.format.read_array_header_1_0,
        (2, 0): np.lib.format.read_array_header_2_0,
    }
    with open(file_path, 'rb') as stream:
        version = np.lib.format.read_magic(stream)
        parse_header = header_readers.get(version)
        if parse_header is None:
            raise ValueError(f"Unsupported .npy file version: {version}")
        # The stream is positioned right after the magic string here.
        shape, fortran_order, dtype = parse_header(stream)
    return version, shape, fortran_order, dtype
def test(path_, *, exponents=None, pause=1):
    """Run timestamped ``np.save`` / ``np.load`` round trips for growing arrays.

    For every exponent ``i`` a 1-D float64 array with ``2**i`` random values
    is written to ``path_``; the file size and ``.npy`` header are printed,
    the array is dropped from memory, loaded back from disk, and the file is
    removed again. A timestamp is printed around each step and the steps are
    separated by ``pause`` seconds.

    Parameters
    ----------
    path_ : str or path-like
        File the arrays are saved to. Anything already at this path is
        removed before the run starts and after every iteration.
    exponents : iterable of int, optional
        Array lengths as powers of two. Defaults to the original series
        26, 27, 28, 30, ..., 35 (29 is not part of it), i.e. 512 MiB up to
        256 GiB of float64 data.
    pause : float, optional
        Seconds to sleep between the individual steps. Defaults to 1.
    """
    if exponents is None:
        exponents = [26, 27, 28, 30, 31, 32, 33, 34, 35]  # end at 32 for cvfs

    # Start from a clean slate: remove leftovers of a previous run.
    subprocess.run(['rm', '-rf', path_])
    rng_ = np.random.default_rng(413284713)  # fixed seed -> reproducible data

    print('START', dt.now())
    time.sleep(pause)
    for i in exponents:
        print(i, dt.now())
        m_ = rng_.random(size=2**i, dtype=np.float64)
        np.save(path_, m_)
        print('saved:', dt.now())
        print('file size (GB):', os.path.getsize(path_) / 1024**3)
        print('header:', read_npy_header(path_))
        time.sleep(pause)

        # Release the in-memory array before reading it back from disk.
        print('before del', dt.now())
        del m_
        gc.collect()
        print('after del', dt.now())
        time.sleep(pause)

        print('start loading', dt.now())
        m_ = np.load(path_)
        print('loaded:', dt.now())
        print('size (GB)', m_.nbytes / 1024**3)
        time.sleep(pause)

        # Drop both the loaded array and the file before the next size.
        print('remove all', dt.now())
        del m_
        gc.collect()
        subprocess.run(['rm', '-rf', path_])
        print('removed:', dt.now(), end='\n\n\n')
        time.sleep(pause)
    print('END', dt.now())
if __name__ == "__main__":
    # Record the whole benchmark with memray; the trace is written to
    # "memray_test_cn.bin".
    # NOTE(review): memray documents memory_interval_ms as an integer number
    # of milliseconds - confirm that 0.1 is accepted and not truncated.
    with memray.Tracker(
        "memray_test_cn.bin",
        memory_interval_ms=0.1,
        follow_fork=True,
        native_traces=True
    ):
        # 'beegfs or cvfs location' is a placeholder for the mount point of
        # the file system under test.
        path_ = os.path.join('beegfs or cvfs location', 'test_data/cn/max_compression_chunk.npy')
        try:
            test(path_)
        finally:
            # Remove the benchmark file even when test() fails part-way, so
            # no multi-GB file is left behind.
            subprocess.run(['rm', '-rf', path_])
import gc
import os
import subprocess
import time
import memray
import numpy as np
import numpy.lib.format
from datetime import datetime as dt
def read_npy_header(file_path):
    """Return the format version and header fields of a ``.npy`` file.

    Only the magic string and the header are read; the array data itself is
    not loaded.

    Parameters
    ----------
    file_path : str or path-like
        File to inspect; it is opened in binary read mode.

    Returns
    -------
    tuple
        ``(version, shape, fortran_order, dtype)``.

    Raises
    ------
    ValueError
        If the format version is neither ``(1, 0)`` nor ``(2, 0)``.
    """
    npy_format = np.lib.format
    with open(file_path, 'rb') as fh:
        version = npy_format.read_magic(fh)
        # Guard clause: only format versions 1.0 and 2.0 are handled here.
        if version != (1, 0) and version != (2, 0):
            raise ValueError(f"Unsupported .npy file version: {version}")
        if version == (1, 0):
            header_fields = npy_format.read_array_header_1_0(fh)
        else:
            header_fields = npy_format.read_array_header_2_0(fh)
    shape, fortran_order, dtype = header_fields
    return version, shape, fortran_order, dtype
def test(path_, *, sizes=None, pause=1):
    """Run timestamped ``np.save`` / ``np.load`` round trips for given lengths.

    For every length ``i`` a 1-D float64 array with ``i`` random values is
    written to ``path_``; the file size and ``.npy`` header are printed, the
    array is dropped from memory, loaded back from disk, and the file is
    removed again. A timestamp is printed around each step and the steps are
    separated by ``pause`` seconds.

    Parameters
    ----------
    path_ : str or path-like
        File the arrays are saved to. Anything already at this path is
        removed before the run starts and after every iteration.
    sizes : iterable of int, optional
        Array lengths (number of float64 elements). Defaults to the original
        eleven consecutive lengths 268435450 ... 268435460, which step one
        element at a time across 2**28 elements (2 GiB of data).
    pause : float, optional
        Seconds to sleep between the individual steps. Defaults to 1.
    """
    if sizes is None:
        # 268435450 ... 268435460 inclusive, centred near 2**28 = 268435456.
        sizes = range(2**28 - 6, 2**28 + 5)

    # Start from a clean slate: remove leftovers of a previous run.
    subprocess.run(['rm', '-rf', path_])
    rng_ = np.random.default_rng(413284713)  # fixed seed -> reproducible data

    print('START', dt.now())
    time.sleep(pause)
    for i in sizes:
        print(i, dt.now())
        m_ = rng_.random(size=i, dtype=np.float64)
        np.save(path_, m_)
        print('saved:', dt.now())
        print('file size (GB):', os.path.getsize(path_) / 1024**3)
        print('header:', read_npy_header(path_))
        time.sleep(pause)

        # Release the in-memory array before reading it back from disk.
        print('before del', dt.now())
        del m_
        gc.collect()
        print('after del', dt.now())
        time.sleep(pause)

        print('start loading', dt.now())
        m_ = np.load(path_)
        print('loaded:', dt.now())
        print('size (GB)', m_.nbytes / 1024**3)
        time.sleep(pause)

        # Drop both the loaded array and the file before the next size.
        print('remove all', dt.now())
        del m_
        gc.collect()
        subprocess.run(['rm', '-rf', path_])
        print('removed:', dt.now(), end='\n\n\n')
        time.sleep(pause)
    print('END', dt.now())
if __name__ == "__main__":
    # Record the whole benchmark with memray; the trace is written to
    # "memray_test_ln3.bin".
    # NOTE(review): memray documents memory_interval_ms as an integer number
    # of milliseconds - confirm that 0.1 is accepted and not truncated.
    with memray.Tracker(
        "memray_test_ln3.bin",
        memory_interval_ms=0.1,
        follow_fork=True,
        native_traces=True
    ):
        # 'beegfs or cvfs location' is a placeholder for the mount point of
        # the file system under test.
        path_ = os.path.join('beegfs or cvfs location', 'test_data/max_compression_chunk.npy')
        try:
            test(path_)
        finally:
            # Remove the benchmark file even when test() fails part-way, so
            # no multi-GB file is left behind.
            subprocess.run(['rm', '-rf', path_])
@fschwar4
Copy link
Author

fschwar4 commented May 1, 2025

BeeGFS

large_beegfs
steps_beegfs

@fschwar4
Copy link
Author

fschwar4 commented May 1, 2025

cvfs

large_cvfs
steps_cvfs

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment