Created
May 1, 2025 22:01
-
-
Save fschwar4/768f86cba465fd3a613334516b0192df to your computer and use it in GitHub Desktop.
NumPy overhead for array loading with different file systems?
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gc | |
import os | |
import subprocess | |
import time | |
import memray | |
import numpy as np | |
import numpy.lib.format | |
from datetime import datetime as dt | |
def read_npy_header(file_path):
    """Read only the header of a ``.npy`` file, without loading the data.

    Parameters
    ----------
    file_path : str or path-like
        Path of the ``.npy`` file; it is opened in binary read mode.

    Returns
    -------
    tuple
        ``(version, shape, fortran_order, dtype)``. ``version`` is the value
        returned by ``np.lib.format.read_magic``; the other three items are
        what the header reader for that version returns.

    Raises
    ------
    ValueError
        If the format version is neither ``(1, 0)`` nor ``(2, 0)``.
    """
    # One header reader per supported format version.
    header_readers = {
        (1, 0): np.lib.format.read_array_header_1_0,
        (2, 0): np.lib.format.read_array_header_2_0,
    }
    with open(file_path, 'rb') as f:
        version = np.lib.format.read_magic(f)
        reader = header_readers.get(version)
        if reader is None:
            raise ValueError(f"Unsupported .npy file version: {version}")
        shape, fortran_order, dtype = reader(f)
    return version, shape, fortran_order, dtype
def test(path_, powers=None):
    """Save and reload random float64 arrays of growing size at ``path_``.

    For every exponent ``i`` a 1-D array of ``2**i`` random float64 values is
    generated, written with ``np.save``, released, read back with ``np.load``
    and finally released again together with the file. A timestamp is
    printed around every step.

    Parameters
    ----------
    path_ : str
        Target ``.npy`` file. Whatever is at this path is removed with
        ``rm -rf`` before the run and after every iteration.
    powers : iterable of int, optional
        Exponents to run. When omitted, the original hard-coded list is
        used, so existing ``test(path_)`` calls behave as before.
    """
    if powers is None:
        # NOTE(review): 29 is absent from this list - confirm it is intended.
        powers = (26, 27, 28, 30, 31, 32, 33, 34, 35)  # end at 32 for cvfs

    subprocess.run(['rm', '-rf', path_])
    rng_ = np.random.default_rng(413284713)  # fixed seed
    print('START', dt.now())
    # The one-second pauses presumably keep the phases apart in the
    # memory timeline - TODO confirm.
    time.sleep(1)
    for i in powers:
        print(i, dt.now())
        m_ = rng_.random(size=2**i, dtype=np.float64)
        np.save(path_, m_)
        print('saved:', dt.now())
        print('file size (GB):', os.path.getsize(path_) / 1024**3)
        print('header:', read_npy_header(path_))
        time.sleep(1)

        # Drop the in-memory copy before loading the file back.
        print('before del', dt.now())
        del m_
        gc.collect()
        print('after del', dt.now())
        time.sleep(1)

        print('start loading', dt.now())
        m_ = np.load(path_)
        print('loaded:', dt.now())
        print('size (GB)', m_.nbytes / 1024**3)
        time.sleep(1)

        # Release the array and the file before the next, larger size.
        print('remove all', dt.now())
        del m_
        gc.collect()
        subprocess.run(['rm', '-rf', path_])
        print('removed:', dt.now(), end='\n\n\n')
        time.sleep(1)
    print('END', dt.now())
if __name__ == "__main__":
    # Run the benchmark under memray; the capture file name is the first
    # argument to the Tracker.
    # NOTE(review): memory_interval_ms is passed as 0.1 - confirm that the
    # installed memray version accepts a non-integer interval.
    with memray.Tracker(
        "memray_test_cn.bin",
        memory_interval_ms=0.1,
        follow_fork=True,
        native_traces=True
    ):
        # 'beegfs or cvfs location' is a placeholder for the mount point of
        # the file system under test.
        path_ = os.path.join('beegfs or cvfs location', 'test_data/cn/max_compression_chunk.npy')
        try:
            test(path_)
        finally:
            # Always remove the (potentially very large) test file, even
            # when test() raises part-way through.
            subprocess.run(['rm', '-rf', path_])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gc | |
import os | |
import subprocess | |
import time | |
import memray | |
import numpy as np | |
import numpy.lib.format | |
from datetime import datetime as dt | |
def read_npy_header(file_path):
    """Read only the header of a ``.npy`` file, without loading the data.

    Parameters
    ----------
    file_path : str or path-like
        Path of the ``.npy`` file; it is opened in binary read mode.

    Returns
    -------
    tuple
        ``(version, shape, fortran_order, dtype)``. ``version`` is the value
        returned by ``np.lib.format.read_magic``; the other three items are
        what the header reader for that version returns.

    Raises
    ------
    ValueError
        If the format version is neither ``(1, 0)`` nor ``(2, 0)``.
    """
    # One header reader per supported format version.
    header_readers = {
        (1, 0): np.lib.format.read_array_header_1_0,
        (2, 0): np.lib.format.read_array_header_2_0,
    }
    with open(file_path, 'rb') as f:
        version = np.lib.format.read_magic(f)
        reader = header_readers.get(version)
        if reader is None:
            raise ValueError(f"Unsupported .npy file version: {version}")
        shape, fortran_order, dtype = reader(f)
    return version, shape, fortran_order, dtype
def test(path_, sizes=None):
    """Save and reload random float64 arrays of nearby sizes at ``path_``.

    For every element count ``i`` a 1-D array of ``i`` random float64 values
    is generated, written with ``np.save``, released, read back with
    ``np.load`` and finally released again together with the file. A
    timestamp is printed around every step.

    Parameters
    ----------
    path_ : str
        Target ``.npy`` file. Whatever is at this path is removed with
        ``rm -rf`` before the run and after every iteration.
    sizes : iterable of int, optional
        Element counts to run. When omitted, the original hard-coded values
        are used, so existing ``test(path_)`` calls behave as before.
    """
    if sizes is None:
        # 268435450 .. 268435460 inclusive: eleven consecutive element
        # counts around 2**28 (= 268435456).
        sizes = range(268435450, 268435461)

    subprocess.run(['rm', '-rf', path_])
    rng_ = np.random.default_rng(413284713)  # fixed seed
    print('START', dt.now())
    # The one-second pauses presumably keep the phases apart in the
    # memory timeline - TODO confirm.
    time.sleep(1)
    for i in sizes:
        print(i, dt.now())
        m_ = rng_.random(size=i, dtype=np.float64)
        np.save(path_, m_)
        print('saved:', dt.now())
        print('file size (GB):', os.path.getsize(path_) / 1024**3)
        print('header:', read_npy_header(path_))
        time.sleep(1)

        # Drop the in-memory copy before loading the file back.
        print('before del', dt.now())
        del m_
        gc.collect()
        print('after del', dt.now())
        time.sleep(1)

        print('start loading', dt.now())
        m_ = np.load(path_)
        print('loaded:', dt.now())
        print('size (GB)', m_.nbytes / 1024**3)
        time.sleep(1)

        # Release the array and the file before the next size.
        print('remove all', dt.now())
        del m_
        gc.collect()
        subprocess.run(['rm', '-rf', path_])
        print('removed:', dt.now(), end='\n\n\n')
        time.sleep(1)
    print('END', dt.now())
if __name__ == "__main__":
    # Run the benchmark under memray; the capture file name is the first
    # argument to the Tracker.
    # NOTE(review): memory_interval_ms is passed as 0.1 - confirm that the
    # installed memray version accepts a non-integer interval.
    with memray.Tracker(
        "memray_test_ln3.bin",
        memory_interval_ms=0.1,
        follow_fork=True,
        native_traces=True
    ):
        # 'beegfs or cvfs location' is a placeholder for the mount point of
        # the file system under test.
        path_ = os.path.join('beegfs or cvfs location', 'test_data/max_compression_chunk.npy')
        try:
            test(path_)
        finally:
            # Always remove the (potentially very large) test file, even
            # when test() raises part-way through.
            subprocess.run(['rm', '-rf', path_])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
BeeGFS