Last active
October 25, 2025 13:55
-
-
Save xflr6/49a7be702939b8467310c42fdc44643e to your computer and use it in GitHub Desktop.
Compare while-loop with break to for-loop with two-argument iter() for iterating over a large file in chunks
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """Compare four ways to iterate over a large binary file in chunks.""" | |
| import functools | |
| import hashlib | |
| import mmap | |
| import os | |
| import pathlib | |
| import shutil | |
| import time | |
| import types | |
# Names of the checksum implementations defined in this module.
__all__ = [
    'shasum256_file_digest',
    'sha256sum_while',
    'sha256sum_iter',
    'sha256sum_copyfileobj',
    'sha256sum_mmap',
]

# File that serves as input for the checksum functions.
PATH = pathlib.Path('spam.bin')

# Total size of the input file in bytes (1 GiB).
SIZE = 1_073_741_824

# Size in bytes of one block of zeros (1 MiB).
CHUNK = 1_048_576

# Hex digest that each implementation's result is compared against.
EXPECTED_RESULT = ('49bc20df15e412a64472421e13fe86ff'
                   '1c5165e18b2afccf160d4dc19fe68a14')
| def shasum256_file_digest(path: os.PathLike[str] | str, /) -> str: | |
| with path.open('rb') as f: | |
| s = hashlib.file_digest(f, hashlib.sha256) | |
| return s.hexdigest() | |
| def sha256sum_while(path: os.PathLike[str] | str, /, *, | |
| bufsize: int = 32_768) -> str: | |
| s = hashlib.sha256() | |
| with open(path, mode='rb') as f: | |
| while True: | |
| data = f.read(bufsize) | |
| if not data: | |
| break | |
| s.update(data) | |
| return s.hexdigest() | |
| def sha256sum_iter(path: os.PathLike[str] | str, /, *, | |
| bufsize: int = 32_768) -> str: | |
| s = hashlib.sha256() | |
| with open(path, mode='rb') as f: | |
| for data in iter(functools.partial(f.read, bufsize), b''): | |
| s.update(data) | |
| return s.hexdigest() | |
| def sha256sum_copyfileobj(path: os.PathLike[str] | str, /, *, | |
| bufsize: int = 32_768) -> str: | |
| s = hashlib.sha256() | |
| dest = types.SimpleNamespace(write=s.update) | |
| with open(path, mode='rb') as f: | |
| shutil.copyfileobj(f, dest, length=bufsize) | |
| return s.hexdigest() | |
| def sha256sum_mmap(path: os.PathLike[str] | str, /) -> str: | |
| # poor performance under Python 3? | |
| s = hashlib.sha256() | |
| with (open(path, 'rb')as f, | |
| mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as data): | |
| s.update(data) | |
| return s.hexdigest() | |
# Create the input file if it does not exist yet: SIZE // CHUNK writes
# of CHUNK zero bytes each.
if not PATH.exists():
    with open(PATH, mode='wb') as f:
        zeros = bytes(CHUNK)
        for _ in range(SIZE // CHUNK):
            f.write(zeros)

# Run every function named in __all__ on PATH, timing each call and
# comparing its result with EXPECTED_RESULT.
for func_name in __all__:
    print(func_name)
    # Look the function up in the module namespace directly; eval() is
    # not needed to resolve a plain name.
    func = globals()[func_name]
    print(func)
    start = time.perf_counter_ns()
    result = func(PATH)
    # perf_counter_ns() counts nanoseconds; report seconds.
    duration = (time.perf_counter_ns() - start) / 1_000_000_000
    print('result:', 'OK' if result == EXPECTED_RESULT else result)
    print('duration:', duration)
    print()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment