Created December 9, 2016 17:44
Direct IO in Python
#!/usr/bin/env python3
import os
import mmap
import logging
import hashlib
import contextlib

log = logging.getLogger(__name__)


@contextlib.contextmanager
def directio_mmap(filename, readsize, offset):
    """Context manager that returns a READ_ONLY O_DIRECT mmap for filename

    Takes a desired readsize, but reduces that to the size of the device to
    prevent errors.
    """
    if offset % mmap.PAGESIZE:
        raise ValueError("Offset must be a multiple of {}"
                         .format(mmap.PAGESIZE))
    # O_DIRECT means we bypass the file cache
    fd = os.open(filename, os.O_RDONLY | os.O_DIRECT)
    try:
        # ACCESS_READ is necessary since RDONLY is set
        with mmap.mmap(fd, readsize, offset=offset,
                       access=mmap.ACCESS_READ) as mm:
            yield mm
    finally:
        os.close(fd)


def hashfile(filename, size):
    """Hashes a file and returns a hash object"""
    hash = hashlib.sha256()
    offset = 0
    if size == 0:
        size = float('Inf')
    # Find the actual file size so we don't try to read too much
    with open(filename, 'rb') as f:
        filesize = f.seek(0, 2)  # seeking to the end returns the file size
    size = min(size, filesize)
    while offset < size:
        blocksize = min(size - offset, 2**20)
        with directio_mmap(filename, blocksize, offset) as mm:
            offset += len(mm)
            if not mm:
                break
            hash.update(mm)
    log.debug("%s: sha256=%s, %s bytes", filename, hash.hexdigest(), offset)
    return hash
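
A minimal usage sketch, assuming the functions above are in scope, a Linux filesystem that supports O_DIRECT, and a hypothetical file at /tmp/large.img:

import logging

logging.basicConfig(level=logging.DEBUG)

# size=0 means "hash the whole file"; hashfile() clamps to the real file size
digest = hashfile("/tmp/large.img", 0)  # /tmp/large.img is a placeholder path
print(digest.hexdigest())

# Hash only the first 16 MiB of the file
partial = hashfile("/tmp/large.img", 16 * 2**20)
print(partial.hexdigest())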
@Spindel I'm a bit confused: https://stackoverflow.com/a/19934213
I think this only works on Windows, doesn't it?
Hello, @Spindel!
What do you think about this other solution:
Direct IO in Python
This shows how to do efficient hashing of (large) files in Python, bypassing the file cache by reading in O_DIRECT mode.
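
One way to sanity-check the O_DIRECT path is to compare its digest against an ordinary buffered read of the same file. A minimal sketch, assuming hashfile() from the gist above is in scope and /tmp/large.img is a placeholder path:

import hashlib

def buffered_sha256(filename, blocksize=2**20):
    """Reference digest computed through the normal page cache."""
    h = hashlib.sha256()
    with open(filename, 'rb') as f:
        for chunk in iter(lambda: f.read(blocksize), b''):
            h.update(chunk)
    return h

path = "/tmp/large.img"
assert hashfile(path, 0).hexdigest() == buffered_sha256(path).hexdigest()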