Last active
August 29, 2015 14:26
-
-
Save andreasvc/194869bffb690cf26714 to your computer and use it in GitHub Desktop.
Benchmark of indexing of line offsets in text file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Benchmark of indexing of line offsets in text file. | |

Usage example:

>>> index = indexfile_iter('1027.txt')
>>> index[5]
115
>>> import bisect
>>> bisect.bisect(index, 115) - 1
5

Conclusion: mmap doesn't matter, indexfile_iter is fastest.

In [1]: import lineidx
In [2]: %timeit lineidx.indexfile_iter('1027.txt')
100 loops, best of 3: 11.7 ms per loop
In [3]: %timeit lineidx.indexfile_re('1027.txt')
10 loops, best of 3: 75 ms per loop
In [4]: %timeit lineidx.indexfile_re_nommap('1027.txt')
10 loops, best of 3: 76.1 ms per loop
In [5]: %timeit lineidx.indexfile_re2('1027.txt')
10 loops, best of 3: 27.8 ms per loop
""" | |
import re | |
import re2 | |
import mmap | |
import array | |
# Bytes pattern for the tail of a non-empty line: one character that is not a
# space, tab, LF or CR, then optional trailing spaces/tabs, then a run of one
# or more CR/LF characters. The end of each match is the offset just past
# that run of line terminators.
NONEMPTPTYLINE = re.compile(br'[^ \t\n\r][ \t]*[\r\n]+')
# Same expression as NONEMPTPTYLINE, compiled with the re2 module instead of
# the standard-library re module (note: given as a text pattern here, not a
# bytes pattern).
NONEMPTPTYLINE2 = re2.compile(r'[^ \t\n\r][ \t]*[\r\n]+')
def indexfile_iter(filename):
    """Return the byte offsets at which the non-blank lines of a file start.

    The file is read in binary mode, one line at a time. For every line that
    does not consist solely of whitespace, the byte offset of its first byte
    is recorded. After the last line, the total number of bytes read is
    appended as a closing entry.

    :param filename: path of the file to index.
    :returns: an ``array.array`` with type code ``'I'`` holding the offsets
        in increasing order; for an empty file this is ``[0]``.

    NOTE: type code ``'I'`` is a C unsigned int, so appending an offset that
    does not fit in it raises ``OverflowError``.
    """
    result = array.array('I')
    offset = 0
    with open(filename, 'rb') as tmp:
        for line in tmp:
            if not line.isspace():
                result.append(offset)
            # Advance past this line whether or not it was recorded.
            offset += len(line)
    # Closing entry: the offset just past the last byte that was read.
    result.append(offset)
    return result
def indexfile_re(filename):
    """Index a file by scanning a read-only memory map with NONEMPTPTYLINE.

    The result starts with offset 0, followed by the end offset of every
    match of ``NONEMPTPTYLINE`` in the file contents, i.e. the position just
    past the line terminator(s) that follow each non-empty line.

    :param filename: path of the file to index.
    :returns: an ``array.array`` with type code ``'I'`` holding the offsets;
        ``[0]`` for an empty file.
    """
    result = array.array('I', [0])
    # Read-only access is sufficient because the mapping uses ACCESS_READ;
    # opening with 'r+b' would needlessly fail on files that are not writable.
    with open(filename, 'rb') as tmp:
        # An empty file cannot be memory-mapped (mmap raises ValueError), so
        # check the size first and return the bare [0] index in that case.
        tmp.seek(0, 2)  # 2 == SEEK_END
        if tmp.tell() == 0:
            return result
        data = mmap.mmap(tmp.fileno(), 0, access=mmap.ACCESS_READ)
        try:
            result.extend(
                match.end() for match in NONEMPTPTYLINE.finditer(data))
        finally:
            # Release the mapping even if the scan raises.
            data.close()
    return result
def indexfile_re_nommap(filename):
    """Index a file by reading it fully and scanning with NONEMPTPTYLINE.

    The whole file is read into memory as bytes. The result starts with
    offset 0, followed by the end offset of every match of
    ``NONEMPTPTYLINE`` in the data.

    :param filename: path of the file to index.
    :returns: an ``array.array`` with type code ``'I'`` holding the offsets.
    """
    with open(filename, 'rb') as tmp:
        data = tmp.read()
    result = array.array('I', [0])
    result.extend(
        match.end() for match in NONEMPTPTYLINE.finditer(data))
    return result
def indexfile_re2(filename):
    """Index a file by reading it fully and scanning with NONEMPTPTYLINE2.

    The whole file is read into memory as bytes. The result starts with
    offset 0, followed by the end offset of every match of
    ``NONEMPTPTYLINE2`` (the re2-compiled pattern) in the data.

    :param filename: path of the file to index.
    :returns: an ``array.array`` with type code ``'I'`` holding the offsets.
    """
    with open(filename, 'rb') as tmp:
        data = tmp.read()
    result = array.array('I', [0])
    result.extend(
        match.end() for match in NONEMPTPTYLINE2.finditer(data))
    return result
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment