Created
June 17, 2015 15:07
-
-
Save jbbarth/584e791d90fd7eb78b28 to your computer and use it in GitHub Desktop.
Big files truncate lines
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1 | |
12 | |
123 | |
1234 | |
12345 | |
123456 | |
1234567 | |
12345678 | |
123456789 | |
1234567890 | |
12345678901 | |
123456789012 | |
1234567890123 | |
12345678901234 | |
123456789012345 | |
1234567890123456 | |
12345678901234567 | |
123456789012345678 | |
1234567890123456789 | |
12345678901234567890 | |
123456789012345678901 | |
1234567890123456789012 | |
12345678901234567890123 | |
123456789012345678901234 | |
1234567890123456789012345 | |
12345678901234567890123456 | |
123456789012345678901234567 | |
1234567890123456789012345678 | |
12345678901234567890123456789 | |
123456789012345678901234567890 | |
1234567890123456789012345678901 | |
12345678901234567890123456789012 | |
123456789012345678901234567890123 | |
1234567890123456789012345678901234 | |
12345678901234567890123456789012345 | |
123456789012345678901234567890123456 | |
1234567890123456789012345678901234567 | |
12345678901234567890123456789012345678 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
def truncated_lines(file_handle, chunk_size=8092):
    """
    Yield the lines of a file, each truncated to at most ``chunk_size``
    characters, without ever holding more than one chunk in memory.

    The file is consumed through ``file_handle.read(n)`` calls only, so
    arbitrarily long lines never have to be loaded in full: the first
    ``chunk_size`` characters of such a line are yielded and everything
    up to the next newline is skipped.

    Parameters:
        file_handle: object exposing ``read(size)`` and returning text
            (an empty string signals end of file).
        chunk_size: maximum length of a yielded line, and upper bound on
            the amount of text examined at once.

    Yields:
        Each line without its trailing ``"\\n"``. A final line that is
        not newline-terminated is yielded as well.
    """
    # Beginning of a line whose end has not been seen yet; it is always
    # shorter than chunk_size, so "rest + data" never exceeds one chunk.
    rest = ""
    # True while we are skipping the overflow of an over-long line whose
    # truncated head has already been yielded.
    discard_next = False

    while True:
        # Read only what is needed to complete one chunk together with
        # the pending rest.
        data = file_handle.read(chunk_size - len(rest))
        if not data:
            break

        chunk = rest + data
        lines = chunk.split("\n")

        if len(lines) == 1:
            # No newline anywhere in this chunk.
            if discard_next:
                # Still inside the overflow of a long line: drop it all.
                rest = ""
            elif len(chunk) < chunk_size:
                # Short read: the line is merely incomplete, not too
                # long. Keep it and wait for more data (or for EOF).
                rest = chunk
            else:
                # A full chunk without newline: emit the truncated head
                # and skip everything up to the next newline.
                yield chunk
                rest = ""
                discard_next = True
            continue

        if discard_next:
            # The first piece is the tail of an already-truncated line.
            lines.pop(0)
            discard_next = False

        # The last piece may continue in the next chunk.
        rest = lines.pop()
        for line in lines:
            yield line

    # A trailing line without a final newline is still a line.
    if rest:
        yield rest
if __name__ == "__main__":
    # Maximum line length used for this example run.
    LENGTH = 15
    # only an example... can be compared side by side with "cat test.txt"
    # (beware of visual comparisons on other files, some may contain tabs)
    counter = 0
    with open("test.txt") as f:
        for line in truncated_lines(f, LENGTH):
            # Sanity checks tied to the sample file: every line there
            # starts with "1" (or is empty) and must come back truncated.
            assert line.startswith("1") or line == ""
            assert len(line) <= LENGTH
            counter += 1
            # Function-call form prints identically on Python 2 (single
            # argument) and is also valid on Python 3.
            print(line)
    # Number of lines expected from the sample file.
    assert counter == 40
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment