Skip to content

Instantly share code, notes, and snippets.

@jbbarth
Created June 17, 2015 15:07
Show Gist options
  • Save jbbarth/584e791d90fd7eb78b28 to your computer and use it in GitHub Desktop.
Save jbbarth/584e791d90fd7eb78b28 to your computer and use it in GitHub Desktop.
Big files truncate lines
1
12
123
1234
12345
123456
1234567
12345678
123456789
1234567890
12345678901
123456789012
1234567890123
12345678901234
123456789012345
1234567890123456
12345678901234567
123456789012345678
1234567890123456789
12345678901234567890
123456789012345678901
1234567890123456789012
12345678901234567890123
123456789012345678901234
1234567890123456789012345
12345678901234567890123456
123456789012345678901234567
1234567890123456789012345678
12345678901234567890123456789
123456789012345678901234567890
1234567890123456789012345678901
12345678901234567890123456789012
123456789012345678901234567890123
1234567890123456789012345678901234
12345678901234567890123456789012345
123456789012345678901234567890123456
1234567890123456789012345678901234567
12345678901234567890123456789012345678
#!/usr/bin/env python
def truncated_lines(file_handle, chunk_size=8092):
    """
    Yield the lines of a file, each truncated to at most ``chunk_size``
    characters.

    The file is consumed through ``file_handle.read()`` calls that never
    request more than ``chunk_size`` characters, so big files (and very
    long lines) can be processed without loading them in memory; lines
    are yielded as soon as they are complete.

    Parameters:
        file_handle: object exposing ``read(size)`` and returning text
            (an empty string signals the end of the input).
        chunk_size: maximum length of a yielded line, and maximum size of
            a single read. Should be a positive integer; it is not
            validated here.

    Yields:
        Each line without its trailing "\\n". Lines longer than
        ``chunk_size`` are cut to their first ``chunk_size`` characters.
        A last line that is not terminated by a newline is yielded too.
    """
    # here we will store the rest of the string if the previous
    # chunk left us in the middle of a valid line
    rest = ""
    # boolean marking if we're in the middle of a too long line or
    # not ; if so, everything up to the next newline must be discarded
    discard_next = False
    while True:
        # here we read "<chunk size> - <rest length>", so that the
        # buffer we reason about never exceeds chunk_size characters
        # NB: the algorithm ensures len(rest) < chunk_size, hence at
        # least one character is always requested
        data = file_handle.read(chunk_size - len(rest))
        if not data:
            break
        lines = (rest + data).split("\n")
        if discard_next:
            # the first item is the tail of a line that was already
            # yielded truncated: drop it
            lines.pop(0)
            if not lines:
                # no newline in this chunk: still inside the too long
                # line, keep discarding (rest is already empty here)
                continue
            discard_next = False
        elif len(lines) == 1 and len(lines[0]) >= chunk_size:
            # a full buffer without any newline: the line is at least
            # chunk_size long, yield its head and discard what follows
            # up to the next newline. A buffer that is merely short
            # (read returned less than requested) is NOT a cut: it is
            # kept as "rest" below and completed by the next read.
            yield lines[0]
            rest = ""
            discard_next = True
            continue
        # yield all but last, which can continue on next chunk
        for line in lines[:-1]:
            yield line
        rest = lines[-1]
    # end of input: a non-empty rest is a final line that had no
    # trailing newline; it must not be lost
    if rest:
        yield rest
if __name__ == "__main__":
    # Demo / self-check: stream "test.txt" from the current directory and
    # print every line truncated to LENGTH characters.
    LENGTH = 15
    # only an example... can be compared side by side with "cat test.txt"
    # (beware of visual comparisons on other files, some may contain tabs)
    counter = 0
    with open("test.txt") as f:
        for line in truncated_lines(f, LENGTH):
            # the checks below expect each line to be empty or to start
            # with "1", and never to exceed the truncation length
            assert line.startswith("1") or line == ""
            assert len(line) <= LENGTH
            counter += 1
            # single-argument parenthesised form: prints the same on
            # Python 2 and Python 3 (a bare "print line" is Python 2 only)
            print(line)
    # the script expects test.txt to produce exactly 40 lines
    assert counter == 40
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment