jbbarth · June 17, 2015 15:07
diff --git a/test.txt b/test.txt


 1
 12
 123
 1234
 12345
 123456
 1234567
 12345678
 123456789
 1234567890
 12345678901
 123456789012
 1234567890123
 12345678901234
 123456789012345
 1234567890123456
 12345678901234567
 123456789012345678
 1234567890123456789
 12345678901234567890
 123456789012345678901
 1234567890123456789012
 12345678901234567890123
 123456789012345678901234
 1234567890123456789012345
 12345678901234567890123456
 123456789012345678901234567
 1234567890123456789012345678
 12345678901234567890123456789
 123456789012345678901234567890
 1234567890123456789012345678901
 12345678901234567890123456789012
 123456789012345678901234567890123
 1234567890123456789012345678901234
 12345678901234567890123456789012345
 123456789012345678901234567890123456
 1234567890123456789012345678901234567
 12345678901234567890123456789012345678
diff --git a/truncator.py b/truncator.py
 #!/usr/bin/env python

 def truncated_lines(file_handle, chunk_size=8092):
     """
     Yield the lines of a file, each truncated to at most ``chunk_size``
     characters.

     The file is consumed through bounded ``read()`` calls, so no more than
     ``chunk_size`` characters of pending text are held at any time; this
     makes the generator usable on big files and on files with huge lines.

     file_handle -- object exposing ``read(n)`` that returns text and an
                    empty string at end of file.
     chunk_size  -- maximum length of a yielded line; also bounds the
                    size of each read.

     Yields each line without its trailing "\n". Characters of a line
     beyond ``chunk_size`` are dropped. A final line that is not terminated
     by a newline is yielded as well.
     """
     # Beginning of a line whose end has not been read yet.
     rest = ""
     # True while we are inside a too-long line whose first ``chunk_size``
     # characters were already yielded: everything up to the next newline
     # must be thrown away. ``rest`` is always empty in that state.
     discard_next = False
     while True:
         # Read only "<chunk size> - <rest length>" characters so that the
         # buffer examined below never exceeds chunk_size characters.
         # The algorithm guarantees len(rest) < chunk_size at this point.
         data = file_handle.read(chunk_size - len(rest))
         if not data:
             break
         lines = (rest + data).split("\n")
         if discard_next:
             if len(lines) == 1:
                 # Still no newline: the whole chunk belongs to the
                 # over-long line, so keep discarding.
                 rest = ""
                 continue
             # The first item is the tail of the over-long line.
             lines.pop(0)
             discard_next = False
         # Every item but the last one is a complete line.
         for line in lines[:-1]:
             yield line
         rest = lines[-1]
         if len(rest) >= chunk_size:
             # The pending line already fills a whole chunk: emit the
             # truncated version now and skip its remainder.
             yield rest
             rest = ""
             discard_next = True
     # The file may end without a trailing newline; do not lose that line.
     if rest:
         yield rest

 if __name__ == "__main__":
     # Small self-check: read "test.txt" from the current directory with a
     # maximum line length of LENGTH, print every truncated line and verify
     # basic invariants on the way.
     LENGTH = 15
     # only an example... can be compared side by side with "cat test.txt"
     # (beware of visual comparisons on other files, some may contain tabs)
     counter = 0
     with open("test.txt") as f:
         for line in truncated_lines(f, LENGTH):
             # every fixture line is either empty or starts with "1"
             assert line.startswith("1") or line == ""
             assert len(line) <= LENGTH
             counter += 1
             # function-call form works on both Python 2 and Python 3
             print(line)
     # the fixture is expected to contain exactly 40 lines
     assert counter == 40


	1
	12
	123
	1234
	12345
	123456
	1234567
	12345678
	123456789
	1234567890
	12345678901
	123456789012
	1234567890123
	12345678901234
	123456789012345
	1234567890123456
	12345678901234567
	123456789012345678
	1234567890123456789
	12345678901234567890
	123456789012345678901
	1234567890123456789012
	12345678901234567890123
	123456789012345678901234
	1234567890123456789012345
	12345678901234567890123456
	123456789012345678901234567
	1234567890123456789012345678
	12345678901234567890123456789
	123456789012345678901234567890
	1234567890123456789012345678901
	12345678901234567890123456789012
	123456789012345678901234567890123
	1234567890123456789012345678901234
	12345678901234567890123456789012345
	123456789012345678901234567890123456
	1234567890123456789012345678901234567
	12345678901234567890123456789012345678
	#!/usr/bin/env python

	def truncated_lines(file_handle, chunk_size=8092):
	    """
	    Yield the lines of a file, each truncated to at most ``chunk_size``
	    characters.

	    The file is consumed through bounded ``read()`` calls, so no more than
	    ``chunk_size`` characters of pending text are held at any time; this
	    makes the generator usable on big files and on files with huge lines.

	    file_handle -- object exposing ``read(n)`` that returns text and an
	                   empty string at end of file.
	    chunk_size  -- maximum length of a yielded line; also bounds the
	                   size of each read.

	    Yields each line without its trailing "\n". Characters of a line
	    beyond ``chunk_size`` are dropped. A final line that is not terminated
	    by a newline is yielded as well.
	    """
	    # Beginning of a line whose end has not been read yet.
	    rest = ""
	    # True while we are inside a too-long line whose first ``chunk_size``
	    # characters were already yielded: everything up to the next newline
	    # must be thrown away. ``rest`` is always empty in that state.
	    discard_next = False
	    while True:
	        # Read only "<chunk size> - <rest length>" characters so that the
	        # buffer examined below never exceeds chunk_size characters.
	        # The algorithm guarantees len(rest) < chunk_size at this point.
	        data = file_handle.read(chunk_size - len(rest))
	        if not data:
	            break
	        lines = (rest + data).split("\n")
	        if discard_next:
	            if len(lines) == 1:
	                # Still no newline: the whole chunk belongs to the
	                # over-long line, so keep discarding.
	                rest = ""
	                continue
	            # The first item is the tail of the over-long line.
	            lines.pop(0)
	            discard_next = False
	        # Every item but the last one is a complete line.
	        for line in lines[:-1]:
	            yield line
	        rest = lines[-1]
	        if len(rest) >= chunk_size:
	            # The pending line already fills a whole chunk: emit the
	            # truncated version now and skip its remainder.
	            yield rest
	            rest = ""
	            discard_next = True
	    # The file may end without a trailing newline; do not lose that line.
	    if rest:
	        yield rest

	if __name__ == "__main__":
	    # Small self-check: read "test.txt" from the current directory with a
	    # maximum line length of LENGTH, print every truncated line and verify
	    # basic invariants on the way.
	    LENGTH = 15
	    # only an example... can be compared side by side with "cat test.txt"
	    # (beware of visual comparisons on other files, some may contain tabs)
	    counter = 0
	    with open("test.txt") as f:
	        for line in truncated_lines(f, LENGTH):
	            # every fixture line is either empty or starts with "1"
	            assert line.startswith("1") or line == ""
	            assert len(line) <= LENGTH
	            counter += 1
	            # function-call form works on both Python 2 and Python 3
	            print(line)
	    # the fixture is expected to contain exactly 40 lines
	    assert counter == 40