Last active
May 29, 2024 11:19
-
-
Save iyvinjose/e6c1cb2821abd5f01fd1b9065cbc759d to your computer and use it in GitHub Desktop.
Read large files line by line without loading the entire file into memory. Supports multi-gigabyte files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def read_lines_from_file_as_data_chunks(file_name, chunk_size, callback, return_whole_chunk=False):
    """
    Read a text file in fixed-size chunks and hand its lines to ``callback``.

    The file is never read in one go: each read asks for ``chunk_size``
    characters, and only a trailing partial line is carried over to the next
    chunk. Lines are delimited the way ``str.splitlines`` delimits them and are
    delivered without their line terminator.

    :param file_name: path of the file to read (opened in text mode)
    :param chunk_size: number of characters to request per read
    :param callback: callback method, prototype ----> def callback(data, eof, file_name).
        It is always called with keyword arguments. While reading, ``eof`` is
        False and ``data`` is a line (or a list of lines, see below). After the
        last line it is called once more with ``data=None`` and ``eof=True``.
    :param return_whole_chunk: when True, ``callback`` receives the list of all
        complete lines found in each chunk (the list may be empty) instead of
        one call per line
    :return: None
    """

    def emit(lines):
        # Deliver a batch of complete lines in the shape the caller asked for.
        if return_whole_chunk:
            callback(data=lines, eof=False, file_name=file_name)
        else:
            for line in lines:
                callback(data=line, eof=False, file_name=file_name)

    # Unterminated tail of the previous chunk; '' means nothing is pending.
    data_left_over = ''

    # The context manager guarantees the handle is closed, even if callback raises.
    with open(file_name) as fp:
        # read() returns '' at end of file, which stops the iteration.
        for chunk in iter(lambda: fp.read(chunk_size), ''):
            current_chunk = data_left_over + chunk
            lines = current_chunk.splitlines()

            # The chunk ends on a complete line only if its last character is a
            # line boundary. Ask splitlines() itself, so that this check and the
            # split above always agree on what a boundary is: a lone boundary
            # character splits to [''], any other character splits to itself.
            if current_chunk[-1].splitlines()[0]:
                data_left_over = lines.pop()
            else:
                data_left_over = ''

            emit(lines)

    # A final line without a trailing terminator is still pending here.
    if data_left_over:
        emit([data_left_over])

    callback(data=None, eof=True, file_name=file_name)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Usage example for read_lines_from_file_as_data_chunks.
# Import the module by name; a ".py" suffix would be read as a submodule called "py".
import data_loading_utils

file_name = 'file_name.ext'
CHUNK_SIZE = 1000000  # configure this variable depending on your machine's hardware configuration


# callback method
def process_lines(data, eof, file_name):
    """
    Handle one callback from the reader.

    :param data: one single line of the file, or None once the end of the file is reached
    :param eof: True only on the final call, after every line has been delivered
    :param file_name: the file name that was passed to the reader
    """
    # check if end of file reached
    if not eof:
        # process data, data is one single line of the file
        pass
    else:
        # end of file reached
        pass


if __name__ == "__main__":
    # process_lines method is the callback method.
    # It will be called for all the lines, with parameter data representing one single line of the file at a time
    # It is a module-level function, so it is passed directly (there is no `self` here).
    data_loading_utils.read_lines_from_file_as_data_chunks(file_name, chunk_size=CHUNK_SIZE, callback=process_lines)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Great and very fast implementation!
Thank you!