iyvinjose · March 27, 2025 14:12 · akushyn · Dec 29, 2019 · JorgeMadson · Mar 27, 2025
diff --git a/data_loading_utils.py b/data_loading_utils.py
 def read_lines_from_file_as_data_chunks(file_name, chunk_size, callback, return_whole_chunk=False):
    """
    Read a text file of any size and hand its lines to ``callback``.

    The file is read in blocks of ``chunk_size`` characters, so only one block
    plus any unfinished trailing line is held in memory at a time.

    :param file_name: path of the file to read
    :param chunk_size: number of characters to read at a time; anything that is
        not a positive integer falls back to 5000
    :param callback: callback method, prototype ----> def callback(data, eof, file_name)
        It is always called with keyword arguments. ``data`` is one line with its
        line terminator removed, or a list of such lines when
        ``return_whole_chunk`` is true. After all data calls (``eof=False``) one
        final call is made with ``data=None`` and ``eof=True``.
    :param return_whole_chunk: when true, pass all complete lines of a block to
        ``callback`` in a single call instead of one call per line
    :return: None
    """

    # Every character str.splitlines() treats as a line boundary. The
    # "is the last line complete?" test must use the same set as the split,
    # otherwise a finished line is wrongly glued to the next block.
    line_boundaries = '\n\r\x0b\x0c\x1c\x1d\x1e\x85\u2028\u2029'

    # read(0) would end the loop immediately and read(None) would load the
    # whole file, so unusable sizes fall back to the historical default.
    if not isinstance(chunk_size, int) or chunk_size <= 0:
        chunk_size = 5000

    def read_in_chunks(file_obj, chunk_size=5000):
        """
        https://stackoverflow.com/a/519653/5130720
        Lazy function to read a file
        Default chunk size: 5000.
        """
        while True:
            data = file_obj.read(chunk_size)
            if not data:
                break
            yield data

    def deliver(lines):
        """Pass complete lines to the callback, as a list or one by one."""
        if return_whole_chunk:
            callback(data=lines, eof=False, file_name=file_name)
        else:
            for line in lines:
                callback(data=line, eof=False, file_name=file_name)

    data_left_over = ''

    # The context manager closes the file even if the callback raises.
    with open(file_name) as fp:
        for chunk in read_in_chunks(fp, chunk_size):
            # prepend the unfinished line carried over from the previous block
            current_chunk = data_left_over + chunk

            # split chunk by new line
            lines = current_chunk.splitlines()

            if current_chunk[-1] in line_boundaries:
                # the block ends exactly on a line boundary: nothing to carry
                data_left_over = ''
            else:
                # the last line is incomplete: hold it back for the next block
                data_left_over = lines.pop()

            deliver(lines)

    # the file did not end with a line terminator: flush the final line
    if data_left_over:
        deliver([data_left_over])

    callback(data=None, eof=True, file_name=file_name)
diff --git a/main.py b/main.py
 # Import the module by its name; the ".py" file extension is not part of it
 # ("import data_loading_utils.py" would look for a submodule called "py").
 import data_loading_utils

 file_name = 'file_name.ext'
 CHUNK_SIZE = 1000000 # configure this variable depending on your machine's hardware configuration

 # callback method
 def process_lines(data, eof, file_name):
    """
    Example callback for read_lines_from_file_as_data_chunks.

    :param data: one single line of the file, or None on the end-of-file call
    :param eof: True only on the final call, after all lines were delivered
    :param file_name: path of the file being read
    :return: None
    """

    # check if end of file reached
    if not eof:
        # process data, data is one single line of the file
        pass  # placeholder: a branch holding only a comment is a syntax error

    else:
        # end of file reached
        pass  # placeholder: put end-of-file handling here
        
 if __name__ == "__main__":
    # process_lines is a module-level function, so it is passed directly;
    # there is no `self` at module scope.
    data_loading_utils.read_lines_from_file_as_data_chunks(file_name, chunk_size=CHUNK_SIZE, callback=process_lines)

    # process_lines method is the callback method.
    # It will be called for all the lines, with parameter data representing one single line of the file at a time
	def read_lines_from_file_as_data_chunks(file_name, chunk_size, callback, return_whole_chunk=False):
	    """
	    Read a text file of any size and hand its lines to ``callback``.

	    The file is read in blocks of ``chunk_size`` characters, so only one block
	    plus any unfinished trailing line is held in memory at a time.

	    :param file_name: path of the file to read
	    :param chunk_size: number of characters to read at a time; anything that is
	        not a positive integer falls back to 5000
	    :param callback: callback method, prototype ----> def callback(data, eof, file_name)
	        It is always called with keyword arguments. ``data`` is one line with its
	        line terminator removed, or a list of such lines when
	        ``return_whole_chunk`` is true. After all data calls (``eof=False``) one
	        final call is made with ``data=None`` and ``eof=True``.
	    :param return_whole_chunk: when true, pass all complete lines of a block to
	        ``callback`` in a single call instead of one call per line
	    :return: None
	    """

	    # Every character str.splitlines() treats as a line boundary. The
	    # "is the last line complete?" test must use the same set as the split,
	    # otherwise a finished line is wrongly glued to the next block.
	    line_boundaries = '\n\r\x0b\x0c\x1c\x1d\x1e\x85\u2028\u2029'

	    # read(0) would end the loop immediately and read(None) would load the
	    # whole file, so unusable sizes fall back to the historical default.
	    if not isinstance(chunk_size, int) or chunk_size <= 0:
	        chunk_size = 5000

	    def read_in_chunks(file_obj, chunk_size=5000):
	        """
	        https://stackoverflow.com/a/519653/5130720
	        Lazy function to read a file
	        Default chunk size: 5000.
	        """
	        while True:
	            data = file_obj.read(chunk_size)
	            if not data:
	                break
	            yield data

	    def deliver(lines):
	        """Pass complete lines to the callback, as a list or one by one."""
	        if return_whole_chunk:
	            callback(data=lines, eof=False, file_name=file_name)
	        else:
	            for line in lines:
	                callback(data=line, eof=False, file_name=file_name)

	    data_left_over = ''

	    # The context manager closes the file even if the callback raises.
	    with open(file_name) as fp:
	        for chunk in read_in_chunks(fp, chunk_size):
	            # prepend the unfinished line carried over from the previous block
	            current_chunk = data_left_over + chunk

	            # split chunk by new line
	            lines = current_chunk.splitlines()

	            if current_chunk[-1] in line_boundaries:
	                # the block ends exactly on a line boundary: nothing to carry
	                data_left_over = ''
	            else:
	                # the last line is incomplete: hold it back for the next block
	                data_left_over = lines.pop()

	            deliver(lines)

	    # the file did not end with a line terminator: flush the final line
	    if data_left_over:
	        deliver([data_left_over])

	    callback(data=None, eof=True, file_name=file_name)
	# Import the module by its name; the ".py" file extension is not part of it
	# ("import data_loading_utils.py" would look for a submodule called "py").
	import data_loading_utils

	file_name = 'file_name.ext'
	CHUNK_SIZE = 1000000 # configure this variable depending on your machine's hardware configuration

	# callback method
	def process_lines(data, eof, file_name):
	    """
	    Example callback for read_lines_from_file_as_data_chunks.

	    :param data: one single line of the file, or None on the end-of-file call
	    :param eof: True only on the final call, after all lines were delivered
	    :param file_name: path of the file being read
	    :return: None
	    """

	    # check if end of file reached
	    if not eof:
	        # process data, data is one single line of the file
	        pass  # placeholder: a branch holding only a comment is a syntax error

	    else:
	        # end of file reached
	        pass  # placeholder: put end-of-file handling here


	if __name__ == "__main__":
	    # process_lines is a module-level function, so it is passed directly;
	    # there is no `self` at module scope.
	    data_loading_utils.read_lines_from_file_as_data_chunks(file_name, chunk_size=CHUNK_SIZE, callback=process_lines)

	    # process_lines method is the callback method.
	    # It will be called for all the lines, with parameter data representing one single line of the file at a time