The idea here is to asynchronously process chunks of data by pushing them into a multiprocessing pool. Each process in the pool works on one chunk and returns its result. Note that it is important to create the Pool inside the __main__ block: on platforms that spawn worker processes (such as Windows), each worker re-imports the script, and a module-level Pool would be created over and over in every worker.
import pandas as pd
import multiprocessing as mp

LARGE_FILE = "D:\\my_large_file.txt"
CHUNKSIZE = 100000  # process 100,000 rows at a time

def process_frame(df):
    # process the data frame; here we just count its rows
    return len(df)

if __name__ == '__main__':
    reader = pd.read_table(LARGE_FILE, chunksize=CHUNKSIZE)
    pool = mp.Pool(4)  # use 4 worker processes

    funclist = []
    for df in reader:
        # submit each chunk to the pool asynchronously
        f = pool.apply_async(process_frame, [df])
        funclist.append(f)

    result = 0
    for f in funclist:
        result += f.get(timeout=10)  # wait up to 10 seconds per chunk

    pool.close()
    pool.join()

    print("There are %d rows of data" % result)
For comparison, here is the serial version, which processes the same file one chunk at a time in a single process:

import pandas as pd

LARGE_FILE = "D:\\my_large_file.txt"
CHUNKSIZE = 100000  # process 100,000 rows at a time

def process_frame(df):
    # process the data frame; here we just count its rows
    return len(df)

if __name__ == '__main__':
    reader = pd.read_table(LARGE_FILE, chunksize=CHUNKSIZE)

    result = 0
    for df in reader:
        # process each chunk in the main process
        result += process_frame(df)

    print("There are %d rows of data" % result)