Created
March 4, 2021 01:25
-
-
Save neerajvashistha/2f2966ca08c717340ab142eccc9685d0 to your computer and use it in GitHub Desktop.
Simple multiprocessing over a list of files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob
import logging
from multiprocessing import Pool, freeze_support
def multicore_function(function_name, alist, wfname, n_chunks=1000, processes=4):
    '''
    Split alist into chunks and run function_name on every chunk in a process pool.

    Arguments:
    1. function_name: callable invoked once per chunk with a single argument of
       the form ['<wfname>_<i>.txt', [item, item, ...]]. It is sent to worker
       processes, so it has to be picklable (e.g. a module-level function).
    2. alist: the items to distribute (e.g. input file names).
    3. wfname: prefix used to build the per-chunk output file name.
    4. n_chunks: maximum number of chunks (default 1000). Chunk i receives
       items i, i + n_chunks, i + 2 * n_chunks, ...; empty chunks are dropped.
    5. processes: number of worker processes in the pool (default 4).

    Returns the list of values returned by function_name, one per non-empty
    chunk, in chunk order. Returns [] when alist is empty.

    Raises ValueError if n_chunks is smaller than 1.
    '''
    # Kept for backward compatibility with callers that do not call it
    # themselves; it only has an effect in frozen Windows executables.
    freeze_support()

    if n_chunks < 1:
        raise ValueError("n_chunks must be a positive integer")

    items = list(alist)

    # Chunks with an index >= len(items) would be empty, so stop there instead
    # of slicing n_chunks times regardless of the input size.
    parts = []
    for i in range(min(n_chunks, len(items))):
        parts.append([wfname + '_' + str(i) + '.txt', items[i::n_chunks]])

    # Nothing to do: avoid spawning worker processes for an empty input.
    if not parts:
        return []

    # Map the worker function over the chunks using the pool's processes.
    with Pool(processes=processes) as pool:
        return pool.map(function_name, parts)
def process_pdf_list(alisted_file):
    '''
    Convert a batch of PDF files to text lines and write them to one output file.

    alisted_file = ['out_filename_0.txt', ['in_file1', ... ,'in_file100']]

    Every input file is read with converter.pdf_reader and turned into a line
    with line_creator_new (both are defined outside this function). One line is
    written to the output file per successfully processed input. An input that
    fails is reported on stdout and skipped, so one bad file does not abort
    the rest of the batch.
    '''
    out_file = alisted_file[0]
    listed = alisted_file[1]
    written = 0
    # Opening in 'w' mode creates/truncates the output file even for an empty batch.
    with open(out_file, 'w', encoding="utf-8") as fw:
        for file_name in listed:
            try:
                text = converter.pdf_reader(file_name)
                line = line_creator_new(text)
                # The encode/decode round trip makes text that cannot be
                # encoded as UTF-8 fail here, inside the try, instead of
                # later when the file buffer is flushed.
                fw.write(bytes(line, 'utf-8').decode('utf-8') + '\n')
            except Exception as exc:
                # 'except Exception' keeps the best-effort behaviour but no
                # longer swallows KeyboardInterrupt / SystemExit.
                print("error in reading {}: {!r}".format(file_name, exc))
            else:
                written += 1
    logging.info("%d files processed and written to %s", written, out_file)
def main():
    '''
    Process every '*.pdf' file found in the current working directory in
    parallel, writing the results to files prefixed with "out_filename".
    '''
    multicore_function(process_pdf_list, glob.glob('*.pdf'), "out_filename")
if __name__ == '__main__':
    # freeze_support() has to run straight after the main guard so that a
    # frozen Windows executable can start its worker processes; it has no
    # effect in any other situation.
    freeze_support()
    main()
Author
neerajvashistha
commented
Jun 29, 2021
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment