Created
November 26, 2020 09:40
-
-
Save packmad/c7c79d60d36c43798344dd7a31b5bca1 to your computer and use it in GitHub Desktop.
Load all JSON files from an input folder using multiprocessing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import os
import sys
import time
from multiprocessing import Pool
from os.path import abspath, isdir, join
from typing import Dict, List, Optional
def os_listdir_json(folder: str) -> List[str]:
    """Return the absolute paths of the ``.json`` files directly inside *folder*.

    Only directory entries whose name ends with ``.json`` and that are files
    (or symlinks resolving to files) are returned; a sub-directory whose name
    happens to end in ``.json`` is skipped because it cannot be opened and
    parsed as a JSON document.  The search is not recursive.

    The result is sorted so the order is deterministic; the operating system
    gives no ordering guarantee for directory listings.

    Raises:
        OSError: if *folder* does not exist or cannot be listed.
    """
    with os.scandir(folder) as entries:
        # entry.path is join(folder, entry.name), so abspath() yields the same
        # value as abspath(join(folder, name)) on a plain listing.
        return sorted(
            abspath(entry.path)
            for entry in entries
            if entry.name.endswith('.json') and entry.is_file()
        )
def read_json(file_path: str) -> Dict:
    """Parse the JSON document stored at *file_path* and return it.

    The file is decoded as UTF-8.  A leading byte-order mark, if present, is
    skipped instead of making the parser fail.

    Returns:
        Whatever ``json.load`` produces for the document.  The annotation
        says ``Dict``, but a file whose top-level value is not a JSON object
        yields the corresponding Python value instead.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    # 'utf-8-sig' decodes plain UTF-8 unchanged and strips a BOM when present.
    with open(file_path, 'r', encoding='utf-8-sig') as in_file:
        return json.load(in_file)
def load_dataset(src_folder: str, processes: Optional[int] = None) -> List[Dict]:
    """Load every ``.json`` file found in *src_folder* using a process pool.

    The files are discovered with ``os_listdir_json`` and each one is parsed
    by ``read_json`` in a worker process.  A one-line timing summary is
    printed to standard output.

    Args:
        src_folder: Folder to scan for ``.json`` files.
        processes: Number of worker processes.  ``None`` means "use the
            machine's CPU count" (falling back to 1 when it cannot be
            determined).

    Returns:
        The parsed documents, in the same order as the paths returned by
        ``os_listdir_json``.

    Raises:
        Any exception raised while listing the folder or by ``read_json`` in
        a worker; ``Pool.map`` re-raises worker exceptions in the caller.
    """
    if processes is None:
        # os.cpu_count() may return None; keep the summary line meaningful.
        processes = os.cpu_count() or 1
    with Pool(processes) as pool:
        # The timer starts after the pool exists, so it covers listing the
        # folder and loading the files but not creating the pool.
        # perf_counter() is monotonic, unlike the wall clock time.time().
        start_time = time.perf_counter()
        res = pool.map(read_json, os_listdir_json(src_folder))
        elapsed = round(time.perf_counter() - start_time, 1)
    print(f'> Loaded {len(res)} files in {elapsed} sec using {processes} processes')
    return res
if __name__ == "__main__":
    # Benchmark driver: load the folder given as the single command-line
    # argument once for every pool size from 2 up to the CPU count, so the
    # timing lines printed by load_dataset can be compared.
    if len(sys.argv) != 2:
        sys.exit('Wrong number of args')
    src_folder = sys.argv[1]
    # Explicit check rather than `assert`: assertions are removed when Python
    # runs with -O, which would let an invalid path through.
    if not isdir(src_folder):
        sys.exit(f'Not a directory: {src_folder}')
    # os.cpu_count() may return None; treat that as a single CPU.
    max_processes = os.cpu_count() or 1
    # The sweep starts at 2 workers, so nothing runs on a single-CPU machine.
    for cpu_i in range(2, max_processes + 1):
        load_dataset(src_folder, cpu_i)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment