Skip to content

Instantly share code, notes, and snippets.

@WillKoehrsen
Created September 23, 2018 02:08
Show Gist options
  • Save WillKoehrsen/a3aa94b49e984e394d3d7e51b341a729 to your computer and use it in GitHub Desktop.
Save WillKoehrsen/a3aa94b49e984e394d3d7e51b341a729 to your computer and use it in GitHub Desktop.
import json
import os
from itertools import chain
from multiprocessing.dummy import Pool as Threadpool
def read_data(file_path):
    """Read newline-delimited JSON from `file_path`.

    Every non-blank line of the file is parsed as one JSON document.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one decoded object per non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8, so do not depend on the platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as fin:
        # Iterate the file lazily rather than materializing `readlines()`,
        # and skip blank lines (e.g. a trailing empty line), which
        # `json.loads` would otherwise reject.
        return [json.loads(line) for line in fin if line.strip()]
# Directory containing the partition files to load
partitions_dir = '/data/wiki/partitions/'

# List of files to read in. `os.listdir` returns entries in arbitrary order,
# so sort them to make the order of `book_list` reproducible between runs.
saved_files = [partitions_dir + x for x in sorted(os.listdir(partitions_dir))]

# Create a threadpool for reading in files. The `with` block shuts the pool
# down once the work is done so its worker threads are not leaked; `map`
# blocks until every file has been read, so no results are lost.
with Threadpool(processes=10) as threadpool:
    # Read in the files as a list of lists (one inner list per file)
    results = threadpool.map(read_data, saved_files)

# Flatten the list of lists to a single list without unpacking `results`
# into positional arguments.
book_list = list(chain.from_iterable(results))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment