Created
November 6, 2017 12:20
-
-
Save anonymous/e7d5873994a3c4bf2410268148c907ec to your computer and use it in GitHub Desktop.
Processing BSON Files (Kaggle cdiscount competition)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io
import multiprocessing as mp  # will come in handy due to the size of the data

import bson  # this is installed with the pymongo package
import matplotlib.pyplot as plt
import pandas as pd

from process import process
NCORE = 6


def main(bson_path=r'C:\nbs\cdiscount\data\train_example.bson'):
    """Stream a BSON file through a pool of worker processes.

    Every decoded document is pushed onto a bounded queue that the workers
    consume via ``process``; the workers record ``_id -> category_id`` in a
    manager-backed dict that is shared across processes.

    Args:
        bson_path: Path of the BSON file to decode.

    Returns:
        A pandas DataFrame indexed by ``_id`` with one ``category_id`` column.
    """
    # A Manager dict is a proxy object, so writes made in the worker
    # processes are visible here (a plain dict would not be shared).
    prod_to_category = mp.Manager().dict()  # note the difference
    q = mp.Queue(maxsize=NCORE)
    iolock = mp.Lock()
    # ``process`` is passed as the pool *initializer*: each worker runs it on
    # start-up and stays in it until it pulls a ``None`` sentinel.
    pool = mp.Pool(NCORE, initializer=process,
                   initargs=(q, iolock, prod_to_category))

    try:
        # process the file; the context manager closes the handle even when
        # decoding fails part-way through
        with open(bson_path, 'rb') as bson_file:
            for d in bson.decode_file_iter(bson_file):
                q.put(d)  # blocks until q below its max size
    finally:
        # tell workers we're done -- one sentinel per worker, sent even on
        # error so the workers do not wait on the queue forever
        for _ in range(NCORE):
            q.put(None)
        pool.close()
        pool.join()

    # convert back to normal dictionary before building the frame
    frame = pd.DataFrame.from_dict(dict(prod_to_category), orient='index')
    frame.index.name = '_id'
    frame.rename(columns={0: 'category_id'}, inplace=True)
    return frame


# The guard is required for multiprocessing under the "spawn" start method
# (the default on Windows): child processes re-import this module and must
# not start pools of their own.
if __name__ == '__main__':
    prod_to_category = main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from skimage.data import imread # or, whatever image library you prefer | |
import io | |
def process(q, iolock, prod_to_category):
    """Consume documents from ``q`` until a ``None`` sentinel arrives.

    For every document, its ``category_id`` is stored in ``prod_to_category``
    under the document's ``_id``, and each entry of ``d['imgs']`` is decoded
    from its ``'picture'`` bytes.

    Args:
        q: Queue-like object whose ``get()`` yields documents (mappings with
            ``'_id'``, ``'category_id'`` and ``'imgs'`` keys) or ``None``.
        iolock: Lock handed to every worker; not used by this function.
        prod_to_category: Mutable mapping that receives the
            ``_id -> category_id`` entries.

    Returns:
        None. Returns as soon as ``None`` is read from the queue.
    """
    while True:
        d = q.get()
        if d is None:
            # Sentinel: the producer has no more documents for this worker.
            break
        product_id = d['_id']
        category_id = d['category_id']
        prod_to_category[product_id] = category_id
        for pic in d['imgs']:
            # NOTE(review): ``imread`` comes from the module-level
            # ``skimage.data`` import; newer scikit-image releases appear to
            # expose it as ``skimage.io.imread`` -- confirm against the
            # installed version.
            picture = imread(io.BytesIO(pic['picture']))
            # do something with the picture, etc
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment