Skip to content

Instantly share code, notes, and snippets.

@danielhavir
Created November 21, 2017 13:47
Show Gist options
  • Save danielhavir/a61a68a84f61eb1504aaa7f2463b20c0 to your computer and use it in GitHub Desktop.
Save danielhavir/a61a68a84f61eb1504aaa7f2463b20c0 to your computer and use it in GitHub Desktop.
import bson # pip install pymongo
import numpy as np
import pandas as pd
import os
from tqdm import tqdm_notebook
# Extract every image stored in a BSON product dump into per-category folders:
#     <out_folder>/<category_id>/<product _id>-<image index>.jpg
out_folder = 'images'

# Create output folder. exist_ok=True keeps the script re-runnable and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(out_folder, exist_ok=True)

# Create categories folders: one sub-folder per category_id in the CSV index.
categories = pd.read_csv('data/category_names.csv', index_col='category_id')
for category in tqdm_notebook(categories.index):
    # makedirs(exist_ok=True) instead of mkdir so that a second run (or a
    # partially completed earlier run) does not die with FileExistsError.
    os.makedirs(os.path.join(out_folder, str(category)), exist_ok=True)

num_products = 7069896  # 7069896 for train and 1768182 for test

# The progress bar is used as a context manager so it is always closed, even
# when decoding or writing fails part-way through the dump.
with tqdm_notebook(total=num_products) as bar, \
        open('data/train.bson', 'rb') as fbson:
    # decode_file_iter yields one decoded document at a time instead of
    # loading the whole file into memory.
    data = bson.decode_file_iter(fbson)
    for d in data:
        category = d['category_id']
        _id = d['_id']
        # A product may carry several images; the enumerate index keeps the
        # file names unique within one product.
        for e, pic in enumerate(d['imgs']):
            fname = os.path.join(out_folder, str(category),
                                 '{}-{}.jpg'.format(_id, e))
            with open(fname, 'wb') as f:
                # The picture payload is written out unchanged as raw bytes.
                f.write(pic['picture'])
        # One tick per product document: the bar's total is a product count,
        # not an image count.
        bar.update()

print('Done')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment