Created
January 12, 2017 18:30
-
-
Save Coderx7/b43a206b20ff4765a1b28fe997f1137c to your computer and use it in GitHub Desktop.
CIFAR10-lmdb-zeropad-normalize script for caffe
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#in the name of God, the most compassionate the most merciful | |
#Seyyed Hossein Hasanpour | |
#[email protected] | |
#script for zeropadding and normalizing CIFAR10 dataset (can also be used for CIFAR100) | |
import math | |
import caffe | |
import lmdb | |
import numpy as np | |
from caffe.proto import caffe_pb2 | |
import cv2 | |
import sys | |
########## mean-std code ############
#use this for large datasets, where numpy.mean() fails
#since all of the dataset can not fit in the memory
def mean_semi_vectorized(a):
    """Per-channel mean of a 4-D array shaped (batch, channel, height, width).

    Reduces one (sample, channel) plane at a time, so no dataset-sized
    intermediate is ever created -- unlike a single numpy.mean() call on
    an array too large for memory.
    """
    batch, channel, height, width = a.shape
    # renamed from `sum` so the Python builtin is not shadowed
    totals = np.zeros(channel)
    for i in range(batch):
        for c in range(channel):
            totals[c] += np.sum(a[i, c, :, :])
    # each channel accumulated height*width values per sample
    return totals / (height * width * batch)

#semi-vectorized, very fast.
#use this if you face memory shortage errors because of your dataset being
#too big for your memory
def std_semi_vectorized(a):
    """Per-channel population standard deviation (ddof=0) of a 4-D array
    shaped (batch, channel, height, width), each value rounded to 8
    decimal places.

    Matches numpy.std(a, axis=(0, 2, 3)) but accumulates plane-by-plane
    to keep peak memory low.
    """
    batch, channel, height, width = a.shape
    mean = mean_semi_vectorized(a)
    # renamed from `sum` so the Python builtin is not shadowed
    totals = np.zeros(channel)
    for i in range(batch):
        for c in range(channel):
            # squared deviation of one plane from its channel mean;
            # (x - mean)**2 is already non-negative, so abs() was redundant
            totals[c] += np.sum((a[i, c, :, :] - mean[c]) ** 2)
    var = totals / (height * width * batch)
    return [round(math.sqrt(x), 8) for x in var]
########## Actual Code ############
# Open the source LMDBs.  The transactions request write=True, but they
# are only ever read from below; read-only transactions would also work.
db_train = lmdb.open('cifar10_train_lmdb')
db_train_txn = db_train.begin(write=True)
db_test = lmdb.open('cifar10_test_lmdb')
db_test_txn = db_test.begin(write=True)
datum = caffe_pb2.Datum()
# NOTE(review): the original assigned sys.argv[0] (the script path, not a
# command-line argument) to an unused `index` variable; that dead line
# was removed.
size_train = 50000  # CIFAR10 training-set size
size_test = 10000   # CIFAR10 test-set size
# Pre-allocated NCHW buffers: (samples, channels, height, width)
data_train = np.zeros((size_train, 3, 32, 32))
label_train = np.zeros(size_train, dtype=int)
data_test = np.zeros((size_test, 3, 32, 32))
label_test = np.zeros(size_test, dtype=int)
print 'Reading training data...'
# i tracks the row index into the pre-allocated data_train/label_train
# buffers; the cursor iterates records in LMDB key order.
i = -1
for key, value in db_train_txn.cursor():
    i = i + 1
    if i % 1000 == 0:
        print i  # progress indicator
    if i == size_train:
        # guard: stop if the DB holds more records than expected
        break
    datum.ParseFromString(value)
    label = datum.label
    # converted array must match the (3, 32, 32) buffer slot it fills
    data = caffe.io.datum_to_array(datum)
    data_train[i] = data
    label_train[i] = label
print 'Reading test data...'
# Same pattern as the training read: fill data_test/label_test row by row.
i = -1
for key, value in db_test_txn.cursor():
    i = i + 1
    if i % 1000 == 0:
        print i  # progress indicator
    if i ==size_test:
        # guard: stop if the DB holds more records than expected
        break
    datum.ParseFromString(value)
    label = datum.label
    # converted array must match the (3, 32, 32) buffer slot it fills
    data = caffe.io.datum_to_array(datum)
    data_test[i] = data
    label_test[i] = label
print 'Computing statistics...'
# Per-channel statistics over the TRAINING set only; axis=(0,2,3) reduces
# batch, height and width, leaving one value per colour channel.
mean = np.mean(data_train, axis=(0,2,3))
std = np.std(data_train, axis=(0,2,3))
print mean
print std
print mean.shape
print std.shape
#np.savetxt('mean_mnist.txt', mean)
#np.savetxt('std_mnist.txt', std)
print 'Normalizing'
# Standardise each channel to zero mean / unit variance.  The test set is
# deliberately normalised with the TRAINING statistics, so no test-set
# information leaks into preprocessing.
for i in range(3):
    print i
    data_train[:, i, :, :] = data_train[:, i, :, :] - mean[i]
    data_train[:, i, :, :] = data_train[:, i, :, :]/std[i]
    data_test[:, i, :, :] = data_test[:, i, :, :] - mean[i]
    data_test[:, i, :, :] = data_test[:, i, :, :]/std[i]
#Zero Padding
#print 'Padding...'
# Pad 4 zero pixels on every side of the spatial axes (2 and 3), growing
# 32x32 images to 40x40; batch and channel axes are left untouched.
npad = ((0,0), (0,0), (4,4), (4,4))
data_train = np.pad(data_train, pad_width=npad, mode='constant', constant_values=0)
data_test = np.pad(data_test, pad_width=npad, mode='constant', constant_values=0)
print 'Outputting training data'
lmdb_file ='cifar10_train_lmdb_norm2'
# batch_size equals the dataset size, so the in-loop commit fires exactly
# once, on the final record.
batch_size = size_train
# NOTE(review): map_size is the raw array size in bytes, but serialized
# Datum records add protobuf overhead on top of the pixel data -- confirm
# writes do not hit lmdb.MapFullError.
db = lmdb.open(lmdb_file, map_size=int(data_train.nbytes))
batch = db.begin(write=True)
datum = caffe_pb2.Datum()
for i in range(size_train):
    if i % 1000 == 0:
        print i  # progress indicator
    # save in datum
    datum = caffe.io.array_to_datum(data_train[i], label_train[i])
    # zero-padded 5-digit keys keep LMDB key order == sample index order
    keystr = '{:0>5d}'.format(i)
    batch.put( keystr, datum.SerializeToString() )
    # write batch
    if(i + 1) % batch_size == 0:
        batch.commit()
        batch=db.begin(write=True)
        print (i + 1)
# write last batch (only if the loop ended mid-batch)
if (i+1) % batch_size != 0:
    batch.commit()
    print 'last batch'
    print (i + 1)
print 'Outputting test data'
lmdb_file = 'cifar10_test_lmdb_norm2'
# batch_size equals the dataset size, so the in-loop commit fires exactly
# once, on the final record.
batch_size = size_test
# NOTE(review): map_size is the raw array size in bytes, but serialized
# Datum records add protobuf overhead on top of the pixel data -- confirm
# writes do not hit lmdb.MapFullError.
db = lmdb.open(lmdb_file,map_size=int(data_test.nbytes))
batch = db.begin(write=True)
datum = caffe_pb2.Datum()
for i in range(size_test):
    # save in datum
    datum = caffe.io.array_to_datum(data_test[i], label_test[i])
    # zero-padded 5-digit keys keep LMDB key order == sample index order
    keystr = '{:0>5d}'.format(i)
    batch.put( keystr, datum.SerializeToString() )
    # write batch
    if(i + 1) % batch_size == 0:
        batch.commit()
        batch = db.begin(write=True)
        print (i + 1)
# write last batch (only if the loop ended mid-batch)
if (i+1) % batch_size != 0:
    batch.commit()
    print 'last batch'
    print (i + 1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment