-
-
Save ardamavi/a7d06ff8a315308771c70006cf494d69 to your computer and use it in GitHub Desktop.
# Arda Mavi | |
import os | |
import numpy as np | |
from os import listdir | |
from scipy.misc import imread, imresize | |
from keras.utils import to_categorical | |
from sklearn.model_selection import train_test_split | |
# Settings:
img_size = 64            # images are resized to img_size x img_size
grayscale_images = True  # if True, images are loaded as a single channel
num_class = 10           # number of label classes (digits 0-9)
test_size = 0.2          # fraction of samples held out for the test split
def get_img(data_path):
    """Read the image at *data_path* and return it resized to
    (img_size, img_size, channels), where channels is 1 when
    grayscale_images is True and 3 otherwise.

    NOTE(review): scipy.misc.imread/imresize were deprecated in SciPy 1.0
    and removed in later releases; on modern SciPy this function raises.
    The updated version further down this page replaces them with
    matplotlib's imread and skimage's resize.
    """
    # flatten=True collapses the image to a single grayscale channel.
    img = imread(data_path, flatten=grayscale_images)
    # Resize to the fixed network input shape.
    img = imresize(img, (img_size, img_size, 1 if grayscale_images else 3))
    return img
def get_dataset(dataset_path='Dataset'):
    """Load the image dataset, caching it as .npy files on first use.

    Expects *dataset_path* to contain one sub-folder per class label.
    Returns (X, X_test, Y, Y_test): scaled/inverted image arrays and
    one-hot encoded labels, split with sklearn's train_test_split.
    """
    try:
        # Fast path: reuse the cached arrays from a previous run.
        X = np.load('npy_dataset/X.npy')
        Y = np.load('npy_dataset/Y.npy')
    except (IOError, OSError):
        # Cache miss: build the arrays from the image folders.
        # Sort the labels: os.listdir returns entries in arbitrary order,
        # so without sorting the enumerate() index i would not correspond
        # to the folder name and images would get inconsistent labels.
        labels = sorted(listdir(dataset_path))
        X = []
        Y = []
        for i, label in enumerate(labels):
            datas_path = dataset_path + '/' + label
            for data in listdir(datas_path):
                img = get_img(datas_path + '/' + data)
                X.append(img)
                Y.append(i)
        # Create dataset: scale uint8 pixels to [0, 1] and invert.
        X = 1 - np.array(X).astype('float32') / 255.
        Y = np.array(Y).astype('float32')
        Y = to_categorical(Y, num_class)
        if not os.path.exists('npy_dataset/'):
            os.makedirs('npy_dataset/')
        np.save('npy_dataset/X.npy', X)
        np.save('npy_dataset/Y.npy', Y)
    X, X_test, Y, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)
    return X, X_test, Y, Y_test
if __name__ == '__main__':
    # Build (or load the cached) dataset when run as a script.
    get_dataset()
Because `listdir()` does not return file paths in any particular order, the matching of images and labels can go wrong, as some users have noted. The error originates at line 31, where `enumerate()` is applied to the unordered list of labels, so the index `i` and the folder `label` can be inconsistent. There are two solutions I can suggest:
- The one referenced by @felipheggaliza: change line 36 from `Y.append(i)` to `Y.append(int(label))`. This also removes the need for `enumerate()`, so line 31 can become `for label in labels:`.
- The other solution, from @BenjaminFraser's fork, is to sort the list returned by `listdir()` in line 28.
Some of the functions used here have been removed from their libraries, so this is an updated version. Hope it works.
import os
import numpy as np
from os import listdir
from matplotlib.pyplot import imread
from skimage.transform import resize
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
# Settings:
img_size = 64  # images are resized to img_size x img_size
grayscale_images = True  # if True, images are treated as single-channel
num_class = 10  # number of label classes (digits 0-9)
test_size = 0.2  # fraction of samples held out for the test split
def get_img(data_path):
    """Load the image at *data_path* and return it resized to the
    fixed network input shape (img_size, img_size, channels)."""
    channels = 1 if grayscale_images else 3
    raw = imread(data_path)
    return resize(raw, (img_size, img_size, channels))
def get_dataset(dataset_path='Dataset'):
    """Load the image dataset, caching it as .npy files on first use.

    Expects *dataset_path* to contain one sub-folder per class label.
    Returns (X, X_test, Y, Y_test): inverted image arrays in [0, 1]
    and one-hot encoded labels, split with sklearn's train_test_split.
    """
    try:
        # Fast path: reuse the cached arrays from a previous run.
        X = np.load('npy_dataset/X.npy')
        Y = np.load('npy_dataset/Y.npy')
    except (IOError, OSError):
        # Cache miss: build the arrays from the image folders.
        # Sort the labels: os.listdir returns entries in arbitrary order,
        # so without sorting the enumerate() index i would not correspond
        # to the folder name and images would get inconsistent labels.
        labels = sorted(listdir(dataset_path))
        X = []
        Y = []
        for i, label in enumerate(labels):
            datas_path = dataset_path + '/' + label
            for data in listdir(datas_path):
                img = get_img(datas_path + '/' + data)
                X.append(img)
                Y.append(i)
        # Create dataset. skimage.transform.resize already returns floats
        # scaled to [0, 1], so dividing by 255 again (as the old scipy
        # version did) would crush the dynamic range; keep only the
        # inversion to match the original preprocessing.
        X = 1 - np.array(X).astype('float32')
        Y = np.array(Y).astype('float32')
        Y = to_categorical(Y, num_class)
        if not os.path.exists('npy_dataset/'):
            os.makedirs('npy_dataset/')
        np.save('npy_dataset/X.npy', X)
        np.save('npy_dataset/Y.npy', Y)
    X, X_test, Y, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)
    return X, X_test, Y, Y_test
if __name__ == '__main__':
    # Smoke test: build the dataset and print every resulting split.
    X, X_test, Y, Y_test = get_dataset()
    for split in (X, X_test, Y, Y_test):
        print(split)
I think some of the functions are removed from the used libraries, so this is an updated version. Hope it works.
import os
import numpy as np
from os import listdir
from matplotlib.pyplot import imread
from skimage.transform import resize
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Settings:
img_size = 64            # images are resized to img_size x img_size
grayscale_images = True  # if True, images are treated as single-channel
num_class = 10           # number of label classes (digits 0-9)
test_size = 0.2          # fraction of samples held out for the test split


def get_img(data_path):
    """Load the image at *data_path* and return it resized to
    (img_size, img_size, channels)."""
    img = imread(data_path)
    img = resize(img, (img_size, img_size, 1 if grayscale_images else 3))
    return img


def get_dataset(dataset_path='Dataset'):
    """Load the dataset (one sub-folder per class), caching it as .npy
    files, and return the train/test split (X, X_test, Y, Y_test)."""
    try:
        # Fast path: reuse the cached arrays from a previous run.
        X = np.load('npy_dataset/X.npy')
        Y = np.load('npy_dataset/Y.npy')
    except (IOError, OSError):
        # Sort the labels: os.listdir returns entries in arbitrary order,
        # so without sorting the enumerate() index i would not correspond
        # to the folder name and images would get inconsistent labels.
        labels = sorted(listdir(dataset_path))
        X = []
        Y = []
        for i, label in enumerate(labels):
            datas_path = dataset_path + '/' + label
            for data in listdir(datas_path):
                img = get_img(datas_path + '/' + data)
                X.append(img)
                Y.append(i)
        # Create dataset: invert and scale the pixel values.
        X = 1 - np.array(X).astype('float32') / 255.
        Y = np.array(Y).astype('float32')
        Y = to_categorical(Y, num_class)
        if not os.path.exists('npy_dataset/'):
            os.makedirs('npy_dataset/')
        np.save('npy_dataset/X.npy', X)
        np.save('npy_dataset/Y.npy', Y)
    X, X_test, Y, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)
    return X, X_test, Y, Y_test


if __name__ == '__main__':
    X, X_test, Y, Y_test = get_dataset()
    print(X)
    print(X_test)
    print(Y)
    print(Y_test)
To make this code work correctly, you should change the following lines:
- Change the line "labels = listdir(dataset_path)" with "labels = ['0','1','2','3','4','5','6','7','8','9']"
- Change the line "X = 1 - np.array(X).astype('float32') / 255." with "X = np.array(X).astype('float32')"
how to download this dataset " sign language digit dataset".
I tried this code on google colab, but it gave errors.
Please help
how to download this dataset " sign language digit dataset".
I tried this code on google colab, but it gave errors.
Please help
You can use Kaggle API to download and use the dataset.
Please fix the script. I was stuck the whole day trying to figure out why my model was behaving incorrectly — the assigned labels differ from the ground truth.