
@ardamavi
Last active June 7, 2023 02:20
For reading datasets and converting to numpy files.
# Arda Mavi
import os
import numpy as np
from os import listdir
from scipy.misc import imread, imresize
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Settings:
img_size = 64
grayscale_images = True
num_class = 10
test_size = 0.2


def get_img(data_path):
    # Getting image array from path:
    img = imread(data_path, flatten=grayscale_images)
    img = imresize(img, (img_size, img_size, 1 if grayscale_images else 3))
    return img


def get_dataset(dataset_path='Dataset'):
    # Getting all data from data path:
    try:
        # Reuse the cached arrays if they were saved on a previous run:
        X = np.load('npy_dataset/X.npy')
        Y = np.load('npy_dataset/Y.npy')
    except:
        labels = listdir(dataset_path)  # Getting labels from the class sub-folder names
        X = []
        Y = []
        for i, label in enumerate(labels):
            datas_path = dataset_path + '/' + label
            for data in listdir(datas_path):
                img = get_img(datas_path + '/' + data)
                X.append(img)
                Y.append(i)
        # Create dataset: invert and scale pixels to [0, 1], one-hot encode labels:
        X = 1 - np.array(X).astype('float32') / 255.
        Y = np.array(Y).astype('float32')
        Y = to_categorical(Y, num_class)
        if not os.path.exists('npy_dataset/'):
            os.makedirs('npy_dataset/')
        np.save('npy_dataset/X.npy', X)
        np.save('npy_dataset/Y.npy', Y)
    X, X_test, Y, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)
    return X, X_test, Y, Y_test


if __name__ == '__main__':
    get_dataset()
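
The script expects one sub-folder per class under Dataset/ (for the sign-language digits, folders named 0 through 9, each holding that class's images). A minimal usage sketch, assuming the gist is saved as get_dataset.py next to that folder:

# Hypothetical usage sketch; the file and folder names here are assumptions.
from get_dataset import get_dataset

X, X_test, Y, Y_test = get_dataset('Dataset')
print(X.shape, Y.shape)            # e.g. (N_train, 64, 64, 1) and (N_train, 10)
print(X_test.shape, Y_test.shape)  # the held-out 20% split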
@BrodaUa

BrodaUa commented Oct 29, 2018

Please fix the script. I was stuck the whole day trying to figure out why my model was working incorrectly. The assigned labels differ from the ground truth.

[attached screenshot: the assigned labels compared with the ground truth]

@panosgemos

Because listdir() does not return directory entries in any guaranteed order, the matching of images and labels can go wrong, as some users have noted. The origin of the error is the line where enumerate() is applied to the unordered list of labels, so the index i assigned to each label can change between runs and machines. There are two solutions I can suggest (a combined sketch follows the list):

  1. The one referenced by @felipheggaliza, which is to change Y.append(i) to Y.append(int(label)). This also drops the necessity for enumerate(), so the outer loop can become: for label in labels: .

  2. The other solution, noted in @BenjaminFraser's fork, is to sort the output of listdir() before enumerating it.
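
A minimal sketch of both fixes, assuming the class sub-folders are named with the digit strings '0' through '9':

# Hypothetical sketch of the two label-ordering fixes discussed above.
from os import listdir

# Fix 1: derive the label from the folder name, so listdir() order no longer matters.
def labels_from_folder_names(dataset_path='Dataset'):
    Y = []
    for label in listdir(dataset_path):
        for data in listdir(dataset_path + '/' + label):
            Y.append(int(label))  # folder '3' -> class 3
    return Y

# Fix 2: sort the folder list so enumerate() always yields the same index per folder.
def labels_from_sorted_folders(dataset_path='Dataset'):
    Y = []
    for i, label in enumerate(sorted(listdir(dataset_path))):
        for data in listdir(dataset_path + '/' + label):
            Y.append(i)
    return Y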

@prime-hacker

prime-hacker commented Dec 21, 2019

I think some of the functions have been removed from the libraries used here, so this is an updated version. I hope it works.

import os
import numpy as np
from os import listdir
from matplotlib.pyplot import imread
from skimage.transform import resize
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split


# Settings:
img_size = 64
grayscale_images = True
num_class = 10
test_size = 0.2


def get_img(data_path):
    # Getting image array from path:
    img = imread(data_path)
    img = resize(img, (img_size, img_size, 1 if grayscale_images else 3))
    return img


def get_dataset(dataset_path='Dataset'):
    # Getting all data from data path:
    try:
        X = np.load('npy_dataset/X.npy')
        Y = np.load('npy_dataset/Y.npy')
    except:
        labels = listdir(dataset_path)  # Getting labels
        X = []
        Y = []
        for i, label in enumerate(labels):
            datas_path = dataset_path + '/' + label
            for data in listdir(datas_path):
                img = get_img(datas_path + '/' + data)
                X.append(img)
                Y.append(i)
        # Create dataset:
        X = 1 - np.array(X).astype('float32') / 255.
        Y = np.array(Y).astype('float32')
        Y = to_categorical(Y, num_class)
        if not os.path.exists('npy_dataset/'):
            os.makedirs('npy_dataset/')
        np.save('npy_dataset/X.npy', X)
        np.save('npy_dataset/Y.npy', Y)
    X, X_test, Y, Y_test = train_test_split(X, Y, test_size=test_size, random_state=42)
    return X, X_test, Y, Y_test


if __name__ == '__main__':
    X, X_test, Y, Y_test = get_dataset()
    print(X)
    print(X_test)
    print(Y)
    print(Y_test)
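
A quick sanity check on what the loader produced (a minimal sketch, assuming the npy_dataset/ files were written by the code above). Note that skimage.transform.resize already returns floats in [0, 1] by default, so the extra division by 255 shows up here as a very narrow value range near 1.0:

# Hypothetical sanity check for the saved arrays.
import numpy as np

X = np.load('npy_dataset/X.npy')
Y = np.load('npy_dataset/Y.npy')

print('X shape:', X.shape)                  # expected (num_samples, 64, 64, 1)
print('Y shape:', Y.shape)                  # expected (num_samples, 10)
print('X value range:', X.min(), X.max())   # a range squeezed near 1.0 hints at double scaling
print('samples per class:', Y.sum(axis=0))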

@Ehsan-Yaghoubi

Ehsan-Yaghoubi commented Jun 19, 2020

(Quoting @prime-hacker's updated version above.)

To make this code work correctly, you should change the following lines (a sketch with both changes applied follows the list):

  1. Change the line "labels = listdir(dataset_path)" to "labels = ['0','1','2','3','4','5','6','7','8','9']".
  2. Change the line "X = 1 - np.array(X).astype('float32') / 255." to "X = np.array(X).astype('float32')".
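
A minimal sketch of the array-building step with both changes applied (the helper name build_arrays is mine; pass in the get_img() defined in the comment above):

# Hypothetical sketch of get_dataset()'s build step with the two changes applied.
import numpy as np
from os import listdir
from keras.utils import to_categorical

def build_arrays(get_img, dataset_path='Dataset', num_class=10):
    labels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']  # fixed, ordered label list
    X, Y = [], []
    for i, label in enumerate(labels):
        datas_path = dataset_path + '/' + label
        for data in listdir(datas_path):
            X.append(get_img(datas_path + '/' + data))
            Y.append(i)
    X = np.array(X).astype('float32')  # no inversion or /255: resize() already returns [0, 1]
    Y = to_categorical(np.array(Y).astype('float32'), num_class)
    return X, Y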

@BhoomiBM

How can I download the "Sign Language Digits Dataset"?
I tried this code on Google Colab, but it gave errors.
Please help.

@ardamavi
Author

You can use the Kaggle API to download and use the dataset.
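
For example, a sketch with the kaggle Python package, assuming it is installed and an API token exists in ~/.kaggle/kaggle.json; the dataset slug below is my assumption of where the dataset lives on Kaggle:

# Hypothetical download sketch using the Kaggle API client.
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()
# The slug 'ardamavi/sign-language-digits-dataset' is an assumption; adjust if needed.
api.dataset_download_files('ardamavi/sign-language-digits-dataset', path='Dataset', unzip=True)

You may still need to rearrange the unzipped files into the Dataset/<digit>/ sub-folders that the script expects.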
