# -- assignment 1 --
import numpy as np
import pickle
import os

def load_synth(num_train=60_000, num_val=10_000, seed=0):
    """
    Load some very basic synthetic data that should be easy to classify. Two features, so that we can plot the
    decision boundary (which is an ellipse in the feature space).

    :param num_train: Number of training instances
    :param num_val: Number of test/validation instances
    :param seed: Random seed used to generate the data
    :return: Two tuples and an integer: (xtrain, ytrain), (xval, yval), num_cls. The first tuple contains a matrix of
        training data with 2 features as a numpy floating point array, and the corresponding classification labels as
        a numpy integer array. The second contains the test/validation data in the same format. The last integer is
        the number of classes (this is always 2 for this function).
    """
    np.random.seed(seed)

    THRESHOLD = 0.6
    quad = np.asarray([[1, -0.05], [1, .4]])

    ntotal = num_train + num_val
    x = np.random.randn(ntotal, 2)

    # compute the quadratic form q = x^T quad x for each instance; the label is 1 if q exceeds the threshold
    q = np.einsum('bf, fk, bk -> b', x, quad, x)
    y = (q > THRESHOLD).astype(int)

    return (x[:num_train, :], y[:num_train]), (x[num_train:, :], y[num_train:]), 2
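
# A minimal usage sketch (not part of the original loader) showing how the synthetic data is
# typically pulled in by the assignment code. The function and variable names below are
# illustrative assumptions, not prescribed anywhere above.
def _demo_load_synth():
    (xtrain, ytrain), (xval, yval), num_classes = load_synth()
    # xtrain: (60000, 2) floats, ytrain: (60000,) integers in {0, 1}, num_classes == 2
    print(xtrain.shape, ytrain.shape, xval.shape, yval.shape, num_classes)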

def load_mnist(final=False, flatten=True, shuffle_seed=0):
    """
    Load the MNIST data.

    :param final: If true, return the canonical test/train split. If false, split some validation data from the
        training data and keep the test data hidden.
    :param flatten: If true, each instance is flattened into a vector, so that the data is returned as a matrix with
        784 columns. If false, the data is returned as a 3-tensor preserving each image as a matrix.
    :param shuffle_seed: If >= 0, the data is shuffled. This keeps the canonical test/train split, but shuffles each
        part internally before splitting off a validation set. The given number is used as a seed. Note that the
        original data is _not_ shuffled, but ordered by writer. This means that there will be a distribution shift
        between train and val if the data is not shuffled.
    :return: Two tuples and an integer: (xtrain, ytrain), (xval, yval), num_cls. The first contains a matrix of
        training data and the corresponding classification labels as a numpy integer array. The second contains the
        test/validation data in the same format. The last integer is the number of classes (this is always 10 for
        this function).
    """
    if not os.path.isfile('mnist.pkl'):
        save_mnist()

    xtrain, ytrain, xtest, ytest = load()
    xtl, xsl = xtrain.shape[0], xtest.shape[0]

    if flatten:
        xtrain = xtrain.reshape(xtl, -1)
        xtest = xtest.reshape(xsl, -1)
    else:
        # the pickle stores the images flattened to 784-dimensional vectors, so reshape them
        # back into 28x28 matrices to return the 3-tensor promised in the docstring
        xtrain = xtrain.reshape(xtl, 28, 28)
        xtest = xtest.reshape(xsl, 28, 28)

    if shuffle_seed >= 0:
        rng = np.random.default_rng(shuffle_seed)

        p = rng.permutation(xtrain.shape[0])
        xtrain, ytrain = xtrain[p], ytrain[p]

        p = rng.permutation(xtest.shape[0])
        xtest, ytest = xtest[p], ytest[p]

    if not final:  # split the last 5000 training instances off as a validation set
        return (xtrain[:-5000], ytrain[:-5000]), (xtrain[-5000:], ytrain[-5000:]), 10

    return (xtrain, ytrain), (xtest, ytest), 10
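
# Another minimal usage sketch (again not part of the original loader), assuming the IDX files
# are already available locally so that mnist.pkl can be built. Variable names are illustrative.
def _demo_load_mnist():
    (xtrain, ytrain), (xval, yval), num_classes = load_mnist(final=False, flatten=True)
    # xtrain: (55000, 784) uint8 pixel values, xval: (5000, 784); num_classes == 10
    print(xtrain.shape, ytrain.shape, xval.shape, yval.shape, num_classes)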

# Numpy-only MNIST loader. Courtesy of Hyeonseok Jung
# https://github.com/hsjeong5/MNIST-for-Numpy
# Updated list of local files (not .gz files anymore)
filename = [
    ["training_images", "train-images.idx3-ubyte"],
    ["test_images", "t10k-images.idx3-ubyte"],
    ["training_labels", "train-labels.idx1-ubyte"],
    ["test_labels", "t10k-labels.idx1-ubyte"]
]

def save_mnist():
    mnist = {}
    FOLDER = "archive/"

    # Read image files (skip the 16-byte header: magic number, image count, rows, columns)
    for name in filename[:2]:
        with open(os.path.join(FOLDER, name[1]), 'rb') as f:
            mnist[name[0]] = np.frombuffer(f.read(), np.uint8, offset=16).reshape(-1, 28*28)

    # Read label files (skip the 8-byte header: magic number, label count)
    for name in filename[-2:]:
        with open(os.path.join(FOLDER, name[1]), 'rb') as f:
            mnist[name[0]] = np.frombuffer(f.read(), np.uint8, offset=8)

    # Save everything in one pickle file
    with open("mnist.pkl", 'wb') as f:
        pickle.dump(mnist, f)

    print("mnist.pkl created successfully!")

def load():
    with open("mnist.pkl", 'rb') as f:
        mnist = pickle.load(f)

    return mnist["training_images"], mnist["training_labels"], mnist["test_images"], mnist["test_labels"]

# Run this file as a script to generate mnist.pkl and print the shapes of the loaded arrays
if __name__ == "__main__":
    save_mnist()

    X_train, y_train, X_test, y_test = load()
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
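    # With the standard MNIST files in archive/, the print above shows:
    # (60000, 784) (60000,) (10000, 784) (10000,)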