# -- assignment 1 --
import numpy as np
import pickle
import os

def load_synth(num_train=60_000, num_val=10_000, seed=0):
    """
    Load some very basic synthetic data that should be easy to classify. Two features, so that we can plot the
    decision boundary (which is an ellipse in the feature space).

    :param num_train: Number of training instances
    :param num_val: Number of test/validation instances
    :param seed: Random seed used to generate the data
    :return: Two tuples and an integer: (xtrain, ytrain), (xval, yval), num_cls. The first tuple contains a matrix of
        training data with 2 features as a numpy floating point array, and the corresponding classification labels as
        a numpy integer array. The second contains the test/validation data in the same format. The last integer is
        the number of classes (this is always 2 for this function).
    """
    np.random.seed(seed)

    THRESHOLD = 0.6
    quad = np.asarray([[1, -0.05], [1, .4]])

    ntotal = num_train + num_val
    x = np.random.randn(ntotal, 2)

    # compute the quadratic form q = x^T quad x for each instance; the label is 1 if q exceeds the threshold
    q = np.einsum('bf, fk, bk -> b', x, quad, x)
    y = (q > THRESHOLD).astype(int)

    return (x[:num_train, :], y[:num_train]), (x[num_train:, :], y[num_train:]), 2
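
# A minimal usage sketch (not part of the original loader) showing how the synthetic data is
# typically pulled in by the assignment code. The function and variable names below are
# illustrative assumptions, not prescribed anywhere above.
def _demo_load_synth():
    (xtrain, ytrain), (xval, yval), num_classes = load_synth()
    # xtrain: (60000, 2) floats, ytrain: (60000,) integers in {0, 1}, num_classes == 2
    print(xtrain.shape, ytrain.shape, xval.shape, yval.shape, num_classes)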

def load_mnist(final=False, flatten=True, shuffle_seed=0):
    """
    Load the MNIST data.

    :param final: If true, return the canonical test/train split. If false, split some validation data from the
        training data and keep the test data hidden.
    :param flatten: If true, each instance is flattened into a vector, so that the data is returned as a matrix with
        784 columns. If false, the data is returned as a 3-tensor preserving each image as a matrix.
    :param shuffle_seed: If >= 0, the data is shuffled. This keeps the canonical test/train split, but shuffles each
        part internally before splitting off a validation set. The given number is used as a seed. Note that the
        original data is _not_ shuffled, but ordered by writer. This means that there will be a distribution shift
        between train and val if the data is not shuffled.
    :return: Two tuples and an integer: (xtrain, ytrain), (xval, yval), num_cls. The first contains a matrix of
        training data and the corresponding classification labels as a numpy integer array. The second contains the
        test/validation data in the same format. The last integer is the number of classes (this is always 10 for
        this function).
    """
    if not os.path.isfile('mnist.pkl'):
        save_mnist()

    xtrain, ytrain, xtest, ytest = load()
    xtl, xsl = xtrain.shape[0], xtest.shape[0]

    if flatten:
        xtrain = xtrain.reshape(xtl, -1)
        xtest = xtest.reshape(xsl, -1)
    else:
        # the pickle stores the images flattened to 784-dimensional vectors, so reshape them
        # back into 28x28 matrices to return the 3-tensor promised in the docstring
        xtrain = xtrain.reshape(xtl, 28, 28)
        xtest = xtest.reshape(xsl, 28, 28)

    if shuffle_seed >= 0:
        rng = np.random.default_rng(shuffle_seed)

        p = rng.permutation(xtrain.shape[0])
        xtrain, ytrain = xtrain[p], ytrain[p]

        p = rng.permutation(xtest.shape[0])
        xtest, ytest = xtest[p], ytest[p]

    if not final:  # split the last 5000 training instances off as a validation set
        return (xtrain[:-5000], ytrain[:-5000]), (xtrain[-5000:], ytrain[-5000:]), 10

    return (xtrain, ytrain), (xtest, ytest), 10
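
# Another minimal usage sketch (again not part of the original loader), assuming the IDX files
# are already available locally so that mnist.pkl can be built. Variable names are illustrative.
def _demo_load_mnist():
    (xtrain, ytrain), (xval, yval), num_classes = load_mnist(final=False, flatten=True)
    # xtrain: (55000, 784) uint8 pixel values, xval: (5000, 784); num_classes == 10
    print(xtrain.shape, ytrain.shape, xval.shape, yval.shape, num_classes)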

# Numpy-only MNIST loader. Courtesy of Hyeonseok Jung
# https://github.com/hsjeong5/MNIST-for-Numpy
# Updated list of local files (not .gz files anymore)
filename = [
    ["training_images", "train-images.idx3-ubyte"],
    ["test_images", "t10k-images.idx3-ubyte"],
    ["training_labels", "train-labels.idx1-ubyte"],
    ["test_labels", "t10k-labels.idx1-ubyte"]
]

def save_mnist():
    mnist = {}
    FOLDER = "archive/"

    # Read image files (skip the 16-byte header: magic number, image count, rows, columns)
    for name in filename[:2]:
        with open(os.path.join(FOLDER, name[1]), 'rb') as f:
            mnist[name[0]] = np.frombuffer(f.read(), np.uint8, offset=16).reshape(-1, 28*28)

    # Read label files (skip the 8-byte header: magic number, label count)
    for name in filename[-2:]:
        with open(os.path.join(FOLDER, name[1]), 'rb') as f:
            mnist[name[0]] = np.frombuffer(f.read(), np.uint8, offset=8)

    # Save everything in one pickle file
    with open("mnist.pkl", 'wb') as f:
        pickle.dump(mnist, f)

    print("mnist.pkl created successfully!")

def load():
    with open("mnist.pkl", 'rb') as f:
        mnist = pickle.load(f)

    return mnist["training_images"], mnist["training_labels"], mnist["test_images"], mnist["test_labels"]

# Run this file as a script to generate mnist.pkl and print the shapes of the loaded arrays
if __name__ == "__main__":
    save_mnist()

    X_train, y_train, X_test, y_test = load()
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
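    # With the standard MNIST files in archive/, the print above shows:
    # (60000, 784) (60000,) (10000, 784) (10000,)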