Softmax regression
disp("Multi-class classification with softmax regression, from scratch...");
function [h, display_array] = displayData(X, example_width)
  %DISPLAYDATA Display 2D data in a nice grid
  %   [h, display_array] = DISPLAYDATA(X, example_width) displays 2D data
  %   stored in X in a nice grid. It returns the figure handle h and the
  %   displayed array if requested.

  % Set example_width automatically if not passed in
  if ~exist('example_width', 'var') || isempty(example_width)
    example_width = round(sqrt(size(X, 2)));
  end

  % Gray image
  colormap(gray);

  % Compute rows, cols
  [m, n] = size(X);
  example_height = (n / example_width);

  % Compute number of items to display
  display_rows = floor(sqrt(m));
  display_cols = ceil(m / display_rows);

  % Between images padding
  pad = 1;

  % Setup blank display
  display_array = -ones(pad + display_rows * (example_height + pad), ...
                        pad + display_cols * (example_width + pad));

  % Copy each example into a patch on the display array
  curr_ex = 1;
  for j = 1:display_rows
    for i = 1:display_cols
      if curr_ex > m
        break;
      end
      % Normalize each patch by its max absolute value before copying
      max_val = max(abs(X(curr_ex, :)));
      display_array(pad + (j - 1) * (example_height + pad) + (1:example_height), ...
                    pad + (i - 1) * (example_width + pad) + (1:example_width)) = ...
          reshape(X(curr_ex, :), example_height, example_width) / max_val;
      curr_ex = curr_ex + 1;
    end
    if curr_ex > m
      break;
    end
  end

  % Display image, do not show axis
  h = imagesc(display_array, [-1 1]);
  axis image off
  drawnow;
endfunction
function g = sigmoid(z)
  g = 1.0 ./ (1.0 + exp(-z));
endfunction
function g = sigmoidGradient(z)
  g = sigmoid(z) .* (1 - sigmoid(z));
endfunction
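% Glorot-style uniform initialization: weights are drawn from
% [-epsilon, epsilon] with epsilon = sqrt(6) / sqrt(L_in + L_out),
% breaking symmetry while keeping the activation variance roughly constant.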
function W = randInitializeWeights(L_in, L_out)
  epsilon_init = sqrt(6) / sqrt(L_in + L_out);
  W = rand(L_out, 1 + L_in) * 2 * epsilon_init - epsilon_init;
endfunction
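% Row-wise softmax over the output logits:
%   g_k = exp(z_k) / sum_j exp(z_j)
% Each row of g is a probability distribution over the num_labels classes.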
function g = softmax(z)
  % Shift each row by its max before exponentiating; softmax is invariant
  % under this shift, and it prevents overflow for large logits.
  z = z - max(z, [], 2);
  g = exp(z) ./ sum(exp(z), 2);
endfunction
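% Cost and gradient for a one-hidden-layer network with a softmax output.
% Forward pass: a1 = [1 X], z2 = a1*Theta1', a2 = [1 sigmoid(z2)],
% z3 = a2*Theta2', a3 = softmax(z3).
% Cross-entropy cost: J = -(1/m) * sum_i sum_k y_ik * log(a3_ik) + L2 penalty.
% With softmax plus cross-entropy the output delta simplifies to delta3 = a3 - y.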
function [J, grad] = nnCostFunction(nn_params, ...
                                    input_layer_size, ...
                                    hidden_layer_size, ...
                                    num_labels, ...
                                    X, y, lambda)
  % Unroll the flat parameter vector back into the two weight matrices
  Theta1 = reshape(nn_params(1:hidden_layer_size * (input_layer_size + 1)), ...
                   hidden_layer_size, (input_layer_size + 1));
  Theta2 = reshape(nn_params((1 + (hidden_layer_size * (input_layer_size + 1))):end), ...
                   num_labels, (hidden_layer_size + 1));
  m = size(X, 1);

  % Add the bias column and one-hot encode the labels
  X = [ones(m, 1) X];
  y = (y == 1:num_labels);

  % Forward propagation
  Z_2 = X * Theta1';
  L_2 = sigmoid(Z_2);
  L_2 = [ones(m, 1) L_2];
  Z_3 = L_2 * Theta2';
  L_3 = softmax(Z_3);

  % Cross-entropy cost
  J = (1 / m) * sum(sum(-y .* log(L_3), 2), 1);

  % Backpropagation (softmax + cross-entropy gives delta3 = L_3 - y)
  Loss_3 = L_3 - y;
  Loss_2 = Loss_3 * Theta2(:, 2:end) .* sigmoidGradient(Z_2);
  Theta2_grad = Loss_3' * L_2;
  Theta1_grad = Loss_2' * X;

  % Regularization: bias columns are excluded from the penalty
  Theta2_zero_bias = Theta2;
  Theta2_zero_bias(:, 1) = 0;
  Theta1_zero_bias = Theta1;
  Theta1_zero_bias(:, 1) = 0;
  J_reg = sum(sum(Theta1_zero_bias .^ 2)) + sum(sum(Theta2_zero_bias .^ 2));
  J = J + (lambda / (2 * m)) * J_reg;
  Theta2_grad = (1 / m) * (Theta2_grad + lambda * Theta2_zero_bias);
  Theta1_grad = (1 / m) * (Theta1_grad + lambda * Theta1_zero_bias);

  % Flatten gradients
  grad = [Theta1_grad(:); Theta2_grad(:)];
endfunction
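% Numerical gradient via central differences:
%   dJ/dtheta_i ~= (J(theta + e*u_i) - J(theta - e*u_i)) / (2e)
% Vectorized: each column of the repmat'ed parameter matrix is perturbed
% along one coordinate, so a single arrayfun sweep evaluates all the costs.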
function numericalGrad = computeNumericalGradient(J, nn_params)
  m = size(nn_params, 1);
  mask = eye(m, m);
  e = 1e-4;
  nn_params = repmat(nn_params, 1, m);
  deviation = mask .* e;
  iter = 1:m;
  diff1 = nn_params - deviation;
  loss1 = arrayfun(@(i) J(diff1(:, i)), iter, 'UniformOutput', false); % https://stackoverflow.com/a/10898162/1835650
  loss1 = [loss1{:}] .* mask;
  diff2 = nn_params + deviation;
  loss2 = arrayfun(@(i) J(diff2(:, i)), iter, 'UniformOutput', false);
  loss2 = [loss2{:}] .* mask;
  numericalGrad = sum((loss2 - loss1), 2);
  numericalGrad = numericalGrad ./ (2 * e);
endfunction
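% Compare analytic and numerical gradients with the relative difference
% norm(g_num - g) / norm(g_num + g); values well below 1e-9 indicate
% that backpropagation is implemented correctly.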
function checkGradients(nn_params, costFunction)
  [cost, grad] = costFunction(nn_params);
  numericalGrad = computeNumericalGradient(costFunction, nn_params);
  diff = norm(numericalGrad - grad) / norm(numericalGrad + grad);
  checkSign = "𐄂";
  if (diff < 1e-9)
    checkSign = "✓";
  endif
  printf('\nGradient check relative difference (expect < 1e-9): %g (%s)\n', diff, checkSign);
endfunction
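% Minimize the cost with fminunc, supplying the analytic gradient
% ('GradObj', 'on') so the optimizer does not fall back to finite differences.
% The gradient check only runs during the first epoch to keep training fast.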
function [nn_params, cost, info, output, grad, hess] = fit_nn(X, y, epoch, maxIter, input_layer_size, hidden_layer_size, num_labels, nn_params, lambda, checkGradientsFunc)
  options = optimset('GradObj', 'on', 'MaxIter', maxIter, 'Display', 'iter');
  costFunction = @(p) nnCostFunction(p, input_layer_size, hidden_layer_size, num_labels, X, y, lambda);
  if exist('checkGradientsFunc', 'var') && epoch < 2
    checkGradientsFunc(nn_params, costFunction);
  endif
  [nn_params, cost, info, output, grad, hess] = fminunc(costFunction, nn_params, options);
endfunction
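% One training step: optimize on the current mini-batch, then report the
% unregularized (lambda = 0) cost on that batch and on the validation set.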
function [nn_params, cost, train_cost, validation_cost] = fit_step(X, y, X_cv, y_cv, EPOCHS, epoch, batch_iter, batch_start, batch_end, batch_size, maxIter, input_layer_size, hidden_layer_size, num_labels, nn_params, lambda, checkGradientsFunc)
  [nn_params, cost] = fit_nn(X, y, epoch, maxIter, input_layer_size, hidden_layer_size, num_labels, nn_params, lambda, checkGradientsFunc);
  % Evaluate the cost at the updated parameters directly; re-running the
  % optimizer here would report the cost of a further-trained model instead.
  train_cost = nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, 0);
  validation_cost = nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X_cv, y_cv, 0);
  printf('Epoch: %6i/%i | batch: %6i | start: %6i | end: %6i | size: %6i | cost: %4.6f | train cost: %4.6f | validation cost: %4.6f\n', epoch, EPOCHS, batch_iter, batch_start, batch_end, batch_size, cost, train_cost, validation_cost);
endfunction
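% Reshuffle the training set each epoch so mini-batches are drawn in a
% different order on every pass over the data.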
function [X_shuffled, y_shuffled] = shuffle(X, y)
  shuffle_order = randperm(size(X, 1));
  X_shuffled = X(shuffle_order, :);
  y_shuffled = y(shuffle_order);
endfunction
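% Entry point. Assumes minist.mat provides X (m x 400, rows of 20x20 digit
% images) and y (labels 1..10, with "0" mapped to 10), matching the comments
% on input_layer_size and num_labels below.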
function main()
  load('minist.mat');
  m = size(X, 1);

  % Randomly select 100 data points to display
  sel = randperm(size(X, 1));
  sel = sel(1:100);
  displayData(X(sel, :));

  input_layer_size = 400;  % 20x20 input images of digits
  hidden_layer_size = 25;  % 25 hidden units
  num_labels = 10;         % 10 labels, from 1 to 10 (note that we have mapped "0" to label 10)
  lambda = 1;
  maxIter = 50;
  batch_size = 1024;
  EPOCHS = 32;

  % 80/20 train/validation split; floor keeps the index integral,
  % and sp + 1 keeps the two sets disjoint
  sp = floor(m * 0.8);
  X_train = X(1:sp, :);
  y_train = y(1:sp);
  X_cv = X(sp + 1:end, :);
  y_cv = y(sp + 1:end);

  nn_params_1 = randInitializeWeights(input_layer_size, hidden_layer_size);
  nn_params_2 = randInitializeWeights(hidden_layer_size, num_labels);
  nn_params = [nn_params_1(:); nn_params_2(:)]; % Flatten parameters

  total_cost = 0;
  total_train_cost = 0;
  total_validation_cost = 0;
  for epoch = 1:EPOCHS
    batch_cost = 0;
    batch_train_cost = 0;
    batch_validation_cost = 0;
    batch_iter = 1;
    batch_start = 1;
    [X_train, y_train] = shuffle(X_train, y_train);
    while true
      % batch_size examples per batch; the - 1 keeps consecutive batches disjoint
      batch_end = batch_start + batch_size - 1;
      if batch_end > sp
        X_batch = X_train(batch_start:end, :);
        y_batch = y_train(batch_start:end);
      else
        X_batch = X_train(batch_start:batch_end, :);
        y_batch = y_train(batch_start:batch_end);
      endif
      if size(X_batch, 1) == 0
        break;
      endif
      [nn_params, cost, train_cost, validation_cost] = fit_step(X_batch, y_batch, X_cv, y_cv, EPOCHS, epoch, batch_iter, batch_start, batch_end, batch_size, maxIter, input_layer_size, hidden_layer_size, num_labels, nn_params, lambda, @checkGradients);
      batch_cost = batch_cost + cost;
      batch_train_cost = batch_train_cost + train_cost;
      batch_validation_cost = batch_validation_cost + validation_cost;
      batch_start = batch_start + batch_size;
      batch_iter = batch_iter + 1;
      if batch_start > sp
        break;
      endif
    endwhile
    total_steps = max(1, batch_iter - 1);
    batch_cost = (batch_cost / total_steps);
    batch_train_cost = (batch_train_cost / total_steps);
    batch_validation_cost = (batch_validation_cost / total_steps);
    printf('\n#Mean [cost: %4.6f train cost: %4.6f validation cost: %4.6f]\n\n', batch_cost, batch_train_cost, batch_validation_cost);
    total_cost = total_cost + batch_cost;
    total_train_cost = total_train_cost + batch_train_cost;
    total_validation_cost = total_validation_cost + batch_validation_cost;
  endfor
  printf('\n\n##Total: cost: %4.6f | train cost: %4.6f | validation cost: %4.6f\n\n', total_cost / EPOCHS, total_train_cost / EPOCHS, total_validation_cost / EPOCHS);
endfunction

main();
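% Usage: run the script from an Octave prompt with minist.mat in the working
% directory (the script filename here is only an example, not part of the gist):
%   octave softmax_regression.m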