Softmax regression
disp("multi-class classification with softmax regression from scratch......");
function [h, display_array] = displayData(X, example_width)
%DISPLAYDATA Display 2D data in a nice grid
%   [h, display_array] = DISPLAYDATA(X, example_width) displays 2D data
%   stored in X in a nice grid. It returns the figure handle h and the
%   displayed array if requested.

  % Set example_width automatically if not passed in
  if ~exist('example_width', 'var') || isempty(example_width)
    example_width = round(sqrt(size(X, 2)));
  end

  % Gray image
  colormap(gray);

  % Compute rows, cols
  [m, n] = size(X);
  example_height = (n / example_width);

  % Compute number of items to display
  display_rows = floor(sqrt(m));
  display_cols = ceil(m / display_rows);

  % Between-images padding
  pad = 1;

  % Set up blank display
  display_array = -ones(pad + display_rows * (example_height + pad), ...
                        pad + display_cols * (example_width + pad));

  % Copy each example into a patch on the display array
  curr_ex = 1;
  for j = 1:display_rows
    for i = 1:display_cols
      if curr_ex > m
        break;
      end
      % Copy the patch, normalized by the example's max absolute value
      max_val = max(abs(X(curr_ex, :)));
      display_array(pad + (j - 1) * (example_height + pad) + (1:example_height), ...
                    pad + (i - 1) * (example_width + pad) + (1:example_width)) = ...
        reshape(X(curr_ex, :), example_height, example_width) / max_val;
      curr_ex = curr_ex + 1;
    end
    if curr_ex > m
      break;
    end
  end

  % Display image, do not show axis
  h = imagesc(display_array, [-1 1]);
  axis image off
  drawnow;
endfunction
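
% Example: for a 100x400 input (100 flattened 20x20 digits), example_width
% defaults to round(sqrt(400)) = 20, display_rows = floor(sqrt(100)) = 10 and
% display_cols = 10, so the digits land on a 10x10 grid with 1-pixel padding.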
function g = sigmoid(z)
  % Element-wise logistic function: g(z) = 1 ./ (1 + exp(-z))
  g = 1.0 ./ (1.0 + exp(-z));
endfunction
function g = sigmoidGradient(z)
  % Derivative of the sigmoid: g'(z) = sigmoid(z) .* (1 - sigmoid(z))
  g = sigmoid(z) .* (1 - sigmoid(z));
endfunction
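
% sigmoidGradient peaks at z = 0: sigmoid(0) = 0.5, so the derivative there is
% 0.5 * (1 - 0.5) = 0.25, and it vanishes for large |z| (saturated units learn
% slowly). A minimal sanity check against a central difference; this helper is
% an illustration only and is not called anywhere below:
function test_sigmoidGradient()
  e = 1e-6;
  z = [-2 0 3];
  numeric = (sigmoid(z + e) - sigmoid(z - e)) / (2 * e);
  assert(max(abs(numeric - sigmoidGradient(z))) < 1e-8);
endfunction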
function W = randInitializeWeights(L_in, L_out)
  % Glorot/Xavier-style uniform init in [-epsilon, epsilon] with
  % epsilon = sqrt(6) / sqrt(fan_in + fan_out); the extra column is the bias
  epsilon_init = sqrt(6) / sqrt(L_in + L_out);
  W = rand(L_out, 1 + L_in) * 2 * epsilon_init - epsilon_init;
endfunction
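
% Example (the sizes used in main below): randInitializeWeights(400, 25)
% returns a 25x401 matrix, i.e. 25 hidden units with 400 input weights plus
% one bias each, drawn uniformly from [-sqrt(6/425), +sqrt(6/425)]. Random
% initialization breaks symmetry: with identical weights every hidden unit
% would receive the same gradient and the units could never differentiate.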
function g = softmax(z)
  % Row-wise softmax; the row-max shift prevents exp() overflow and does not
  % change the result (softmax is shift-invariant)
  z = z - max(z, [], 2);
  g = exp(z) ./ sum(exp(z), 2);
endfunction
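
% Worked example: softmax([1 2 3]) ≈ [0.0900 0.2447 0.6652], and every row of
% the output sums to 1. A small sanity check, illustration only and not called
% anywhere below:
function test_softmax()
  g = softmax([1 2 3; 10 10 10]);
  assert(max(abs(sum(g, 2) - 1)) < 1e-12);   % rows are probability distributions
  assert(max(abs(g(2, :) - 1/3)) < 1e-12);   % equal logits give a uniform row
endfunction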
function [J, grad] = nnCostFunction(nn_params, ...
                                    input_layer_size, ...
                                    hidden_layer_size, ...
                                    num_labels, ...
                                    X, y, lambda)
  % Unroll the flattened parameter vector back into the two weight matrices
  Theta1 = reshape(nn_params(1:hidden_layer_size * (input_layer_size + 1)), ...
                   hidden_layer_size, (input_layer_size + 1));
  Theta2 = reshape(nn_params((1 + (hidden_layer_size * (input_layer_size + 1))):end), ...
                   num_labels, (hidden_layer_size + 1));
  m = size(X, 1);

  % Forward pass: sigmoid hidden layer, softmax output layer
  X = [ones(m, 1) X];          % prepend bias column
  y = (y == 1:num_labels);     % one-hot encode the labels
  Z_2 = X * Theta1';
  L_2 = sigmoid(Z_2);
  L_2 = [ones(m, 1) L_2];
  Z_3 = L_2 * Theta2';
  L_3 = softmax(Z_3);

  % Cross-entropy cost
  J = (1 / m) * sum(sum(-y .* log(L_3), 2), 1);

  % Backward pass: for softmax + cross-entropy the output error is simply L_3 - y
  Loss_3 = L_3 - y;
  Loss_2 = Loss_3 * Theta2(:, 2:end) .* sigmoidGradient(Z_2);
  Theta2_grad = Loss_3' * L_2;
  Theta1_grad = Loss_2' * X;

  % L2 regularization, with the bias columns excluded
  Theta2_zero_bias = Theta2;
  Theta2_zero_bias(:, 1) = 0;
  Theta1_zero_bias = Theta1;
  Theta1_zero_bias(:, 1) = 0;
  J_reg = sum(sum(Theta1_zero_bias .^ 2)) + sum(sum(Theta2_zero_bias .^ 2));
  J = J + (lambda / (2 * m)) * J_reg;
  Theta2_grad = (1 / m) * (Theta2_grad + lambda * Theta2_zero_bias);
  Theta1_grad = (1 / m) * (Theta1_grad + lambda * Theta1_zero_bias);

  % Flatten gradients
  grad = [Theta1_grad(:); Theta2_grad(:)];
endfunction
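
% Architecture recap: input (400) -> sigmoid hidden layer (25) -> softmax
% output (10). Because the softmax output is paired with cross-entropy, the
% output delta collapses to L_3 - y, which is why no softmax derivative shows
% up in the backward pass. A minimal usage sketch with the sizes from main:
%
%   Theta1 = randInitializeWeights(400, 25);          % 25x401
%   Theta2 = randInitializeWeights(25, 10);           % 10x26
%   params = [Theta1(:); Theta2(:)];
%   [J, grad] = nnCostFunction(params, 400, 25, 10, X, y, 1);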
function numericalGrad = computeNumericalGradient(J, nn_params)
  % Central-difference approximation dJ/dtheta_i ≈ (J(theta + e_i) - J(theta - e_i)) / (2e),
  % vectorized over all parameters via a masked perturbation matrix
  m = size(nn_params, 1);
  mask = eye(m, m);
  e = 1e-4;
  nn_params = repmat(nn_params, 1, m);
  deviation = mask .* e;
  iter = 1:m;
  diff1 = nn_params - deviation;
  loss1 = arrayfun(@(i) J(diff1(:, i)), iter, 'UniformOutput', false); % https://stackoverflow.com/a/10898162/1835650
  loss1 = [loss1{:}] .* mask;
  diff2 = nn_params + deviation;
  loss2 = arrayfun(@(i) J(diff2(:, i)), iter, 'UniformOutput', false);
  loss2 = [loss2{:}] .* mask;
  numericalGrad = sum((loss2 - loss1), 2) ./ (2 * e);
endfunction
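
% Central differences have O(e^2) truncation error versus O(e) for one-sided
% differences, so e = 1e-4 already matches the analytic gradient to roughly
% 8 significant digits. The price is two full cost evaluations per parameter,
% which is why the check below only runs on the first epoch.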
function checkGradients(nn_params, costFunction)
  % Compare the analytic gradient against the numerical one; the relative
  % difference norm(num - grad) / norm(num + grad) should be tiny
  [cost, grad] = costFunction(nn_params);
  numericalGrad = computeNumericalGradient(costFunction, nn_params);
  diff = norm(numericalGrad - grad) / norm(numericalGrad + grad);
  checkSign = "𐄂";
  if (diff < 1e-9)
    checkSign = "✓";
  endif
  printf('\nGradient check relative difference (should be < 1e-9): %g (%s)\n', diff, checkSign);
endfunction
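
% A cheap way to exercise the check is a tiny network on a handful of
% synthetic examples instead of the full model. Hypothetical sketch (sizes
% and data are made up for illustration; not called anywhere below):
function checkGradientsOnTinyNet()
  X = rand(5, 3);                 % 5 examples, 3 features
  y = [1; 2; 3; 1; 2];            % labels in 1..3
  t1 = randInitializeWeights(3, 4);
  t2 = randInitializeWeights(4, 3);
  checkGradients([t1(:); t2(:)], ...
                 @(p) nnCostFunction(p, 3, 4, 3, X, y, 0));
endfunction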
function [nn_params, cost, info, output, grad, hess] = fit_nn(X, y, epoch, maxIter, input_layer_size, hidden_layer_size, num_labels, nn_params, lambda, checkGradientsFunc)
  options = optimset('GradObj', 'on', 'MaxIter', maxIter, 'Display', 'iter');
  costFunction = @(p) nnCostFunction(p, input_layer_size, hidden_layer_size, num_labels, X, y, lambda);
  % Run the (slow) gradient check only when a check function was passed in,
  % and only during the first epoch
  if exist('checkGradientsFunc', 'var') && epoch < 2
    checkGradientsFunc(nn_params, costFunction);
  endif
  [nn_params, cost, info, output, grad, hess] = fminunc(costFunction, nn_params, options);
endfunction
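
% With 'GradObj' on, fminunc trusts the second return value of the cost
% function as the exact gradient instead of approximating it with finite
% differences on every iteration, which is what makes the hand-derived
% backprop in nnCostFunction pay off.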
function [nn_params, cost, train_cost, validation_cost] = fit_step(X, y, X_cv, y_cv, EPOCHS, epoch, batch_iter, batch_start, batch_end, batch_size, maxIter, input_layer_size, hidden_layer_size, num_labels, nn_params, lambda, checkGradientsFunc)
  % Optimize on the current mini-batch
  [nn_params, cost, ~] = fit_nn(X, y, epoch, maxIter, input_layer_size, hidden_layer_size, num_labels, nn_params, lambda, checkGradientsFunc);
  % Evaluate the updated parameters directly; re-running fminunc here would
  % report the cost of a re-fitted model, not of the model we actually keep
  train_cost = nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, 0);
  validation_cost = nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X_cv, y_cv, 0);
  printf('Epoch: %6i/%i | batch: %6i | start: %6i | end: %6i | size: %6i | cost: %4.6f | train cost: %4.6f | validation cost: %4.6f \n', epoch, EPOCHS, batch_iter, batch_start, batch_end, batch_size, cost, train_cost, validation_cost);
endfunction
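
% Passing lambda = 0 to the evaluation calls keeps the reported train and
% validation numbers pure cross-entropy, so they remain comparable even if
% the regularization strength used for fitting changes.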
function [X_shuffled, y_shuffled] = shuffle(X, y)
  % Shuffle examples and labels with the same random permutation
  shuffle_order = randperm(size(X, 1));
  X_shuffled = X(shuffle_order, :);
  y_shuffled = y(shuffle_order);
endfunction
function main()
  % Dataset: 20x20 grayscale digit images, each flattened to a 400-dim row of
  % X, with labels 1..10 in y (digit "0" is mapped to label 10)
  load('minist.mat');
  m = size(X, 1);

  % Randomly select 100 data points to display
  sel = randperm(m);
  sel = sel(1:100);
  displayData(X(sel, :));
  input_layer_size = 400;  % 20x20 input images of digits
  hidden_layer_size = 25;  % 25 hidden units
  num_labels = 10;         % 10 labels, from 1 to 10 (note that we have mapped "0" to label 10)
  lambda = 1;
  maxIter = 50;
  batch_size = 1024;
  EPOCHS = 32;

  % 80/20 train/validation split; the validation rows must not overlap the
  % training rows
  sp = floor(m * 0.8);
  X_train = X(1:sp, :);
  y_train = y(1:sp);
  X_cv = X(sp + 1:end, :);
  y_cv = y(sp + 1:end);
  % Initialize and flatten the parameters of both layers
  nn_params_1 = randInitializeWeights(input_layer_size, hidden_layer_size);
  nn_params_2 = randInitializeWeights(hidden_layer_size, num_labels);
  nn_params = [nn_params_1(:); nn_params_2(:)];

  total_cost = 0;
  total_train_cost = 0;
  total_validation_cost = 0;
  for epoch = 1:EPOCHS
    batch_cost = 0;
    batch_train_cost = 0;
    batch_validation_cost = 0;
    batch_iter = 1;
    batch_start = 1;
    % Reshuffle the training set at the start of every epoch
    [X_train, y_train] = shuffle(X_train, y_train);
    while true
      % Slice the next mini-batch; ranges are inclusive, so the batch ends at
      % batch_start + batch_size - 1, and the last batch of an epoch may be shorter
      batch_end = min(batch_start + batch_size - 1, sp);
      X_batch = X_train(batch_start:batch_end, :);
      y_batch = y_train(batch_start:batch_end);
      if size(X_batch, 1) == 0
        break;
      endif
      [nn_params, cost, train_cost, validation_cost] = fit_step(X_batch, y_batch, X_cv, y_cv, EPOCHS, epoch, batch_iter, batch_start, batch_end, batch_size, maxIter, input_layer_size, hidden_layer_size, num_labels, nn_params, lambda, @checkGradients);
      batch_cost = batch_cost + cost;
      batch_train_cost = batch_train_cost + train_cost;
      batch_validation_cost = batch_validation_cost + validation_cost;
      batch_start = batch_start + batch_size;
      batch_iter = batch_iter + 1;
      if batch_start > sp
        break;
      endif
    endwhile
    % Per-epoch means over the mini-batches
    total_steps = max(1, batch_iter - 1);
    batch_cost = batch_cost / total_steps;
    batch_train_cost = batch_train_cost / total_steps;
    batch_validation_cost = batch_validation_cost / total_steps;
    printf('\n#Mean [cost: %4.6f train cost: %4.6f validation cost: %4.6f] \n\n', batch_cost, batch_train_cost, batch_validation_cost);
    total_cost = total_cost + batch_cost;
    total_train_cost = total_train_cost + batch_train_cost;
    total_validation_cost = total_validation_cost + batch_validation_cost;
  endfor
  printf('\n\n##Total: cost: %4.6f | train cost: %4.6f | validation cost: %4.6f \n\n', total_cost / EPOCHS, total_train_cost / EPOCHS, total_validation_cost / EPOCHS);
endfunction

main();