Softmax regression
disp("Multi-class classification with softmax regression, from scratch...");
function [h, display_array] = displayData(X, example_width)
  %DISPLAYDATA Display 2D data in a nice grid
  %   [h, display_array] = DISPLAYDATA(X, example_width) displays 2D data
  %   stored in X in a nice grid. It returns the figure handle h and the
  %   displayed array if requested.

  % Set example_width automatically if not passed in
  if ~exist('example_width', 'var') || isempty(example_width)
    example_width = round(sqrt(size(X, 2)));
  end

  % Gray image
  colormap(gray);

  % Compute rows, cols
  [m, n] = size(X);
  example_height = (n / example_width);

  % Compute number of items to display
  display_rows = floor(sqrt(m));
  display_cols = ceil(m / display_rows);

  % Between images padding
  pad = 1;

  % Setup blank display
  display_array = -ones(pad + display_rows * (example_height + pad), ...
                        pad + display_cols * (example_width + pad));

  % Copy each example into a patch on the display array
  curr_ex = 1;
  for j = 1:display_rows
    for i = 1:display_cols
      if curr_ex > m
        break;
      end
      % Normalize each patch by its max absolute value before copying
      max_val = max(abs(X(curr_ex, :)));
      display_array(pad + (j - 1) * (example_height + pad) + (1:example_height), ...
                    pad + (i - 1) * (example_width + pad) + (1:example_width)) = ...
          reshape(X(curr_ex, :), example_height, example_width) / max_val;
      curr_ex = curr_ex + 1;
    end
    if curr_ex > m
      break;
    end
  end

  % Display image, do not show axis
  h = imagesc(display_array, [-1 1]);
  axis image off
  drawnow;
endfunction
function g = sigmoid(z)
  g = 1.0 ./ (1.0 + exp(-z));
endfunction
function g = sigmoidGradient(z)
  g = sigmoid(z) .* (1 - sigmoid(z));
endfunction
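% Glorot-style uniform initialization: weights are drawn from
% [-epsilon, epsilon] with epsilon = sqrt(6) / sqrt(L_in + L_out),
% breaking symmetry while keeping the activation variance roughly constant.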
function W = randInitializeWeights(L_in, L_out)
  epsilon_init = sqrt(6) / sqrt(L_in + L_out);
  W = rand(L_out, 1 + L_in) * 2 * epsilon_init - epsilon_init;
endfunction
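% Row-wise softmax over the output logits:
%   g_k = exp(z_k) / sum_j exp(z_j)
% Each row of g is a probability distribution over the num_labels classes.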
function g = softmax(z)
  % Shift each row by its max before exponentiating; softmax is invariant
  % under this shift, and it prevents overflow for large logits.
  z = z - max(z, [], 2);
  g = exp(z) ./ sum(exp(z), 2);
endfunction
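% Cost and gradient for a one-hidden-layer network with a softmax output.
% Forward pass: a1 = [1 X], z2 = a1*Theta1', a2 = [1 sigmoid(z2)],
% z3 = a2*Theta2', a3 = softmax(z3).
% Cross-entropy cost: J = -(1/m) * sum_i sum_k y_ik * log(a3_ik) + L2 penalty.
% With softmax plus cross-entropy the output delta simplifies to delta3 = a3 - y.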
function [J, grad] = nnCostFunction(nn_params, ...
                                    input_layer_size, ...
                                    hidden_layer_size, ...
                                    num_labels, ...
                                    X, y, lambda)
  % Unroll the flat parameter vector back into the two weight matrices
  Theta1 = reshape(nn_params(1:hidden_layer_size * (input_layer_size + 1)), ...
                   hidden_layer_size, (input_layer_size + 1));
  Theta2 = reshape(nn_params((1 + (hidden_layer_size * (input_layer_size + 1))):end), ...
                   num_labels, (hidden_layer_size + 1));
  m = size(X, 1);

  % Add the bias column and one-hot encode the labels
  X = [ones(m, 1) X];
  y = (y == 1:num_labels);

  % Forward propagation
  Z_2 = X * Theta1';
  L_2 = sigmoid(Z_2);
  L_2 = [ones(m, 1) L_2];
  Z_3 = L_2 * Theta2';
  L_3 = softmax(Z_3);

  % Cross-entropy cost
  J = (1 / m) * sum(sum(-y .* log(L_3), 2), 1);

  % Backpropagation (softmax + cross-entropy gives delta3 = L_3 - y)
  Loss_3 = L_3 - y;
  Loss_2 = Loss_3 * Theta2(:, 2:end) .* sigmoidGradient(Z_2);
  Theta2_grad = Loss_3' * L_2;
  Theta1_grad = Loss_2' * X;

  % Regularization: bias columns are excluded from the penalty
  Theta2_zero_bias = Theta2;
  Theta2_zero_bias(:, 1) = 0;
  Theta1_zero_bias = Theta1;
  Theta1_zero_bias(:, 1) = 0;
  J_reg = sum(sum(Theta1_zero_bias .^ 2)) + sum(sum(Theta2_zero_bias .^ 2));
  J = J + (lambda / (2 * m)) * J_reg;
  Theta2_grad = (1 / m) * (Theta2_grad + lambda * Theta2_zero_bias);
  Theta1_grad = (1 / m) * (Theta1_grad + lambda * Theta1_zero_bias);

  % Flatten gradients
  grad = [Theta1_grad(:); Theta2_grad(:)];
endfunction
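% Numerical gradient via central differences:
%   dJ/dtheta_i ~= (J(theta + e*u_i) - J(theta - e*u_i)) / (2e)
% Vectorized: each column of the repmat'ed parameter matrix is perturbed
% along one coordinate, so a single arrayfun sweep evaluates all the costs.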
function numericalGrad = computeNumericalGradient(J, nn_params)
  m = size(nn_params, 1);
  mask = eye(m, m);
  e = 1e-4;
  nn_params = repmat(nn_params, 1, m);
  deviation = mask .* e;
  iter = 1:m;
  diff1 = nn_params - deviation;
  loss1 = arrayfun(@(i) J(diff1(:, i)), iter, 'UniformOutput', false); % https://stackoverflow.com/a/10898162/1835650
  loss1 = [loss1{:}] .* mask;
  diff2 = nn_params + deviation;
  loss2 = arrayfun(@(i) J(diff2(:, i)), iter, 'UniformOutput', false);
  loss2 = [loss2{:}] .* mask;
  numericalGrad = sum((loss2 - loss1), 2);
  numericalGrad = numericalGrad ./ (2 * e);
endfunction
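% Compare analytic and numerical gradients with the relative difference
% norm(g_num - g) / norm(g_num + g); values well below 1e-9 indicate
% that backpropagation is implemented correctly.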
function checkGradients(nn_params, costFunction)
  [cost, grad] = costFunction(nn_params);
  numericalGrad = computeNumericalGradient(costFunction, nn_params);
  diff = norm(numericalGrad - grad) / norm(numericalGrad + grad);
  checkSign = "𐄂";
  if (diff < 1e-9)
    checkSign = "✓";
  endif
  printf('\nGradient check relative difference (expect < 1e-9): %g (%s)\n', diff, checkSign);
endfunction
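% Minimize the cost with fminunc, supplying the analytic gradient
% ('GradObj', 'on') so the optimizer does not fall back to finite differences.
% The gradient check only runs during the first epoch to keep training fast.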
function [nn_params, cost, info, output, grad, hess] = fit_nn(X, y, epoch, maxIter, input_layer_size, hidden_layer_size, num_labels, nn_params, lambda, checkGradientsFunc)
  options = optimset('GradObj', 'on', 'MaxIter', maxIter, 'Display', 'iter');
  costFunction = @(p) nnCostFunction(p, input_layer_size, hidden_layer_size, num_labels, X, y, lambda);
  if exist('checkGradientsFunc', 'var') && epoch < 2
    checkGradientsFunc(nn_params, costFunction);
  endif
  [nn_params, cost, info, output, grad, hess] = fminunc(costFunction, nn_params, options);
endfunction
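% One training step: optimize on the current mini-batch, then report the
% unregularized (lambda = 0) cost on that batch and on the validation set.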
function [nn_params, cost, train_cost, validation_cost] = fit_step(X, y, X_cv, y_cv, EPOCHS, epoch, batch_iter, batch_start, batch_end, batch_size, maxIter, input_layer_size, hidden_layer_size, num_labels, nn_params, lambda, checkGradientsFunc)
  [nn_params, cost] = fit_nn(X, y, epoch, maxIter, input_layer_size, hidden_layer_size, num_labels, nn_params, lambda, checkGradientsFunc);
  % Evaluate the cost at the updated parameters directly; re-running the
  % optimizer here would report the cost of a further-trained model instead.
  train_cost = nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, 0);
  validation_cost = nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X_cv, y_cv, 0);
  printf('Epoch: %6i/%i | batch: %6i | start: %6i | end: %6i | size: %6i | cost: %4.6f | train cost: %4.6f | validation cost: %4.6f\n', epoch, EPOCHS, batch_iter, batch_start, batch_end, batch_size, cost, train_cost, validation_cost);
endfunction
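% Reshuffle the training set each epoch so mini-batches are drawn in a
% different order on every pass over the data.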
function [X_shuffled, y_shuffled] = shuffle(X, y)
  shuffle_order = randperm(size(X, 1));
  X_shuffled = X(shuffle_order, :);
  y_shuffled = y(shuffle_order);
endfunction
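% Entry point. Assumes minist.mat provides X (m x 400, rows of 20x20 digit
% images) and y (labels 1..10, with "0" mapped to 10), matching the comments
% on input_layer_size and num_labels below.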
function main()
  load('minist.mat');
  m = size(X, 1);

  % Randomly select 100 data points to display
  sel = randperm(size(X, 1));
  sel = sel(1:100);
  displayData(X(sel, :));

  input_layer_size = 400;  % 20x20 input images of digits
  hidden_layer_size = 25;  % 25 hidden units
  num_labels = 10;         % 10 labels, from 1 to 10 (note that we have mapped "0" to label 10)
  lambda = 1;
  maxIter = 50;
  batch_size = 1024;
  EPOCHS = 32;

  % 80/20 train/validation split; floor keeps the index integral,
  % and sp + 1 keeps the two sets disjoint
  sp = floor(m * 0.8);
  X_train = X(1:sp, :);
  y_train = y(1:sp);
  X_cv = X(sp + 1:end, :);
  y_cv = y(sp + 1:end);

  nn_params_1 = randInitializeWeights(input_layer_size, hidden_layer_size);
  nn_params_2 = randInitializeWeights(hidden_layer_size, num_labels);
  nn_params = [nn_params_1(:); nn_params_2(:)]; % Flatten parameters

  total_cost = 0;
  total_train_cost = 0;
  total_validation_cost = 0;
  for epoch = 1:EPOCHS
    batch_cost = 0;
    batch_train_cost = 0;
    batch_validation_cost = 0;
    batch_iter = 1;
    batch_start = 1;
    [X_train, y_train] = shuffle(X_train, y_train);
    while true
      % batch_size examples per batch; the - 1 keeps consecutive batches disjoint
      batch_end = batch_start + batch_size - 1;
      if batch_end > sp
        X_batch = X_train(batch_start:end, :);
        y_batch = y_train(batch_start:end);
      else
        X_batch = X_train(batch_start:batch_end, :);
        y_batch = y_train(batch_start:batch_end);
      endif
      if size(X_batch, 1) == 0
        break;
      endif
      [nn_params, cost, train_cost, validation_cost] = fit_step(X_batch, y_batch, X_cv, y_cv, EPOCHS, epoch, batch_iter, batch_start, batch_end, batch_size, maxIter, input_layer_size, hidden_layer_size, num_labels, nn_params, lambda, @checkGradients);
      batch_cost = batch_cost + cost;
      batch_train_cost = batch_train_cost + train_cost;
      batch_validation_cost = batch_validation_cost + validation_cost;
      batch_start = batch_start + batch_size;
      batch_iter = batch_iter + 1;
      if batch_start > sp
        break;
      endif
    endwhile
    total_steps = max(1, batch_iter - 1);
    batch_cost = (batch_cost / total_steps);
    batch_train_cost = (batch_train_cost / total_steps);
    batch_validation_cost = (batch_validation_cost / total_steps);
    printf('\n#Mean [cost: %4.6f train cost: %4.6f validation cost: %4.6f]\n\n', batch_cost, batch_train_cost, batch_validation_cost);
    total_cost = total_cost + batch_cost;
    total_train_cost = total_train_cost + batch_train_cost;
    total_validation_cost = total_validation_cost + batch_validation_cost;
  endfor
  printf('\n\n##Total: cost: %4.6f | train cost: %4.6f | validation cost: %4.6f\n\n', total_cost / EPOCHS, total_train_cost / EPOCHS, total_validation_cost / EPOCHS);
endfunction

main();
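% Usage: run the script from an Octave prompt with minist.mat in the working
% directory (the script filename here is only an example, not part of the gist):
%   octave softmax_regression.m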