Created October 3, 2012 17:43
function [X_norm, mu, sigma] = featureNormalize(X)
%FEATURENORMALIZE Normalizes the features in X
%   FEATURENORMALIZE(X) returns a normalized version of X where
%   the mean value of each feature is 0 and the standard deviation
%   is 1. This is often a good preprocessing step to do when
%   working with learning algorithms.
mu = mean(X);
X_norm = bsxfun(@minus, X, mu);
sigma = std(X_norm);
X_norm = bsxfun(@rdivide, X_norm, sigma);
% ============================================================
end
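A minimal usage sketch (illustrative only; the matrix X below is a made-up example with one observation per row, not part of the exercise data):
% X = [1 200; 2 300; 3 400];
% [X_norm, mu, sigma] = featureNormalize(X);
% mean(X_norm)   % every column of X_norm now has mean 0
% std(X_norm)    % ... and standard deviation 1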
% Data analysis and knowledge discovery ex1
% Load the dataset
dataset = dlmread("iris_matrixed.dat", " ");
features = {'Sepal length'; 'Sepal width'; 'Petal length'; 'Petal width'; 'Species'}; % cell array, since the names have different lengths
% Public: Sturges' formula for bin count computation
% n - the cardinality of the sample
%
% returns the number of bins
function k = sturges(n)
  k = ceil(log2(n + 1));
end
% Public: Scott's normal reference rule for bin width computation
% n    - the cardinality of the sample
% vect - the sample vector (needed for its standard deviation)
%
% returns the bin width
function h = scotts(n, vect)
  h = 3.5 * std(vect) * n^(-1/3);
end
% Public: Square-root choice formula for bin count computation
% n - the cardinality of the sample
%
% returns the number of bins
function k = square_choice(n)
  k = ceil(sqrt(n));
end
% Public: Freedman–Diaconis' choice formula for bin width computation
% n    - the cardinality of the sample
% vect - the sample vector (needed for its interquartile range)
%
% returns the bin width
function h = freedman_diaconis(n, vect)
  h = 2 * iqr(vect) * n^(-1/3);
end
% Public: L2 risk minimization (cross-validation) choice for the bin count
% n - the cardinality of the sample
%
% returns the number of bins (placeholder: not implemented yet, simply returns n)
function r = l2_risk_min(n)
  r = n;
end
% Convert a bin width h into a bin count over the range of the sample vector vect
function k = number_of_bins(vect, h)
  k = ceil((max(vect) - min(vect)) / h);
end
n = size(dataset, 1);        % the number of observations in the dataset
m_labels = size(dataset, 2); % the number of columns in the dataset, incl. the species label
m = m_labels - 1;            % the number of features in the dataset
disp(sprintf('Size of the dataset: n = %d and m = %d', n, m)); disp("");
disp('Bin counts with various choice strategies'); disp("");
stru = sturges(n)
scot = number_of_bins(dataset(:, 3), scotts(n, dataset(:, 3)))
sqch = square_choice(n)
free = number_of_bins(dataset(:, 3), freedman_diaconis(n, dataset(:, 3)))
% Plot a histogram of feature i of the dataset using h bins, labelled with the method name
function histy(i, h, method, dataset, features)
  pause;
  disp(sprintf('Feature: %s - method: %s', features{i}, method));
  hist(dataset(:, i), h)
  title(sprintf('Feature: %s - method: %s', features{i}, method))
end
%for i = 1:m
%  histy(i, stru, "Sturges", dataset, features)
%  histy(i, scot, "Scott's", dataset, features)
%  histy(i, sqch, "Square-root choice", dataset, features)
%  histy(i, free, "Freedman–Diaconis choice", dataset, features)
%  disp("")
%  pause;
%end
labelless = dataset(:, 1:4); % drop the species label column
%% Task 2 - boxplots
%boxplot ({dataset(:, 1), dataset(:, 2), dataset(:, 3), dataset(:, 4)}, 1)
%set(gca, 'XTickLabel', {'', 'Sepal length', 'Sepal width', 'Petal length', 'Petal width'})
%pause;
%% Task 3 - scatterplot
%plotmatrix(labelless)
%pause;
%% Task 4 - PCA
[normalized_dataset, mu, sigma] = featureNormalize(labelless); % we must normalize the dataset first
[m, n] = size(normalized_dataset);                             % grab the size
Sigma = (normalized_dataset' * normalized_dataset) / m         % the covariance matrix of the normalized dataset
% Perform singular value decomposition of the covariance matrix in order to
% extract its eigenvectors (the principal directions).
[U, S, V] = svd(Sigma);
Z = normalized_dataset * U(:, 1:2); % project the normalized dataset onto the first two principal components
scatter(Z(:, 1), Z(:, 2))
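As a quick sanity check (an illustrative addition, not part of the original exercise): the diagonal of S holds the variances along the principal directions, so the fraction of variance kept by this 2-D projection can be read off directly.
s = diag(S);                               % variances along the principal directions
variance_retained = sum(s(1:2)) / sum(s)   % fraction of total variance kept by the first two components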
%plot(Z)
%% Task 5 - 2D MDS
%% tehee I haven't done this
%% Task 6 - Parallel coords
%plot(normalized_dataset')
% do some more magic with the labeling
%% Task 7 - Spearman's and Kendall's
disp("Spearman's rho");
spearman(labelless)
disp("Kendall's tau");
kendall(labelless)