Last active
August 13, 2018 14:22
-
-
Save mick001/fc9dbec00bc9a76708814ba468bef590 to your computer and use it in GitHub Desktop.
PCA on Olivetti faces dataset in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#------------------------------------------------------------------------------- | |
# Perform PCA on the data | |
# Step 1: scale data | |
# Scale as follows: mean equal to 0, stdv equal to 1 | |
D <- scale(D) | |
# Step 2: calculate covariance matrix | |
A <- cov(D) | |
A_ <- t(D) %*% D / (nrow(D)-1) | |
# Note that the two matrices are the same | |
max(A - A_) # Effectively zero | |
rm(A_) | |
# Note: diagonal elements are variances of images, off diagonal are covariances between images | |
identical(var(D[, 1]), A[1,1]) | |
identical(var(D[, 2]), A[2,2]) | |
identical(cov(D[, 1], D[, 2]), A[1,2]) | |
# Step 3: calculate eigenvalues and eigenvectors | |
# Calculate the largest 20 eigenvalues and corresponding eigenvectors | |
eigs <- rARPACK::eigs(A, 20, which = "LM") | |
# Eigenvalues | |
eigenvalues <- eigs$values | |
# Eigenvectors (also called loadings or "rotation" in R prcomp function: i.e. prcomp(A)$rotation) | |
eigenvectors <- eigs$vectors | |
# First 20 eigenvalues dominate | |
plot(1:20, eigenvalues, type="o", log = "y", main="Magnitude of the 20 biggest eigenvalues", xlab="Eigenvalue #", ylab="Magnitude") | |
# Check that the 20 biggest eigenvalues account for 76.5 % of the total variance in the dataset | |
# sum(eigenvalues)/sum(eigen(A)$values) | |
# OUT: [1] 0.7649131 | |
# New variables (the principal components, also called scores, and called x in R prcomp function: i.e. prcomp(A)$x) | |
D_new <- D %*% eigenvectors |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment