Skip to content

Instantly share code, notes, and snippets.

@mick001
Created August 14, 2018 19:11
Show Gist options
  • Save mick001/d5786de2788245c606cc77abac273c50 to your computer and use it in GitHub Desktop.
Save mick001/d5786de2788245c606cc77abac273c50 to your computer and use it in GitHub Desktop.
Support vector machine classifier example in R for Olivetti faces dataset.
#-------------------------------------------------------------------------------
# Setup
require(dplyr)
require(e1071)
load(file="images_formatted.Rdata")
D <- out; rm(out)
# Scale the data. Note that scale returns a matrix
D <- scale(D)
# Reproducibility
set.seed(500)
# Index to be sampled for train/test split
index <- 1:nrow(D)
# Number of cross validation runs
K <- 20
#-------------------------------------------------------------------------------
# SVC model on real data
# Assign variable names
colnames(D) <- paste("pixel_", 1:4096, sep = "")
# Convert images to dataframe and add label information as a factor
D <- D %>% tbl_df() %>% mutate(label = as.factor(y_df))
accuracy_first <- list()
for(i in 1:K)
{
# Split data into a train and test set.
testindex <- sample(index, trunc(length(index)/3))
# Test set. 1/3 of full data.
testset <- D[testindex, ]
# Train set. 2/3 of full data.
trainset <- D[-testindex, ]
# Solution to test set
actual_sol <- testset %>% select(label) %>% pull()
# SVC model on real data: train
svc.model <- svm(label ~., data = trainset, cost = 1, kernel = "linear")
# SVC model on real data: classify test set
svc.pred <- predict(svc.model, testset[, -4097])
# Accuracy of SVC on real data
accuracy_first[[i]] <- sum(svc.pred == actual_sol)/length(actual_sol)
# Print accuracy of current iteration
print(accuracy_first[[i]])
}
# Average accuracy of the model
print(mean(as.numeric(accuracy_first)))
rm(testset, trainset, actual_sol, svc.model, svc.pred, i)
#-------------------------------------------------------------------------------
# SVC model on PCA (30 principal components)
# Run PCA on data
pca_faces <- D %>% select(-label) %>% as.matrix() %>% prcomp()
plot((cumsum(pca_faces$sdev^2)/sum(pca_faces$sdev^2))[1:30], type="o", xlab = "Eigenvalue #", ylab = "% of total variance", main = "Percentage of total variance explained", col = "blue")
plot((pca_faces$sdev^2)[1:30], type="o", xlab = "Eigenvalue #", ylab = "Magnitude", main = "Magnitude of eigenvalues", col = "green")
# % of total variance explained with 30 components
(cumsum(pca_faces$sdev^2)/sum(pca_faces$sdev^2))[30]
accuracy_second <- list()
for(i in 1:K)
{
# Split data into a train and test set
testindex <- sample(index, trunc(length(index)/3))
# Test set from PCA
testset <- as.matrix(D[testindex, -4097]) %*% pca_faces$rotation[, 1:30]
colnames(testset) <- paste("PC_", 1:30, sep="")
# Solutions to test set
actual_sol <- D[testindex, 4097] %>% pull(label)
# Train set from PCA
trainset <- as.matrix(D[-testindex, -4097]) %*% pca_faces$rotation[, 1:30]
trainset <- as.data.frame(trainset)
colnames(trainset) <- paste("PC_", 1:30, sep="")
# Add label information to test set
trainset$label <- D[-testindex, 4097] %>% pull(label)
# SVC model on PCA data
svc.model <- svm(label ~., data = trainset, cost = 1, kernel = "linear")
svc.pred <- predict(svc.model, testset)
# Accuracy of SVM on real data
accuracy_second[[i]] <- sum(svc.pred == actual_sol)/length(actual_sol)
print(accuracy_second[[i]])
}
print(mean(as.numeric(accuracy_second)))
#-------------------------------------------------------------------------------
# Comparison between the two accuracies
results <- data.frame(data_model = as.numeric(accuracy_first),
pca_model = as.numeric(accuracy_second))
results %>% summarise_all(mean)
results %>% summarise_all(sd)
require(reshape2)
require(ggplot2)
ggplot(melt(results), aes(x = variable, y = value, fill = variable)) +
geom_boxplot() +
ggtitle("Comparison between accuracies of the two models") +
ylab("Accuracy") +
xlab("Model")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment