Dimensionality Reduction using Shallow Autoencoders
This script shows how to perform dimensionality reduction using a shallow autoencoder in R. The R packages neuralnet and mlbench are used to fit an autoencoder (a simple neural network) on a synthetic data set. Subsequent analyses include visualizing the latent space, computing reconstruction errors, anomaly detection, clustering in the latent space, and comparing the autoencoder to PCA.
####################################################################################
# Machine Learning using R: Dimensionality Reduction using Shallow Autoencoder     #
####################################################################################

##### Autoencoder Key Points
## Neural network that compresses data (encoder) then reconstructs it (decoder)
## Input = output - learns to copy the input through a smaller bottleneck (latent space)
## Unsupervised - no labels needed; it just minimizes reconstruction error (MSE)
## 3 main uses:
##   Dimensionality reduction (usually < 50D)
##   Anomaly detection (high reconstruction error = outlier)
##   Feature extraction (latent space as features for other models)
## Architecture: Input → Hidden (bottleneck) → Output (same dimension as input)
## Shallow vs Deep: shallow works well for tabular/small data; deep is needed for images/big data
library(neuralnet)
library(mlbench)

set.seed(123)
n_samples <- 500

# Create and scale data; keep the class labels from the SAME draw so that
# later plots color the points that were actually generated here
sim <- mlbench.2dnormals(n_samples, cl = 2, sd = 0.5)
data_raw <- as.data.frame(sim$x)
colnames(data_raw) <- c("X1", "X2")
classes <- sim$classes
data_raw$X3 <- 0.8 * data_raw$X1 + 0.2 * rnorm(n_samples)
data_raw$X4 <- 0.7 * data_raw$X2 + 0.3 * rnorm(n_samples)
data_scaled <- as.data.frame(scale(data_raw))
# TRUE AUTOENCODER: reconstruct ALL inputs from ALL inputs
# Formula: X1+X2+X3+X4 ~ X1+X2+X3+X4 (input = target)
formula_str <- paste(names(data_scaled), collapse = " + ")

nn_autoencoder <- neuralnet(
  formula = as.formula(paste(formula_str, "~", formula_str)),
  data = data_scaled,
  hidden = c(2),          # Bottleneck layer (2D latent space)
  linear.output = TRUE,   # Linear output units for real-valued reconstruction
  stepmax = 200000        # Generous step budget for convergence
)
# Note: a learningrate argument was removed here; neuralnet's default
# algorithm ("rprop+") ignores it (it only applies to algorithm = "backprop").
plot(nn_autoencoder)
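### Inspect the fit (a minimal sketch; result.matrix and weights are standard
### neuralnet fields): final SSE, step count, and the encoder/decoder matrices.
nn_autoencoder$result.matrix[c("error", "steps"), ]
str(nn_autoencoder$weights)  # weights[[1]][[1]] = encoder, weights[[1]][[2]] = decoder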
# Reconstruction error (MSE across ALL features, per observation)
predictions <- predict(nn_autoencoder, data_scaled)
recon_error <- rowMeans((data_scaled - predictions)^2)
summary(recon_error)
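### The error distribution is worth a look before picking an anomaly threshold
### below (a minimal sketch using base R graphics, like the rest of the script).
hist(recon_error, breaks = 30,
     main = "Per-observation Reconstruction Error",
     xlab = "MSE per row")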
# Visualize clusters in original space (classes were saved from the same draw above)
plot(data_raw$X1, data_raw$X2, col = classes, pch = 19,
     main = paste("Autoencoder: Original Data (mean MSE =",
                  round(mean(recon_error), 4), ")"))
### Latent space visualization
# Extract HIDDEN layer activations (CRITICAL STEP).
# compute() returns $net.result (the OUTPUT layer, i.e. the reconstruction) and
# $neurons, a list of per-layer activations; neurons[[2]] is the hidden layer,
# whose first column is the bias term, so drop it.
hidden_result <- compute(nn_autoencoder, data_scaled)
latent_2d <- hidden_result$neurons[[2]][, -1]  # 2 hidden neurons = 2D latent space

# Plot latent space with cluster colors
colors_vec <- c("blue", "red")[classes]
plot(latent_2d, col = colors_vec, pch = 19,
     main = "Autoencoder LATENT SPACE (2D Bottleneck)",
     xlab = "Hidden Neuron 1", ylab = "Hidden Neuron 2")
### Anomaly Detection
## High reconstruction error = outlier
# Flag the top 5% of errors as anomalies
threshold <- quantile(recon_error, 0.95)
anomalies <- which(recon_error > threshold)
n_anomalies <- length(anomalies)

plot(latent_2d, col = ifelse(1:n_samples %in% anomalies, "black", colors_vec),
     pch = 19, main = "Anomalies (black dots)")
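### Inspect the flagged rows (a minimal sketch): largest reconstruction errors first.
worst <- anomalies[order(recon_error[anomalies], decreasing = TRUE)]
head(cbind(data_scaled[worst, ], recon_error = recon_error[worst]))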
### Clustering on Latent Space
library(cluster)  # note: kmeans() itself is in base R's stats package

kmeans_latent <- kmeans(latent_2d, centers = 2)
plot(latent_2d, col = kmeans_latent$cluster, pch = 19,
     main = "K-means on Latent Space")
table(kmeans_latent$cluster, classes)  # Compare to true clusters
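### Agreement between k-means labels and the true classes (a minimal sketch).
### k-means labels are arbitrary (1/2 may be swapped), so take the better of the
### two possible matchings from the confusion table.
conf_tab <- table(kmeans_latent$cluster, classes)
accuracy <- max(sum(diag(conf_tab)), conf_tab[1, 2] + conf_tab[2, 1]) / n_samples
cat("Latent-space k-means accuracy:", round(accuracy, 3), "\n")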
### Reconstruction quality by cluster
mean_error_cluster1 <- mean(recon_error[classes == 1])
mean_error_cluster2 <- mean(recon_error[classes == 2])
cat("Cluster 1 MSE:", round(mean_error_cluster1, 4), "\n")
cat("Cluster 2 MSE:", round(mean_error_cluster2, 4), "\n")
### Note: Rule of Thumb for Reconstruction MSE Quality (applied in code right after this list)
## 0-0.01     Outstanding
## 0.01-0.03  Excellent
## 0.03-0.10  Good
## 0.10-0.50  Fair
## >0.50      Poor
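### Applying the rule-of-thumb bands above to this model's mean MSE
### (a minimal sketch; the bands are the script's own heuristic, not a standard).
quality_band <- cut(mean(recon_error),
                    breaks = c(0, 0.01, 0.03, 0.10, 0.50, Inf),
                    labels = c("Outstanding", "Excellent", "Good", "Fair", "Poor"))
cat("Mean MSE:", round(mean(recon_error), 4), "->", as.character(quality_band), "\n")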
### Compare to PCA
# Apples-to-apples: reconstruct from the first 2 principal components
# (the same 2D bottleneck the autoencoder uses), then compute per-row MSE.
pca_result <- prcomp(data_scaled)
pca_recon <- pca_result$x[, 1:2] %*% t(pca_result$rotation[, 1:2])
pca_recon <- sweep(pca_recon, 2, pca_result$center, "+")  # undo prcomp's centering
pca_mse <- mean(rowMeans((data_scaled - pca_recon)^2))

cat("Autoencoder MSE:", round(mean(recon_error), 4), "\n")
cat("PCA MSE:", round(pca_mse, 4), "\n")
cat("Relative MSE difference (autoencoder vs PCA):",
    round(100 * (1 - mean(recon_error) / pca_mse), 1), "%\n")
#### Summary Table
results_summary <- data.frame(
  Method = c("Autoencoder", "PCA"),
  MSE = c(round(mean(recon_error), 4), round(pca_mse, 4)),
  Outliers = c(n_anomalies, NA),
  MSE_vs_PCA = c(paste0(round(100 * (1 - mean(recon_error) / pca_mse), 1), "%"), "Baseline")
)
print(results_summary)
The URL to the original video is: https://www.youtube.com/watch?v=wQRO9l71Gc0