Created
January 28, 2026 02:18
-
-
Save mingjiphd/93429f51631e262a2333ea79e7615839 to your computer and use it in GitHub Desktop.
Machine Learning using R Dimensionality Reduction using UMAP
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| This R script is a step-by-step demonstration of how to implement Uniform Manifold Approximation and Projection (UMAP) in R. Two R packages, umap and uwot, are demonstrated on both numeric data and mixed-type data. | |
| A step by step video demonstration can be found at: https://youtu.be/Cf_pKrsDKi8?si=gHJp2bXPPzVa8M12 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ################################################################################# | |
| # Machine Learning using R Dimensionality Reduction by using UMAP # | |
| ################################################################################# | |
| ############## Brief Overview of UMAP (Uniform Manifold Approximation and Projection) | |
| ### UMAP excels as a dimensionality reduction technique by preserving | |
| ## both local neighborhood structure and global data topology simultaneously. | |
| ### Dual structure preservation: | |
| # Maintains local clusters while capturing broad manifold relationships, | |
| # unlike t-SNE (local-only) or PCA (linear global-only). | |
| ### Computational efficiency: | |
| # Scales to millions of points using approximate nearest neighbors, much faster | |
| # than t-SNE for large datasets. | |
| ### Flexible embedding dimensions: | |
| # Works for 2D visualization or higher-D preprocessing, | |
| # unlike t-SNE's visualization limitation. | |
| ### Topology-aware: Mathematical foundation in Riemannian manifolds and fuzzy simplicial | |
| # sets preserves holes, loops, and connectivity. | |
| ### Tunable parameters: `n_neighbors` balances local/global focus; | |
| # `min_dist` controls embedding tightness for interpretable control. | |
| ### UMAP serves visualization, clustering preprocessing, and feature engineering across | |
| # genomics, NLP, and images. | |
#### Generate a synthetic data set: three Gaussian clusters in p dimensions
set.seed(1222026)
n <- 1000 # observations per cluster
p <- 50   # dimensions (features)

# Cluster 1: standard normal, centered at the origin
cluster1 <- matrix(rnorm(n * p, mean = 0, sd = 1), nrow = n, ncol = p)

# Cluster 2: shifted to mean 5 with a tighter spread (sd = 0.5)
cluster2 <- matrix(rnorm(n * p, mean = 5, sd = 0.5), nrow = n, ncol = p)

# Cluster 3: mean -3, sd 1.5, plus a linear trend (-2 ... 2) across features.
# sweep() adds feature_trend[j] to every entry of column j in one vectorized
# step, which is exactly what adding the trend vector to each row does.
cluster3 <- matrix(rnorm(n * p, mean = -3, sd = 1.5), nrow = n, ncol = p)
feature_trend <- seq(-2, 2, length.out = p)
cluster3 <- sweep(cluster3, MARGIN = 2, STATS = feature_trend, FUN = "+")

# Stack the clusters; labels follow the rbind() order (n rows per cluster)
data <- rbind(cluster1, cluster2, cluster3)
labels <- factor(rep(1:3, each = n))
rownames(data) <- paste0("obs_", seq_len(nrow(data)))
head(data)
##### Load and run UMAP (umap package)
### install.packages(c("umap", "ggplot2"))
library(umap)
library(ggplot2) # must be attached before the ggplot() call below

set.seed(1222026) # seed R's RNG so repeated runs are comparable

# Namespace-qualified so the call still reaches the umap package even if
# another package exporting umap() (e.g. uwot) is attached later.
embedding <- umap::umap(
  data,
  n_neighbors = 15,
  n_components = 2,
  min_dist = 0.1,
  metric = "euclidean"
)

# umap::umap() returns an object of class "umap" (a list), not a bare matrix.
# The embedded coordinates are stored in its $layout component, one row per
# observation and one column per embedding dimension.
str(embedding$layout)

# Build the plotting frame from the layout matrix; indexing the umap object
# itself with [, 1] fails because it is a list.
plot_df <- data.frame(
  X1 = embedding$layout[, 1],
  X2 = embedding$layout[, 2],
  label = labels
)
ggplot(plot_df, aes(X1, X2, color = label)) +
  geom_point(size = 1, alpha = 0.7) +
  theme_minimal() +
  labs(x = "UMAP 1", y = "UMAP 2", title = "UMAP on Synthetic Clusters") +
  scale_color_brewer(type = "qual", palette = "Set1")
#### Run UMAP with the uwot package (cosine metric)
# install.packages("uwot")
# uwot is called through uwot::umap(), so it does not need to be attached.
library(ggplot2)

set.seed(1222026) # for reproducibility

# Center each column (feature) at mean 0 without rescaling its variance.
data_norm <- scale(data, center = TRUE, scale = FALSE)

embedding_cosine <- uwot::umap(
  data_norm,
  metric = "cosine",
  n_neighbors = 15,
  n_components = 2,
  min_dist = 0.1,
  n_threads = 0
)

### Visualize the cosine-distance embedding
# data.frame() names the two unnamed matrix columns X1 and X2.
plot_df_cosine <- data.frame(embedding_cosine, label = labels)
ggplot(plot_df_cosine, aes(x = X1, y = X2, color = label)) +
  geom_point(alpha = 0.7, size = 1) +
  theme_minimal() +
  labs(
    title = "uwot UMAP with Cosine Metric",
    x = "UMAP 1 (Cosine)",
    y = "UMAP 2 (Cosine)"
  ) +
  scale_color_brewer(palette = "Set1", type = "qual")
#### Build a mixed-type data set for uwot
set.seed(1222026)
n <- 300

# Continuous block: 5 standard-normal columns named cont1..cont5
num1 <- matrix(
  rnorm(n * 5, mean = 0, sd = 1),
  ncol = 5,
  dimnames = list(NULL, paste0("cont", 1:5))
)

# Count block: 3 Poisson(5) columns named count1..count3 (different scale)
num2 <- matrix(
  rpois(n * 3, lambda = 5),
  ncol = 3,
  dimnames = list(NULL, paste0("count", 1:3))
)

# Two categorical variables with three levels each
cat1 <- factor(sample(letters[1:3], size = n, replace = TRUE))
cat2 <- factor(sample(c("low", "med", "high"), size = n, replace = TRUE))

# Combine numeric blocks and factors into one data frame and inspect it
mixed_df <- data.frame(num1, num2, cat1 = cat1, cat2 = cat2)
str(mixed_df)
# Embed the mixed-type data frame with uwot, using one metric per column block
set.seed(1222026) # for reproducibility

# Named list mapping each metric to the columns it applies to; each
# "categorical" entry names one factor column.
metric_blocks <- list(
  euclidean = paste0("cont", 1:5),
  manhattan = paste0("count", 1:3),
  categorical = "cat1",
  categorical = "cat2"
)

emb_mixed <- uwot::umap(
  X = mixed_df,
  n_neighbors = 15,
  n_components = 2,
  scale = "Z",
  metric = metric_blocks
)

library(ggplot2)

# Pair the 2-D coordinates with the original factors so either can drive color
plot_df <- data.frame(
  UMAP1 = emb_mixed[, 1],
  UMAP2 = emb_mixed[, 2],
  cat1 = mixed_df$cat1,
  cat2 = mixed_df$cat2
)
ggplot(plot_df, aes(x = UMAP1, y = UMAP2, color = cat1)) +
  geom_point(size = 1.5, alpha = 0.7) +
  theme_minimal() +
  labs(color = "cat1", title = "uwot UMAP on mixed data")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| A step by step video demonstration can be found at: https://youtu.be/Cf_pKrsDKi8?si=gHJp2bXPPzVa8M12 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment