mingjiphd · January 28, 2026 02:18
diff --git a/Readme.txt b/Readme.txt
 This R script is a step-by-step demonstration of how to run Uniform Manifold Approximation and Projection (UMAP) in R.  Two R packages are demonstrated: umap on numeric data, and uwot on both numeric and mixed-type data.   
 A step by step video demonstration can be found at: https://youtu.be/Cf_pKrsDKi8?si=gHJp2bXPPzVa8M12 
diff --git a/R Script b/R Script
 #################################################################################
 #                             Machine Learning using R   Dimensionality Reduction by using UMAP                                           #
 #################################################################################

 ############## Brief Overview of UMAP (Uniform Manifold Approximation and Projection)


 ### UMAP excels as a dimensionality reduction technique by preserving 
 ##  both local neighborhood structure and global data topology simultaneously.  

 ### Dual structure preservation: 
 #      Maintains local clusters while capturing broad manifold relationships, 
 #      unlike t-SNE (local-only) or PCA (linear global-only).  

 ### Computational efficiency: 
 #      Scales to millions of points using approximate nearest neighbors, much faster 
 #      than t-SNE for large datasets.  

 ###  Flexible embedding dimensions: 
 #       Works for 2D visualization or higher-D preprocessing, 
 #       unlike t-SNE's visualization limitation.  


 ###  Topology-aware: Mathematical foundation in Riemannian manifolds and fuzzy simplicial 
 #        sets preserves holes, loops, and connectivity.  

 ###   Tunable parameters: `n_neighbors` balances local/global focus; 
 #                         `min_dist` controls embedding tightness for interpretable control.  

 ###   UMAP serves visualization, clustering preprocessing, and feature engineering across 
 #           genomics, NLP, and images. 


 #### Simulate three Gaussian clusters that share the same feature space

 set.seed(1222026)
 n <- 1000  # rows generated per cluster
 p <- 50    # number of features (columns)

 # Cluster 1: standard normal noise (mean 0, sd 1) in every column
 cluster1 <- matrix(rnorm(n * p, mean = 0, sd = 1), nrow = n, ncol = p)

 # Cluster 2: tighter cloud (sd 0.5) centred at 5
 cluster2 <- matrix(rnorm(n * p, mean = 5, sd = 0.5), nrow = n, ncol = p)

 # Cluster 3: wider cloud (sd 1.5) centred at -3, plus a per-column offset
 # that rises linearly from -2 in the first column to 2 in the last one
 cluster3 <- matrix(rnorm(n * p, mean = -3, sd = 1.5), nrow = n, ncol = p)
 # Matrices are stored column by column, so repeating each offset n times
 # lines every offset up with its own column (same result as a row loop).
 cluster3 <- cluster3 + rep(seq(-2, 2, length.out = p), each = n)

 # Stack the clusters row-wise; labels record the cluster each row came from
 data <- rbind(cluster1, cluster2, cluster3)
 rownames(data) <- paste0("obs_", seq_len(nrow(data)))
 labels <- factor(rep(1:3, each = n))
 head(data)


 #####  Load and run UMAP (the `umap` package)
 ###install.packages("umap")
 ###install.packages("ggplot2")
 library(umap)
 library(ggplot2)  # must be attached before the ggplot() call below

 # Seed R's RNG so the embedding is repeatable between runs.
 # NOTE(review): this assumes umap() derives its internal random_state from
 # R's RNG when none is supplied -- confirm against the umap package docs.
 set.seed(1222026)

 # Extra named arguments override the corresponding umap.defaults settings.
 embedding <- umap(data, n_neighbors = 15, n_components = 2, 
                   min_dist = 0.1, metric = "euclidean")

 # umap::umap() returns an object of class "umap" (a list), NOT a bare matrix.
 # The embedded coordinates are the matrix stored in its `layout` component,
 # so indexing the object itself with [, 1] would fail.
 str(embedding$layout)  # one row per observation, n_components columns

 # Pair the two embedding axes with the known cluster labels for plotting
 plot_df <- data.frame(X1 = embedding$layout[, 1],
                       X2 = embedding$layout[, 2],
                       label = labels)

 ggplot(plot_df, aes(X1, X2, color = label)) +
  geom_point(size = 1, alpha = 0.7) +
  theme_minimal() +
  labs(x = "UMAP 1", y = "UMAP 2", title = "UMAP on Synthetic Clusters") +
  scale_color_brewer(type = "qual", palette = "Set1")

 #### Second implementation: the uwot package, here with a cosine metric
 #install.packages("uwot")
 #library(uwot)   # not attached; uwot::umap() is called through its namespace

 # Subtract each column's mean (no rescaling) before measuring cosine distance
 data_norm <- scale(data, scale = FALSE, center = TRUE)

 set.seed(1222026)  # fix the RNG state right before the embedding is computed

 embedding_cosine <- uwot::umap(
  data_norm,
  metric       = "cosine",
  min_dist     = 0.1,
  n_components = 2,
  n_neighbors  = 15,
  n_threads    = 0
 )


 ### Plot the cosine-metric embedding, coloured by the known cluster label
 library(ggplot2)
 # The embedding matrix has no column names, so data.frame() names them X1, X2
 plot_df_cosine <- data.frame(embedding_cosine, label = labels)
 ggplot(data = plot_df_cosine, mapping = aes(x = X1, y = X2, colour = label)) +
  geom_point(alpha = 0.7, size = 1) +
  theme_minimal() +
  labs(title = "uwot UMAP with Cosine Metric",
       x = "UMAP 1 (Cosine)", y = "UMAP 2 (Cosine)") +
  scale_colour_brewer(palette = "Set1", type = "qual")
   

 #### uwot on mixed-type data: build a small example data frame
 set.seed(1222026)

 n <- 300  # number of rows in the mixed-type example

 # Five standard-normal columns named cont1 ... cont5
 num1 <- matrix(rnorm(n * 5), ncol = 5,
                dimnames = list(NULL, paste0("cont", 1:5)))

 # Three Poisson(lambda = 5) count columns named count1 ... count3
 num2 <- matrix(rpois(n * 3, lambda = 5), ncol = 3,
                dimnames = list(NULL, paste0("count", 1:3)))

 # Two factors, each sampled with replacement from three possible values
 cat1 <- factor(sample(c("a", "b", "c"), size = n, replace = TRUE))
 cat2 <- factor(sample(c("low", "med", "high"), size = n, replace = TRUE))

 # One data frame: 8 numeric columns followed by the 2 factor columns
 mixed_df <- data.frame(num1, num2, cat1 = cat1, cat2 = cat2)
 str(mixed_df)

 

 # Map each group of columns to the distance used for it: euclidean for the
 # continuous block, manhattan for the counts, and one `categorical` entry
 # per factor column.
 mixed_metric <- list(
  euclidean   = paste0("cont", 1:5),
  manhattan   = paste0("count", 1:3),
  categorical = "cat1",
  categorical = "cat2"
 )

 set.seed(1222026)  # repeatable embedding

 # scale = "Z" requests uwot's column standardisation
 # (see the `scale` argument of uwot::umap for the exact definition).
 emb_mixed <- uwot::umap(
  X = mixed_df,
  metric       = mixed_metric,
  scale        = "Z",
  n_components = 2,
  n_neighbors  = 15
 )

 library(ggplot2)

 # Pair the two embedding axes with the original factor columns
 plot_df <- data.frame(
  UMAP1 = emb_mixed[, 1],
  UMAP2 = emb_mixed[, 2],
  cat1  = mixed_df[["cat1"]],
  cat2  = mixed_df[["cat2"]]
 )

 # Scatter plot of the mixed-data embedding, coloured by cat1
 ggplot(data = plot_df, mapping = aes(x = UMAP1, y = UMAP2, colour = cat1)) +
  geom_point(size = 1.5, alpha = 0.7) +
  theme_minimal() +
  labs(colour = "cat1", title = "uwot UMAP on mixed data")
diff --git a/VideoDemo b/VideoDemo
 A step by step video demonstration can be found at: https://youtu.be/Cf_pKrsDKi8?si=gHJp2bXPPzVa8M12
	This R script is a step-by-step demonstration of how to run Uniform Manifold Approximation and Projection (UMAP) in R. Two R packages are demonstrated: umap on numeric data, and uwot on both numeric and mixed-type data.
	A step by step video demonstration can be found at: https://youtu.be/Cf_pKrsDKi8?si=gHJp2bXPPzVa8M12
	#################################################################################
	# Machine Learning using R Dimensionality Reduction by using UMAP #
	#################################################################################

	############## Brief Overview of UMAP (Uniform Manifold Approximation and Projection)


	### UMAP excels as a dimensionality reduction technique by preserving
	## both local neighborhood structure and global data topology simultaneously.

	### Dual structure preservation:
	# Maintains local clusters while capturing broad manifold relationships,
	# unlike t-SNE (local-only) or PCA (linear global-only).

	### Computational efficiency:
	# Scales to millions of points using approximate nearest neighbors, much faster
	# than t-SNE for large datasets.

	### Flexible embedding dimensions:
	# Works for 2D visualization or higher-D preprocessing,
	# unlike t-SNE's visualization limitation.


	### Topology-aware: Mathematical foundation in Riemannian manifolds and fuzzy simplicial
	# sets preserves holes, loops, and connectivity.

	### Tunable parameters: `n_neighbors` balances local/global focus;
	# `min_dist` controls embedding tightness for interpretable control.

	### UMAP serves visualization, clustering preprocessing, and feature engineering across
	# genomics, NLP, and images.


	#### Simulate three Gaussian clusters that share the same feature space

	set.seed(1222026)
	n <- 1000 # rows generated per cluster
	p <- 50 # number of features (columns)

	# Cluster 1: standard normal noise (mean 0, sd 1) in every column
	cluster1 <- matrix(rnorm(n * p, mean = 0, sd = 1), nrow = n, ncol = p)

	# Cluster 2: tighter cloud (sd 0.5) centred at 5
	cluster2 <- matrix(rnorm(n * p, mean = 5, sd = 0.5), nrow = n, ncol = p)

	# Cluster 3: wider cloud (sd 1.5) centred at -3, plus a per-column offset
	# that rises linearly from -2 in the first column to 2 in the last one
	cluster3 <- matrix(rnorm(n * p, mean = -3, sd = 1.5), nrow = n, ncol = p)
	# Matrices are stored column by column, so repeating each offset n times
	# lines every offset up with its own column (same result as a row loop).
	cluster3 <- cluster3 + rep(seq(-2, 2, length.out = p), each = n)

	# Stack the clusters row-wise; labels record the cluster each row came from
	data <- rbind(cluster1, cluster2, cluster3)
	rownames(data) <- paste0("obs_", seq_len(nrow(data)))
	labels <- factor(rep(1:3, each = n))
	head(data)


	##### Load and run UMAP (the `umap` package)
	###install.packages("umap")
	###install.packages("ggplot2")
	library(umap)
	library(ggplot2) # must be attached before the ggplot() call below

	# Seed R's RNG so the embedding is repeatable between runs.
	# NOTE(review): this assumes umap() derives its internal random_state from
	# R's RNG when none is supplied -- confirm against the umap package docs.
	set.seed(1222026)

	# Extra named arguments override the corresponding umap.defaults settings.
	embedding <- umap(data, n_neighbors = 15, n_components = 2,
	  min_dist = 0.1, metric = "euclidean")

	# umap::umap() returns an object of class "umap" (a list), NOT a bare matrix.
	# The embedded coordinates are the matrix stored in its `layout` component,
	# so indexing the object itself with [, 1] would fail.
	str(embedding$layout) # one row per observation, n_components columns

	# Pair the two embedding axes with the known cluster labels for plotting
	plot_df <- data.frame(X1 = embedding$layout[, 1],
	  X2 = embedding$layout[, 2],
	  label = labels)

	ggplot(plot_df, aes(X1, X2, color = label)) +
	  geom_point(size = 1, alpha = 0.7) +
	  theme_minimal() +
	  labs(x = "UMAP 1", y = "UMAP 2", title = "UMAP on Synthetic Clusters") +
	  scale_color_brewer(type = "qual", palette = "Set1")

	#### Second implementation: the uwot package, here with a cosine metric
	#install.packages("uwot")
	#library(uwot) # not attached; uwot::umap() is called through its namespace

	# Subtract each column's mean (no rescaling) before measuring cosine distance
	data_norm <- scale(data, scale = FALSE, center = TRUE)

	set.seed(1222026) # fix the RNG state right before the embedding is computed

	embedding_cosine <- uwot::umap(
	  data_norm,
	  metric = "cosine",
	  min_dist = 0.1,
	  n_components = 2,
	  n_neighbors = 15,
	  n_threads = 0
	)


	### Plot the cosine-metric embedding, coloured by the known cluster label
	library(ggplot2)
	# The embedding matrix has no column names, so data.frame() names them X1, X2
	plot_df_cosine <- data.frame(embedding_cosine, label = labels)
	ggplot(data = plot_df_cosine, mapping = aes(x = X1, y = X2, colour = label)) +
	  geom_point(alpha = 0.7, size = 1) +
	  theme_minimal() +
	  labs(title = "uwot UMAP with Cosine Metric",
	    x = "UMAP 1 (Cosine)", y = "UMAP 2 (Cosine)") +
	  scale_colour_brewer(palette = "Set1", type = "qual")


	#### uwot on mixed-type data: build a small example data frame
	set.seed(1222026)

	n <- 300 # number of rows in the mixed-type example

	# Five standard-normal columns named cont1 ... cont5
	num1 <- matrix(rnorm(n * 5), ncol = 5,
	  dimnames = list(NULL, paste0("cont", 1:5)))

	# Three Poisson(lambda = 5) count columns named count1 ... count3
	num2 <- matrix(rpois(n * 3, lambda = 5), ncol = 3,
	  dimnames = list(NULL, paste0("count", 1:3)))

	# Two factors, each sampled with replacement from three possible values
	cat1 <- factor(sample(c("a", "b", "c"), size = n, replace = TRUE))
	cat2 <- factor(sample(c("low", "med", "high"), size = n, replace = TRUE))

	# One data frame: 8 numeric columns followed by the 2 factor columns
	mixed_df <- data.frame(num1, num2, cat1 = cat1, cat2 = cat2)
	str(mixed_df)



	# Map each group of columns to the distance used for it: euclidean for the
	# continuous block, manhattan for the counts, and one `categorical` entry
	# per factor column.
	mixed_metric <- list(
	  euclidean = paste0("cont", 1:5),
	  manhattan = paste0("count", 1:3),
	  categorical = "cat1",
	  categorical = "cat2"
	)

	set.seed(1222026) # repeatable embedding

	# scale = "Z" requests uwot's column standardisation
	# (see the `scale` argument of uwot::umap for the exact definition).
	emb_mixed <- uwot::umap(
	  X = mixed_df,
	  metric = mixed_metric,
	  scale = "Z",
	  n_components = 2,
	  n_neighbors = 15
	)

	library(ggplot2)

	# Pair the two embedding axes with the original factor columns
	plot_df <- data.frame(
	  UMAP1 = emb_mixed[, 1],
	  UMAP2 = emb_mixed[, 2],
	  cat1 = mixed_df[["cat1"]],
	  cat2 = mixed_df[["cat2"]]
	)

	# Scatter plot of the mixed-data embedding, coloured by cat1
	ggplot(data = plot_df, mapping = aes(x = UMAP1, y = UMAP2, colour = cat1)) +
	  geom_point(size = 1.5, alpha = 0.7) +
	  theme_minimal() +
	  labs(colour = "cat1", title = "uwot UMAP on mixed data")