@mingjiphd
Created January 28, 2026 02:23
Machine Learning using R: How to use the superSOM function for Dimensionality Reduction
This R script is a step-by-step demonstration of how to use the superSOM function in the R package kohonen for self-organizing maps. superSOM handles multilayer data of different types, weighted analysis of individual layers, layer-specific distance functions, and missing data. The script shows how to perform SOM analysis on multilayer data sets with and without missing values, how to evaluate the quality of the map, how to make predictions, how to extract cluster assignments, and how to choose the grid size.
Click here to view a video demo: https://youtu.be/5yIZhGmWf_o?si=JRnrU2eAxwlv6eBP
############################################################################
# Machine Learning using R: How to use the superSOM function               #
############################################################################
### In addition to som(), the R package kohonen provides the supersom() function
### Features of superSOM
### Unified Multi-Layer Architecture:
# Trains a single 2D grid where each node holds representative profiles
# (codebooks) for multiple data layers, such as features and labels.
### Weighted Supervision:
# Uses importance weights to emphasize specific layers (e.g., labels)
# during training, making them more influential in map organization.
### Heterogeneous Distance Rules:
# Allows each layer to use its own distance metric, handling both continuous
# and categorical data naturally.
### Missing Data Resilience:
# Positions observations using only available attributes, ignoring
# missing values while anchoring points based on remaining layers.
### Topological Alignment Across Layers:
# Projects all layers onto a shared 2D surface so clusters capture
# consistent patterns across the dataset.
# Install and load the kohonen package if you haven't already
# install.packages("kohonen")
library(kohonen)
# Set seed for reproducibility
set.seed(1212026)
# 1. Simulate Layer 1: Continuous Measurements (e.g., Physiochemical properties)
# We create 3 distinct clusters of data points
group1 <- matrix(rnorm(100, mean = 2, sd = 0.5), ncol = 5)
group2 <- matrix(rnorm(100, mean = 5, sd = 0.5), ncol = 5)
group3 <- matrix(rnorm(100, mean = 8, sd = 0.5), ncol = 5)
layer1_data <- rbind(group1, group2, group3)
colnames(layer1_data) <- paste0("Var", 1:5)
# 2. Simulate Layer 2: Categorical Labels (The "Supervision" layer)
# We create a factor that roughly corresponds to our groups
labels <- factor(rep(c("Type_A", "Type_B", "Type_C"), each = 20))
# 3. Combine into a list (Required format for superSOM)
sim_data_list <- list(measurements = layer1_data,
                      classes = labels)
### Train the superSOM
# Define the SOM grid (e.g., 5x5 hexagonal grid)
som_grid <- somgrid(xdim = 5, ydim = 5, topo = "hexagonal")
# Train the model
# We give 'measurements' a weight of 1 and 'classes' a weight of 2,
# passed through the 'user.weights' argument
som_model <- supersom(sim_data_list,
                      grid = som_grid,
                      user.weights = c(1, 2),
                      keep.data = TRUE)
summary(som_model)
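# Optional sketch (not part of the original workflow): the layer-specific
# distance rules described above can be made explicit via the 'dist.fcts'
# argument; "euclidean" and "tanimoto" are distance names documented for
# kohonen, and supersom picks similar defaults on its own, so this variant
# should give a very similar map.
som_model_dist <- supersom(sim_data_list,
                           grid = som_grid,
                           user.weights = c(1, 2),
                           dist.fcts = c("euclidean", "tanimoto"),
                           keep.data = TRUE)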
### Visualize results
# Plot 1: Training progress (Mean distance to closest unit)
plot(som_model, type = "changes")
# Plot 2: Codes (The weights of the variables in each node)
# To see the numeric variable weights (Var1, Var2, etc.)
plot(som_model, type = "codes", whatmap = 1, main = "Layer 1: Measurements")
# To see the classification weights (Type_A, Type_B, etc.)
plot(som_model, type = "codes", whatmap = 2, main = "Layer 2: Classes")
# Plot 3: Mapping the original classes
plot(som_model, type = "mapping",
     labels = as.integer(sim_data_list$classes),
     col = as.integer(sim_data_list$classes),
     main = "Cluster Mapping")
#### Prediction
# 1. Create the numeric data as a matrix
# Note: It MUST have the same number of columns as the training data
new_obs <- matrix(c(5.1, 4.9, 5.0, 5.2, 4.8), nrow = 1)
# 2. Assign column names (predict() matches variables by name, so they must match the training data)
colnames(new_obs) <- colnames(sim_data_list$measurements)
# 3. Create the list with THE SAME NAME as the training list
# If your training list used 'measurements', this must use 'measurements'
new_data_list <- list(measurements = new_obs)
# 4. Run prediction
# Note: You do NOT need to include the 'classes' layer in the list for prediction
prediction <- predict(som_model, newdata = new_data_list)
# View result
print(prediction$predictions$classes)
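# Optional sketch: predict() also handles several new observations at once;
# here we build three hypothetical rows near the simulated group means
# (2, 5 and 8) and map them in a single call.
new_batch <- rbind(rep(2, 5), rep(5, 5), rep(8, 5))
colnames(new_batch) <- colnames(sim_data_list$measurements)
batch_prediction <- predict(som_model, newdata = list(measurements = new_batch))
print(batch_prediction$predictions$classes)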
#### Evaluate the Quality of the Map
## Quantification Errors
# Calculate the mean distance of objects to their closest unit
# This is stored in the 'distances' component of the model
quant_error <- mean(som_model$distances)
print(paste("Mean Quantization Error:", round(quant_error, 4)))
# Plot the quality (distance of each point to its node)
plot(som_model, type = "quality", main = "Quantization Error per Node")
#### Note: How to judge the quality of the map from this plot
## Look for relatively even, low errors across the nodes of the map
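# Optional numeric check of the same idea: average the object-to-node
# distances per node, using components already stored in the model
# (unit.classif and distances); low, similar values indicate a good fit.
node_errors <- tapply(som_model$distances, som_model$unit.classif, mean)
summary(node_errors)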
## Neighborhood Distance (U Matrix)
# Plot the U-Matrix
plot(som_model, type = "dist.neighbours", main = "Neighbor Distance (U-Matrix)")
#### Note: How to judge the quality of the map from this plot
## Look for a good mix of light areas (valleys, i.e. cluster interiors)
## and dark areas (walls, i.e. cluster boundaries), with a smooth color progression
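# Optional sketch (approximation, layer 1 only): the neighbour distances
# behind the U-matrix can also be inspected numerically, assuming
# unit.distances() from kohonen to get grid distances between nodes;
# large values mark likely cluster boundaries.
grid_dist <- unit.distances(som_model$grid)
code_dist <- as.matrix(dist(som_model$codes[[1]]))
u_values <- sapply(seq_len(nrow(grid_dist)), function(i) {
  neighbours <- which(grid_dist[i, ] > 0 & grid_dist[i, ] < 1.05)
  mean(code_dist[i, neighbours])
})
summary(u_values)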
## Layer-Specific Quality
# Plot 'counts' to see how many samples fell into each node
plot(som_model, type = "counts", main = "Observations per Node")
#### Note: Ideally the counts (colors) are spread fairly evenly across the nodes
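# Optional sketch: the same information as the 'counts' plot in table form;
# nodes that attracted no observations show up as zeros (empty nodes).
n_nodes <- nrow(som_model$grid$pts)
node_counts <- table(factor(som_model$unit.classif, levels = 1:n_nodes))
print(node_counts)
sum(node_counts == 0)  # number of empty nodes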
#### Extract cluster assignments
# 1. Get the Winning Unit (Node ID) for each observation
node_assignments <- som_model$unit.classif
# 2. Get the X and Y coordinates of those nodes on the grid
# som_model$grid$pts contains the coordinates for every node
node_coords <- som_model$grid$pts[node_assignments, ]
# 3. Combine into a clean data frame
results <- data.frame(
  Original_Label = sim_data_list$classes,
  Node_ID = node_assignments,
  Grid_X = node_coords[, 1],
  Grid_Y = node_coords[, 2]
)
# View the first few rows
head(results)
### Group nodes into 'super clusters'
# Perform hierarchical clustering on the codebook vectors
# We use the first layer (measurements) for this
som_cluster <- cutree(hclust(dist(som_model$codes[[1]])), k = 3)
# Add this "Super-Cluster" assignment to our results
results$Super_Cluster <- som_cluster[node_assignments]
# Visualize the clusters on the SOM map
plot(som_model, type = "mapping",
     bgcol = c("red", "blue", "green")[som_cluster],
     main = "Hierarchical Clusters on SOM")
#### Exporting data
# Export to CSV
write.csv(results, "SOM_Cluster_Results.csv", row.names = FALSE)
message("Results have been exported to SOM_Cluster_Results.csv")
#### Choosing grid size: a rule of thumb (Vesanto et al.): total nodes = 5 * sqrt(N), where N = sample size
get_optimal_som_grid <- function(data, topo = "hexagonal") {
  # 1. Calculate total number of observations
  # For superSOM, we look at the first element of the list
  n_obs <- nrow(as.matrix(data[[1]]))
  # 2. Vesanto's Rule: Total nodes = 5 * sqrt(n_obs)
  total_nodes <- 5 * sqrt(n_obs)
  # 3. Find dimensions (trying to keep it square-ish)
  # We take the square root of total_nodes for x and y
  dim_size <- ceiling(sqrt(total_nodes))
  message(paste("Suggested grid size:", dim_size, "x", dim_size,
                "(Total nodes:", dim_size^2, "for", n_obs, "samples)"))
  # 4. Return the somgrid object
  return(somgrid(xdim = dim_size, ydim = dim_size, topo = topo))
}
# Test with our simulated data (60 observations)
my_grid <- get_optimal_som_grid(sim_data_list)
# Result: 5 * sqrt(60) is about 39 total nodes, so the function suggests a 7x7 grid (49 nodes)
# Test with a larger dataset (e.g., 2000 observations)
large_data_test <- list(matrix(rnorm(10000), ncol=5))
large_grid <- get_optimal_som_grid(large_data_test)
# Result: 5 * sqrt(2000) is about 224 total nodes, so the function suggests a 15x15 grid (225 nodes)
#### Handling missing data
# Create a copy of our simulated measurements
data_with_nas <- sim_data_list$measurements
# Randomly set 10% of the values to NA
set.seed(1212026)
na_indices <- sample(1:length(data_with_nas), size = 0.1 * length(data_with_nas))
data_with_nas[na_indices] <- NA
# Rebuild the list
sim_data_na_list <- list(measurements = data_with_nas,
                         classes = sim_data_list$classes)
# Check if NAs are present
sum(is.na(sim_data_na_list$measurements))
# Train the model as before, but raise 'maxNA.fraction' above its default of 0
# so that rows containing missing values are kept rather than dropped
som_model_na <- supersom(sim_data_na_list,
                         grid = som_grid,
                         user.weights = c(1, 1),
                         maxNA.fraction = 0.9)
#### Note: supersom does not resort to listwise deletion when a value is missing;
## its internal distance scaling and best-matching-unit search simply
## ignore the missing entries.
## However, there is a limit on the amount of missing data permitted,
## controlled by the maxNA.fraction argument (used in the training call above)
# Check if the map still looks reasonable
plot(som_model_na, type = "changes")
# Use the predict function on the data containing NAs
imputed_data <- predict(som_model_na)
# The 'predictions' component contains, for each observation, the codebook
# values of its winning node; at the positions that were NA, these
# codebook values act as imputed values
completed_measurements <- imputed_data$predictions$measurements
# Compare an original NA value with its new imputed value
original_na_pos <- na_indices[1]
print(paste("Imputed value for missing data:", completed_measurements[original_na_pos]))