yabyzq · February 19, 2017 13:23
diff --git a/SOM - Basic.R b/SOM - Basic.R
 library(kohonen)
 library(Hmisc)    # useful for multi histogram
 library(ggplot2)  # ggplot(), geom_histogram(), geom_density()
 library(gridExtra)  # grid.arrange()

 # Load the raw client extract. The location lives in one variable so it only
 # has to be edited in a single place when the script runs on another machine.
 client_path <- "C:/Users/eye1/Desktop/Customer Data.csv"
 client <- read.csv(file = client_path)

 # Feature selection: the four columns used for training.
 # (alternative demo input: data_train <- iris[c(1,2,3,4)])
 feature_cols <- c(
   "NO_OF_PRODUCTS_6", "Balance", "LOGIN_DAYS_L3M", "ONLINE_TRANSACTION_L3M"
 )
 data_train <- client[, feature_cols]

 # Explore the data
 hist.data.frame(data_train)  # histogram of each variable; see also pairs(data_train)
 # Refer to the column by bare name inside aes(): `data_train$...` bypasses the
 # data frame handed to ggplot() and breaks as soon as the data is replaced.
 ggplot(data_train, aes(x = ONLINE_TRANSACTION_L3M)) +
   geom_histogram() +
   scale_x_continuous(limits = c(0, 100))

 # Clean data
 # Missing values are treated as zero for every selected column.
 data_train[is.na(data_train)] <- 0
 # Floor the balance at 1 before taking the log so log() is always defined
 # (values below 1, including the zeros introduced above, map to log(1) = 0).
 data_train$log_balance <- log(pmax(data_train$Balance, 1))
 data_train$Balance <- NULL
 # Cap the transaction count at 30.
 data_train$ONLINE_TRANSACTION_L3M <- pmin(data_train$ONLINE_TRANSACTION_L3M, 30)
 # data_train <- sapply(data_train, function(x) scale(x, center = T, scale = T))






 # Demo input: the four numeric iris measurements, with the whole table
 # repeated 20 times. This replaces whatever data_train held before.
 data_train <- iris[, 1:4]
 data_train <- data_train[rep.int(seq_len(nrow(data_train)), 20L), ]

 # Keep a copy of the exact training table on disk.
 write.csv(data_train, file = "abc.csv")

 # The SOM is trained on a numeric matrix. Every column is centred and scaled
 # so that all variables carry equal weight during training.
 data_train_matrix <- as.matrix(scale(data_train, center = TRUE, scale = TRUE))

 # Train the self-organising map on the scaled matrix.
 # The grid has to be sized before training: here 20 x 20 units laid out
 # hexagonally, with a circular neighbourhood. keep.data = TRUE is passed so
 # the training data stays attached to the model for the later plots.
 # Iteration count and learning rate are left at the package defaults.
 som_model <- som(
   data_train_matrix,
   grid = somgrid(xdim = 20, ydim = 20, topo = "hexagonal"),
   keep.data = TRUE,
   n.hood = "circular"
 )

 # Standard diagnostic plots, drawn one after another:
 #   "changes"         - training progress, to check that the distance drops
 #   "count"           - number of points per cell (grey cells are empty)
 #   "dist.neighbours" - distance of each cell to its neighbouring cells
 #   "codes"           - the codebook vectors themselves
 for (diagnostic in c("changes", "count", "dist.neighbours", "codes")) {
   plot(som_model, type = diagnostic)
 }

 #Show property mapped
 # Colour palette running from blue (low) to red (high).
 #
 # n:     number of colours to generate.
 # alpha: opacity passed on to rainbow(); 1 means fully opaque.
 # Returns a character vector of n colours. rev() is used instead of `[n:1]`
 # so that n = 0 gives an empty vector rather than NA.
 coolBlueHotRed <- function(n, alpha = 1) {
   rev(rainbow(n, end = 4 / 6, alpha = alpha))
 }
 # One heat map per training variable (first four codebook columns), arranged
 # in a 2 x 2 panel and titled with the matching column name of the data.
 par(mfrow = c(2, 2))
 for (feature in 1:4) {
   plot(
     som_model,
     type = "property",
     property = som_model$codes[, feature],
     main = colnames(som_model$data)[feature],
     palette.name = coolBlueHotRed
   )
 }
 # Show one variable on its original (unscaled) scale: the mean of the raw
 # values of all observations mapped to each unit.
 var_idx <- 2  # column of data_train to plot (named var_idx so stats::var is not masked)
 unit_means <- tapply(
   as.numeric(data_train[, var_idx]),
   som_model$unit.classif,
   mean
 )
 # Units that received no observation do not appear in unit_means. Start from
 # one NA slot per map unit and fill in by unit number, so the vector lines up
 # with the grid instead of being shorter than it.
 var_unscaled <- rep(NA_real_, nrow(som_model$grid$pts))
 var_unscaled[as.integer(names(unit_means))] <- unit_means
 plot(
   som_model,
   type = "property",
   property = var_unscaled,
   main = names(data_train)[var_idx],
   palette.name = coolBlueHotRed
 )


 # Clustering cells: within-cluster sum of squares for 1 to 15 clusters of the
 # codebook vectors, plotted to look for an elbow.
 mydata <- som_model$codes
 # k = 1: total sum of squares around the column means.
 single_cluster_ss <- (nrow(mydata) - 1) * sum(apply(mydata, 2, stats::var))
 # k = 2..15: summed within-cluster sum of squares of a k-means fit.
 kmeans_ss <- vapply(
   2:15,
   function(k) sum(kmeans(mydata, centers = k)$withinss),
   numeric(1)
 )
 wss <- c(single_cluster_ss, kmeans_ss)
 plot(wss)
 # Group the codebook vectors with hierarchical clustering and cut the tree
 # into 8 clusters.
 codebook_tree <- hclust(dist(som_model$codes))
 som_cluster <- cutree(codebook_tree, 8)

 # One colour per cluster.
 pretty_palette <- c(
   "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728",
   "#9467bd", "#8c564b", "#e377c2", "#76b7b2"
 )

 # Codebook plot on the left, cluster map with boundaries on the right.
 par(mfrow = c(1, 2))
 plot(som_model, type = "codes")
 plot(
   som_model,
   type = "mapping",
   bgcol = pretty_palette[som_cluster],
   main = "Clusters"
 )
 add.cluster.boundaries(som_model, som_cluster)



 # Coordinates of every map unit, as a data frame with columns x and y.
 grid <- som_model$grid$pts
 gridFrame <- data.frame(x = grid[, 1], y = grid[, 2])
 # Looking up the unit nearest to a mouse click needs a Shiny `input` object
 # and shiny::nearPoints(). Neither exists when this file is run as a plain
 # script, where the unguarded call aborted the run, so the lookup is only
 # attempted when both are available.
 if (exists("input") && requireNamespace("shiny", quietly = TRUE)) {
   print(shiny::nearPoints(gridFrame, input$plot_click, xvar = "x", yvar = "y"))
 } else {
   message("Skipping click lookup: not running inside a Shiny session.")
 }
    
    
 # Compare the observations mapped to one map unit with the whole population.
 # Each observation is tagged with the unit it was assigned to.
 population <- cbind(data_train, som = som_model$unit.classif)

 # Unit to inspect, derived from a grid position.
 # NOTE(review): the original computed `index` but then filtered an undefined
 # object (`data_combine`) on a hard-coded unit 68; confirm that `index` is
 # the intended unit number.
 x <- 3
 y <- 3
 column <- 20
 index <- x + y * column
 cell <- population[population$som == index, ]
 if (nrow(cell) == 0) {
   message("No observations are mapped to unit ", index, ".")
 }

 # Stack both groups with a label column. rep() keeps the label length equal
 # to the row count, so an empty cell does not make cbind() fail.
 combined <- rbind(
   cbind(population, label = rep("population", nrow(population))),
   cbind(cell, label = rep("cell", nrow(cell)))
 )

 # Overlaid density of one column of `combined`, split by label.
 density_by_label <- function(col_name) {
   ggplot(combined, aes(x = .data[[col_name]], fill = label)) +
     geom_density(alpha = 0.2) +
     xlab(col_name)
 }

 # par(mfrow) has no effect on ggplot output; the 2 x 2 layout is produced by
 # grid.arrange() below.
 feature_names <- names(combined)[1:4]
 p1 <- density_by_label(feature_names[1])
 p2 <- density_by_label(feature_names[2])
 p3 <- density_by_label(feature_names[3])
 p4 <- density_by_label(feature_names[4])
 gridExtra::grid.arrange(p1, p2, p3, p4, ncol = 2)
	library(kohonen)
	library(Hmisc) # useful for multi histogram

	# Load the raw client extract. The location lives in one variable so it only
	# has to be edited in a single place when the script runs on another machine.
	client_path <- "C:/Users/eye1/Desktop/Customer Data.csv"
	client <- read.csv(file = client_path)

	# Feature selection: the four columns used for training.
	# (alternative demo input: data_train <- iris[c(1,2,3,4)])
	feature_cols <- c(
	  "NO_OF_PRODUCTS_6", "Balance", "LOGIN_DAYS_L3M", "ONLINE_TRANSACTION_L3M"
	)
	data_train <- client[, feature_cols]

	# Explore the data
	hist.data.frame(data_train) # histogram of each variable; see also pairs(data_train)
	# Refer to the column by bare name inside aes(): `data_train$...` bypasses the
	# data frame handed to ggplot() and breaks as soon as the data is replaced.
	ggplot(data_train, aes(x = ONLINE_TRANSACTION_L3M)) +
	  geom_histogram() +
	  scale_x_continuous(limits = c(0, 100))

	# Clean data
	# Missing values are treated as zero for every selected column.
	data_train[is.na(data_train)] <- 0
	# Floor the balance at 1 before taking the log so log() is always defined
	# (values below 1, including the zeros introduced above, map to log(1) = 0).
	data_train$log_balance <- log(pmax(data_train$Balance, 1))
	data_train$Balance <- NULL
	# Cap the transaction count at 30.
	data_train$ONLINE_TRANSACTION_L3M <- pmin(data_train$ONLINE_TRANSACTION_L3M, 30)
	# data_train <- sapply(data_train, function(x) scale(x, center = T, scale = T))






	# Demo input: the four numeric iris measurements, with the whole table
	# repeated 20 times. This replaces whatever data_train held before.
	data_train <- iris[, 1:4]
	data_train <- data_train[rep.int(seq_len(nrow(data_train)), 20L), ]

	# Keep a copy of the exact training table on disk.
	write.csv(data_train, file = "abc.csv")

	# The SOM is trained on a numeric matrix. Every column is centred and scaled
	# so that all variables carry equal weight during training.
	data_train_matrix <- as.matrix(scale(data_train, center = TRUE, scale = TRUE))

	# Train the self-organising map on the scaled matrix.
	# The grid has to be sized before training: here 20 x 20 units laid out
	# hexagonally, with a circular neighbourhood. keep.data = TRUE is passed so
	# the training data stays attached to the model for the later plots.
	# Iteration count and learning rate are left at the package defaults.
	som_model <- som(
	  data_train_matrix,
	  grid = somgrid(xdim = 20, ydim = 20, topo = "hexagonal"),
	  keep.data = TRUE,
	  n.hood = "circular"
	)

	# Standard diagnostic plots, drawn one after another:
	#   "changes"         - training progress, to check that the distance drops
	#   "count"           - number of points per cell (grey cells are empty)
	#   "dist.neighbours" - distance of each cell to its neighbouring cells
	#   "codes"           - the codebook vectors themselves
	for (diagnostic in c("changes", "count", "dist.neighbours", "codes")) {
	  plot(som_model, type = diagnostic)
	}

	#Show property mapped
	# Colour palette running from blue (low) to red (high).
	#
	# n:     number of colours to generate.
	# alpha: opacity passed on to rainbow(); 1 means fully opaque.
	# Returns a character vector of n colours. rev() is used instead of `[n:1]`
	# so that n = 0 gives an empty vector rather than NA.
	coolBlueHotRed <- function(n, alpha = 1) {
	  rev(rainbow(n, end = 4 / 6, alpha = alpha))
	}
	# One heat map per training variable (first four codebook columns), arranged
	# in a 2 x 2 panel and titled with the matching column name of the data.
	par(mfrow = c(2, 2))
	for (feature in 1:4) {
	  plot(
	    som_model,
	    type = "property",
	    property = som_model$codes[, feature],
	    main = colnames(som_model$data)[feature],
	    palette.name = coolBlueHotRed
	  )
	}
	# Show one variable on its original (unscaled) scale: the mean of the raw
	# values of all observations mapped to each unit.
	var_idx <- 2 # column of data_train to plot (named var_idx so stats::var is not masked)
	unit_means <- tapply(
	  as.numeric(data_train[, var_idx]),
	  som_model$unit.classif,
	  mean
	)
	# Units that received no observation do not appear in unit_means. Start from
	# one NA slot per map unit and fill in by unit number, so the vector lines up
	# with the grid instead of being shorter than it.
	var_unscaled <- rep(NA_real_, nrow(som_model$grid$pts))
	var_unscaled[as.integer(names(unit_means))] <- unit_means
	plot(
	  som_model,
	  type = "property",
	  property = var_unscaled,
	  main = names(data_train)[var_idx],
	  palette.name = coolBlueHotRed
	)


	# Clustering cells: within-cluster sum of squares for 1 to 15 clusters of the
	# codebook vectors, plotted to look for an elbow.
	mydata <- som_model$codes
	# k = 1: total sum of squares around the column means.
	single_cluster_ss <- (nrow(mydata) - 1) * sum(apply(mydata, 2, stats::var))
	# k = 2..15: summed within-cluster sum of squares of a k-means fit.
	kmeans_ss <- vapply(
	  2:15,
	  function(k) sum(kmeans(mydata, centers = k)$withinss),
	  numeric(1)
	)
	wss <- c(single_cluster_ss, kmeans_ss)
	plot(wss)
	# Group the codebook vectors with hierarchical clustering and cut the tree
	# into 8 clusters.
	codebook_tree <- hclust(dist(som_model$codes))
	som_cluster <- cutree(codebook_tree, 8)

	# One colour per cluster.
	pretty_palette <- c(
	  "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728",
	  "#9467bd", "#8c564b", "#e377c2", "#76b7b2"
	)

	# Codebook plot on the left, cluster map with boundaries on the right.
	par(mfrow = c(1, 2))
	plot(som_model, type = "codes")
	plot(
	  som_model,
	  type = "mapping",
	  bgcol = pretty_palette[som_cluster],
	  main = "Clusters"
	)
	add.cluster.boundaries(som_model, som_cluster)



	# Coordinates of every map unit, as a data frame with columns x and y.
	grid <- som_model$grid$pts
	gridFrame <- data.frame(x = grid[, 1], y = grid[, 2])
	# Looking up the unit nearest to a mouse click needs a Shiny `input` object
	# and shiny::nearPoints(). Neither exists when this file is run as a plain
	# script, where the unguarded call aborted the run, so the lookup is only
	# attempted when both are available.
	if (exists("input") && requireNamespace("shiny", quietly = TRUE)) {
	  print(shiny::nearPoints(gridFrame, input$plot_click, xvar = "x", yvar = "y"))
	} else {
	  message("Skipping click lookup: not running inside a Shiny session.")
	}


	# Compare the observations mapped to one map unit with the whole population.
	# Each observation is tagged with the unit it was assigned to.
	population <- cbind(data_train, som = som_model$unit.classif)

	# Unit to inspect, derived from a grid position.
	# NOTE(review): the original computed `index` but then filtered an undefined
	# object (`data_combine`) on a hard-coded unit 68; confirm that `index` is
	# the intended unit number.
	x <- 3
	y <- 3
	column <- 20
	index <- x + y * column
	cell <- population[population$som == index, ]
	if (nrow(cell) == 0) {
	  message("No observations are mapped to unit ", index, ".")
	}

	# Stack both groups with a label column. rep() keeps the label length equal
	# to the row count, so an empty cell does not make cbind() fail.
	combined <- rbind(
	  cbind(population, label = rep("population", nrow(population))),
	  cbind(cell, label = rep("cell", nrow(cell)))
	)

	# Overlaid density of one column of `combined`, split by label.
	density_by_label <- function(col_name) {
	  ggplot(combined, aes(x = .data[[col_name]], fill = label)) +
	    geom_density(alpha = 0.2) +
	    xlab(col_name)
	}

	# par(mfrow) has no effect on ggplot output; the 2 x 2 layout is produced by
	# grid.arrange() below.
	feature_names <- names(combined)[1:4]
	p1 <- density_by_label(feature_names[1])
	p2 <- density_by_label(feature_names[2])
	p3 <- density_by_label(feature_names[3])
	p4 <- density_by_label(feature_names[4])
	gridExtra::grid.arrange(p1, p2, p3, p4, ncol = 2)