Created
February 19, 2017 13:23
-
-
Save yabyzq/997ee3c591a1af01fb3857fbc19ede4d to your computer and use it in GitHub Desktop.
basic SOM
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(kohonen)
library(Hmisc) # useful for multi histogram
library(ggplot2) # ggplot()/aes() are used below; recent Hmisc versions no longer attach ggplot2
# Load the client data.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
client <- read.csv(file = "C:/Users/eye1/Desktop/Customer Data.csv")

# Feature selection: keep only the four columns used for training.
data_train <- client[, c("NO_OF_PRODUCTS_6", "Balance", "LOGIN_DAYS_L3M", "ONLINE_TRANSACTION_L3M")]
# data_train <- iris[c(1,2,3,4)]

# Explore the data
hist.data.frame(data_train) # histogram of each variable
# pairs(data_train)

# Refer to the column by name inside aes() instead of `data_train$...`, and
# print() explicitly so the plot is also drawn when the script is source()d.
print(
  ggplot(data_train, aes(x = ONLINE_TRANSACTION_L3M)) +
    geom_histogram() +
    scale_x_continuous(limits = c(0, 100))
)

# Clean data
data_train[is.na(data_train)] <- 0              # missing values become 0
data_train$Balance[data_train$Balance < 1] <- 1 # floor at 1 so log() below is >= 0
data_train$log_balance <- log(data_train$Balance)
data_train$Balance <- NULL                      # keep only the log-transformed balance
# Cap online transactions at 30.
data_train$ONLINE_TRANSACTION_L3M[data_train$ONLINE_TRANSACTION_L3M > 30] <- 30
# data_train <- sapply(data_train, function(x) scale(x, center = T, scale = T))
# NOTE(review): this replaces whatever `data_train` held before with the four
# numeric iris columns, so any earlier client-data preparation is discarded.
# Confirm this demo override is intended.
data_train <- iris[c(1, 2, 3, 4)]

# Repeat every row 20 times (150 rows -> 3000 rows).
row_index <- rep(seq_len(nrow(data_train)), times = 20)
data_train <- data_train[row_index, ]

# Dump the training frame to the working directory (row names included).
write.csv(data_train, "abc.csv")

# Change the data frame with training data to a matrix.
# Also center and scale all variables to give them equal importance during
# the SOM training process.
data_train_matrix <- as.matrix(scale(data_train))
# Create the SOM grid - you generally have to specify the size of the
# training grid prior to training the SOM. Here: a 20 x 20 hexagonal grid.
som_grid <- somgrid(xdim = 20, ydim = 20, topo = "hexagonal")

# Finally, train the SOM; options for the number of iterations,
# the learning rates, and the neighbourhood are available.
# The former `n.hood = "circular"` argument is omitted: in kohonen 2.x that was
# already the default for hexagonal grids, and kohonen >= 3.0 rejects the
# argument as unused.
# NOTE(review): training uses random numbers; call set.seed() beforehand if
# reproducible maps are needed.
som_model <- som(data_train_matrix, grid = som_grid, keep.data = TRUE)
# Standard diagnostic plots, drawn in this order:
#   "changes"         - training progress, to check that the distance drops
#   "count"           - number of observations per cell (empty cells are grey)
#   "dist.neighbours" - distance of each cell to its neighbours
#   "codes"           - the codebook vectors
for (plot_type in c("changes", "count", "dist.neighbours", "codes")) {
  plot(som_model, type = plot_type)
}
#Show property mapped | |
# Palette function running from blue (low) to red (high).
#   n     - number of colours requested
#   alpha - opacity passed through to rainbow()
# Returns a character vector of n colours. rev() is used rather than `[n:1]`
# because `0:1` counts upwards and would turn an empty palette into NA.
coolBlueHotRed <- function(n, alpha = 1) {
  rev(rainbow(n, end = 4 / 6, alpha = alpha))
}
# One heatmap per training variable, laid out on a 2 x 2 page.
par(mfrow = c(2, 2))

# kohonen 2.x stores the codebook as a matrix, kohonen >= 3.0 as a list with
# one matrix per layer; normalise to a matrix so column indexing works in both.
som_codes <- som_model$codes
if (is.list(som_codes)) {
  som_codes <- som_codes[[1]]
}

# Titles come from the codebook column names (`som_model$data` is also a list
# in kohonen >= 3.0, so colnames() on it would be NULL there).
# NOTE(review): assumes the codebook keeps the training data's column names.
for (i in 1:4) {
  plot(som_model, type = "property", property = som_codes[, i],
       main = colnames(som_codes)[i], palette.name = coolBlueHotRed)
}
# Show one variable on its original (unscaled) scale: mean per map unit.
var_index <- 2 # column of data_train to plot (named so it does not shadow stats::var)

# Mean of the raw variable for every unit that received at least one observation.
unit_means <- tapply(as.numeric(data_train[, var_index]), som_model$unit.classif, mean)

# Units with no observations are absent from `unit_means`; place the means at
# their unit index in a full-length vector (NA for empty units) so the values
# stay aligned with the grid cells.
var_unscaled <- rep(NA_real_, nrow(som_model$grid$pts))
var_unscaled[as.integer(names(unit_means))] <- unit_means

plot(som_model, type = "property", property = var_unscaled,
     main = names(data_train)[var_index], palette.name = coolBlueHotRed)
# Clustering cells: within-cluster sum of squares for k = 1..15 (elbow plot).
mydata <- som_model$codes
# kohonen >= 3.0 returns the codebook as a list of matrices; use the first layer.
if (is.list(mydata)) {
  mydata <- mydata[[1]]
}

max_k <- 15
wss <- numeric(max_k)
# k = 1: total sum of squares. stats::var is named explicitly so a global
# variable called `var` cannot get in the way.
wss[1] <- (nrow(mydata) - 1) * sum(apply(mydata, 2, stats::var))
# NOTE(review): kmeans() starts from random centres, so the curve varies
# between runs unless a seed is set.
for (i in 2:max_k) {
  wss[i] <- sum(kmeans(mydata, centers = i)$withinss)
}
plot(wss)
## Use hierarchical clustering to cluster the codebook vectors into 8 groups.
# kohonen >= 3.0 stores the codebook as a list of matrices; dist() needs a
# matrix, so take the first layer in that case.
cluster_codes <- som_model$codes
if (is.list(cluster_codes)) {
  cluster_codes <- cluster_codes[[1]]
}
som_cluster <- cutree(hclust(dist(cluster_codes)), 8)

# Plot these results: one colour per cluster (8 colours for 8 clusters).
pretty_palette <- c("#1f77b4", "#ff7f0e", "#2ca02c", "#d62728",
                    "#9467bd", "#8c564b", "#e377c2", "#76b7b2")
par(mfrow = c(1, 2))
plot(som_model, type = "codes")
plot(som_model, type = "mapping", bgcol = pretty_palette[som_cluster], main = "Clusters")
add.cluster.boundaries(som_model, som_cluster)
# Coordinates of the map units as a data frame (columns x and y).
grid <- som_model$grid$pts
gridFrame <- data.frame(x = grid[, 1], y = grid[, 2])

# Look up the unit(s) nearest to a plot click. `input$plot_click` only exists
# inside a Shiny server function, so skip the lookup when the script is run
# standalone instead of aborting with "object 'input' not found".
if (exists("input") && requireNamespace("shiny", quietly = TRUE)) {
  print(shiny::nearPoints(gridFrame, input$plot_click, xvar = "x", yvar = "y"))
}
# Attach the winning unit of every observation to the training data.
population <- cbind(data_train, som = som_model$unit.classif)

# NOTE(review): `index` (= 63 for these values) is computed but the selection
# below uses the hard-coded unit 68; confirm which unit is meant.
x <- 3
y <- 3
column <- 20
index <- x + y * column

# Observations mapped to the unit of interest. The original filtered on the
# undefined object `data_combine`; `population` is the frame being subset.
cell <- population[population$som == 68, ]

# Stack both groups with a label so they can be compared in one plot.
combined <- rbind(cbind(population, label = 'population'), cbind(cell, label = 'cell'))
# NOTE(review): par(mfrow) configures base graphics only; it is kept from the
# original, but the ggplot panels below are arranged by grid.arrange().
par(mfrow = c(2, 2))

# library() fails loudly when the package is missing, unlike require().
library(gridExtra)

# Density of one column of `combined`, split by `label` (population vs cell).
#   col_index - position of the column in `combined`
# The column is mapped by name through the .data pronoun rather than by
# indexing the global data frame inside aes().
density_panel <- function(col_index) {
  col_name <- names(combined)[col_index]
  ggplot(combined, aes(x = .data[[col_name]])) +
    geom_density(aes(fill = label), alpha = 0.2) +
    xlab(col_name)
}

p1 <- density_panel(1)
p2 <- density_panel(2)
p3 <- density_panel(3)
p4 <- density_panel(4)
grid.arrange(p1, p2, p3, p4, ncol = 2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment