Skip to content

Instantly share code, notes, and snippets.

@yabyzq
Created February 19, 2017 13:23
Show Gist options
  • Save yabyzq/997ee3c591a1af01fb3857fbc19ede4d to your computer and use it in GitHub Desktop.
Save yabyzq/997ee3c591a1af01fb3857fbc19ede4d to your computer and use it in GitHub Desktop.
basic SOM
library(kohonen)
library(Hmisc) # useful for multi histogram
library(ggplot2) # ggplot(), aes(), geom_histogram(), geom_density(), xlab()
library(gridExtra) # grid.arrange()
# Load the client data.
# NOTE(review): absolute, machine-specific path; the script only runs where
# this file exists.
client <- read.csv(file = "C:/Users/eye1/Desktop/Customer Data.csv")

# Feature selection: keep the four columns used to train the map.
feature_cols <- c(
  "NO_OF_PRODUCTS_6", "Balance", "LOGIN_DAYS_L3M", "ONLINE_TRANSACTION_L3M"
)
data_train <- client[, feature_cols]
# Alternative demo input: data_train <- iris[c(1, 2, 3, 4)]

# Explore the data.
hist.data.frame(data_train) # histogram of each variable
# pairs(data_train)

# Histogram of online transactions with the x axis limited to 0-100.
# The column is named inside aes() so ggplot2 looks it up in the `data`
# argument instead of capturing an external vector via `data_train$...`.
ggplot(data_train, aes(x = ONLINE_TRANSACTION_L3M)) +
  geom_histogram() +
  scale_x_continuous(limits = c(0, 100))
# Clean data.
# Every missing value, in any column, is replaced by zero.
data_train[is.na(data_train)] <- 0
# Balances below 1 are raised to 1 so the logarithm is defined and
# non-negative; the raw Balance column is then replaced by log_balance.
data_train$log_balance <- log(pmax(data_train$Balance, 1))
data_train$Balance <- NULL
# Online transaction counts are capped at 30.
data_train$ONLINE_TRANSACTION_L3M <- pmin(data_train$ONLINE_TRANSACTION_L3M, 30)
# Disabled alternative that standardises every column at this point:
# data_train <- sapply(data_train, function(x) scale(x, center = T, scale = T))
# Demo override: the training data is replaced by the four numeric iris
# measurements, with the full set of rows repeated 20 times.
# NOTE(review): this discards whatever `data_train` held before this point.
iris_measurements <- iris[c(1, 2, 3, 4)]
row_index <- rep(seq_len(nrow(iris_measurements)), 20)
data_train <- iris_measurements[row_index, ]
# Write the training data (including row names) to the working directory.
write.csv(data_train, "abc.csv")
# Convert the training data frame to a matrix, centring and scaling every
# variable so that each one carries equal importance during SOM training.
data_train_matrix <- as.matrix(scale(data_train))

# The size and topology of the training grid have to be specified before
# training: here a 20 x 20 hexagonal map.
som_grid <- somgrid(xdim = 20, ydim = 20, topo = "hexagonal")

# Train the SOM. Options for the number of iterations, the learning rates and
# the neighbourhood are available; they are left at their defaults here.
# `n.hood = "circular"` is deliberately not passed: kohonen >= 3.0 rejects it
# as an unused argument, and in kohonen 2.x "circular" is already the default
# neighbourhood shape for hexagonal maps.
# NOTE(review): confirm against the installed kohonen version.
som_model <- som(data_train_matrix, grid = som_grid, keep.data = TRUE)
# Diagnostic views of the trained map, drawn one after another:
#   "changes"         - per-iteration change, to check that the distance drops
#   "count"           - number of points per cell (grey cells are empty)
#   "dist.neighbours" - distance of each cell against its neighbouring cells
#   "codes"           - the codebook vector held by each cell
for (plot_type in c("changes", "count", "dist.neighbours", "codes")) {
  plot(som_model, type = plot_type)
}
#Show property mapped
# Colour palette whose first colour is blue and whose last colour is red.
#
# n:     number of colours to generate.
# alpha: opacity forwarded to rainbow(); defaults to 1.
# Returns a character vector of n colours: rainbow() restricted to hues
# 0..4/6, in reverse order. rev() is used instead of indexing with `n:1` so
# that n = 0 yields an empty vector rather than NA.
coolBlueHotRed <- function(n, alpha = 1) {
  rev(rainbow(n, end = 4 / 6, alpha = alpha))
}
# Heat map of each of the first four training variables across the map,
# arranged in a 2 x 2 panel layout.
# kohonen >= 3.0 stores `codes` and `data` as lists holding one matrix per
# data layer, while older releases store the matrices directly. Unwrap the
# first layer when needed so the column indexing works with either layout.
som_codes <- som_model$codes
if (is.list(som_codes)) {
  som_codes <- som_codes[[1]]
}
som_data <- som_model$data
if (is.list(som_data)) {
  som_data <- som_data[[1]]
}
par(mfrow = c(2, 2))
for (i in seq_len(4)) {
  plot(
    som_model,
    type = "property",
    property = som_codes[, i],
    main = colnames(som_data)[i],
    palette.name = coolBlueHotRed
  )
}
# Show one variable on its original (unscaled) scale.
# Named `var_idx` rather than `var` so it does not shadow stats::var().
var_idx <- 2 # column of data_train to plot

# Mean of the unscaled variable over the observations mapped to each unit.
# The result must hold exactly one value per map unit, in unit order. Units
# that received no observations are absent from the grouped means, so they
# are filled with NA instead of being dropped, which would shift every later
# value onto the wrong cell.
n_units <- nrow(som_model$grid$pts)
unit_means <- tapply(
  as.numeric(data_train[, var_idx]),
  som_model$unit.classif,
  mean
)
var_unscaled <- rep(NA_real_, n_units)
var_unscaled[as.integer(names(unit_means))] <- as.vector(unit_means)

plot(
  som_model,
  type = "property",
  property = var_unscaled,
  main = names(data_train)[var_idx],
  palette.name = coolBlueHotRed
)
# Clustering cells: within-cluster sum of squares of the codebook vectors for
# k = 1..15 clusters, plotted to help choose the number of clusters.
mydata <- som_model$codes
# kohonen >= 3.0 stores codes as a list with one matrix per data layer;
# unwrap the first layer so the matrix operations below apply.
if (is.list(mydata)) {
  mydata <- mydata[[1]]
}

max_k <- 15
wss <- numeric(max_k) # preallocated instead of grown inside the loop
# k = 1: total sum of squares around the column means. stats::var is named
# explicitly so the call does not depend on how a global `var` is resolved.
wss[1] <- (nrow(mydata) - 1) * sum(apply(mydata, 2, stats::var))
# k >= 2: total within-cluster sum of squares from k-means.
# kmeans() starts from random centres, so values vary between runs unless a
# seed is set beforehand.
for (k in 2:max_k) {
  wss[k] <- kmeans(mydata, centers = k)$tot.withinss
}
plot(wss)
## Use hierarchical clustering to group the codebook vectors into 8 clusters.
# kohonen >= 3.0 stores codes as a list with one matrix per data layer;
# unwrap the first layer so dist() receives a numeric matrix.
cluster_codes <- som_model$codes
if (is.list(cluster_codes)) {
  cluster_codes <- cluster_codes[[1]]
}
som_cluster <- cutree(hclust(dist(cluster_codes)), 8)

# Plot these results: codebook vectors on the left, cluster membership with
# boundaries on the right. The palette has one colour per cluster.
pretty_palette <- c(
  "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728",
  "#9467bd", "#8c564b", "#e377c2", "#76b7b2"
)
par(mfrow = c(1, 2))
plot(som_model, type = "codes")
plot(
  som_model,
  type = "mapping",
  bgcol = pretty_palette[som_cluster],
  main = "Clusters"
)
add.cluster.boundaries(som_model, som_cluster)
# Coordinates of the map units as a data frame with columns x and y.
grid <- som_model$grid$pts
gridFrame <- data.frame(x = grid[, 1], y = grid[, 2])

# Print the unit(s) nearest to a plot click.
# NOTE(review): `input$plot_click` looks like a Shiny input and nearPoints()
# is presumably shiny::nearPoints(). No `input` object exists when this file
# runs as a plain script, so the lookup is skipped there instead of aborting
# with "object 'input' not found".
if (exists("input") && requireNamespace("shiny", quietly = TRUE)) {
  print(shiny::nearPoints(gridFrame, input$plot_click, xvar = "x", yvar = "y"))
}
# Compare the observations mapped to one map cell with the whole population.
# Each training row is tagged with the unit it was assigned to.
population <- cbind(data_train, som = som_model$unit.classif)

# Unit to inspect, addressed by grid position.
# NOTE(review): `x + y * column` is kept from the original formula; confirm
# it matches kohonen's unit numbering (in particular whether y is 0-based).
x <- 3
y <- 3
column <- som_model$grid$xdim # number of grid columns (20 for this map)
index <- x + y * column

# Rows mapped to the selected unit. The filter uses `population` and the
# computed `index`; the previous version referenced an undefined object and a
# hard-coded unit number.
cell <- population[population$som == index, ]
if (nrow(cell) == 0) {
  warning("No observations are mapped to unit ", index, call. = FALSE)
}

# Stack both groups with a label column. rep() sizes the label to the number
# of rows so that an empty cell does not break cbind().
combined <- rbind(
  cbind(population, label = rep("population", nrow(population))),
  cbind(cell, label = rep("cell", nrow(cell)))
)

# Density of one column of `data`, split by label.
# `.data[[col_name]]` selects the column by name (needs ggplot2 >= 3.0.0).
density_plot <- function(data, col_name) {
  ggplot(data, aes(x = .data[[col_name]])) +
    geom_density(aes(fill = label), alpha = 0.2) +
    xlab(col_name)
}

feature_names <- names(combined)[1:4]
p1 <- density_plot(combined, feature_names[1])
p2 <- density_plot(combined, feature_names[2])
p3 <- density_plot(combined, feature_names[3])
p4 <- density_plot(combined, feature_names[4])

# grid.arrange() lays out the four ggplot objects in a 2 x 2 grid. Base
# par(mfrow = ...) has no effect on ggplot output, so it is not set here.
grid.arrange(p1, p2, p3, p4, ncol = 2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment