Created
June 16, 2016 19:39
-
-
Save arthurwuhoo/f802c1318f616b37b0d911d606b15e07 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ------------------------------------------------------------------
# EXERCISE 1
# Use k-means clustering to group the observations in the mtcars data. Is it
# important to standardise these data first? Vary the number of clusters and
# choose an appropriate value for k. Interpret the clusters.
# ------------------------------------------------------------------
set.seed(42)  # kmeans uses random starts; fix the seed so results reproduce

# Unscaled fit for comparison: large-range columns (e.g. disp, hp) dominate
# the euclidean distances here.
(mtcars.kmeans <- kmeans(mtcars, 3, nstart = 20))
summary(mtcars)  # features are on very different scales -> standardize so
                 # they are treated somewhat equally
scaled.cars <- scale(mtcars)

# Elbow plot: between-cluster sum of squares as a share of the total,
# for k = 1..10. vapply pins the return type to one numeric per k.
ratios <- vapply(
  1:10,
  function(k) with(kmeans(scaled.cars, k, nstart = 20), betweenss / totss),
  numeric(1)
)
plot(1:10, ratios)  # it seems like 3-4 clusters is optimal

# Final model; nstart = 20 avoids interpreting a poor local optimum from a
# single random start.
clusters <- kmeans(scaled.cars, 4, nstart = 20)
clusters$centers
# Interpretation of the centers (standardized units):
# - cluster 1: poor mpg, a high number of cylinders, a lighter weight.
# - cluster 2: the worst mpg, the highest number of cylinders + horsepower,
#   the greatest weight, more likely to be automatic.
# - cluster 3: the best mpg, the lowest number of cylinders, the least
#   horsepower, the lightest weight, the most likely to be manual.
# - cluster 4: pretty much average on everything.
# ------------------------------------------------------------------
# EXERCISE 2
# Use the animals data from the cluster package to perform hierarchical
# clustering. Try different distance measures and select the one that makes
# the most sense. Check ?dist to ensure that you understand how missing
# values are treated.
# ------------------------------------------------------------------
library(cluster)
animals            # attributes coded 1/2; several NA values are present
rownames(animals)  # row names are abbreviated animal names

# dist() handles NAs itself: pairs with a missing value are excluded and the
# remaining contributions are scaled up (see ?dist), so we can keep every
# row for the hierarchical clustering. kmeans(), however, cannot cope with
# NA, so use complete cases only for this quick k-means comparison — and do
# not overwrite the `animals` dataset while doing so.
animals.complete <- na.omit(animals)
kmeans(animals.complete, 2)

# Euclidean distance on the full data, relying on dist()'s NA treatment.
animals.dist <- dist(animals, method = "euclidean")
(animals.hclust <- hclust(animals.dist))
plot(animals.hclust)
# ------------------------------------------------------------------
# EXERCISE 3
# Perform hierarchical clustering on the mtcars data. Plot the dendrogram.
# Does the structure make sense?
# ------------------------------------------------------------------
# Recompute the standardized data so this section is self-contained
# (identical to the scaled mtcars built in Exercise 1).
scaled.cars <- scale(mtcars)
cars.dist <- dist(scaled.cars, method = "euclidean")
(cars.hclust <- hclust(cars.dist))
plot(cars.hclust)
# I think so: the Toyota Corolla and the Volvo end up in the same cluster of
# small economy cars, and another cluster groups the luxury sports cars.
# ------------------------------------------------------------------
# EXERCISE 4
# Perform hierarchical clustering on the USArrests data. Try all possible
# linkage methods and see what effect they have on the results.
# ------------------------------------------------------------------
scaled.arrests <- scale(USArrests)
arrests.dist <- dist(scaled.arrests, method = "euclidean")

# Fit every linkage method hclust() supports, not just "complete",
# as the exercise asks.
linkage.methods <- c("ward.D", "ward.D2", "single", "complete",
                     "average", "mcquitty", "median", "centroid")
arrests.hclust.all <- lapply(
  linkage.methods,
  function(m) hclust(arrests.dist, method = m)
)
names(arrests.hclust.all) <- linkage.methods

# One dendrogram per linkage so the differences are easy to compare:
# single linkage tends to chain, while complete/Ward give more compact,
# balanced clusters.
for (m in linkage.methods) {
  plot(arrests.hclust.all[[m]], main = paste("Linkage:", m))
}

# Keep the complete-linkage fit as the headline result, as before.
(arrests.hclust <- arrests.hclust.all[["complete"]])
plot(arrests.hclust)
# In complete-link (complete-linkage) clustering, the similarity of two
# clusters is the similarity of their most dissimilar members.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.