# ------------------------------------------------------------------
# DAY 14: CLUSTERING EXERCISES
# ------------------------------------------------------------------
# ------------------------------------------------------------------
# EXERCISE 1
# Use k-means clustering to group the observations in the mtcars data. Is it
# important to standardise these data first? Vary the number of clusters and
# choose an appropriate value for k. Interpret the clusters.
# ------------------------------------------------------------------
(mtcars.kmeans <- kmeans(mtcars, 3, nstart = 20)) # first attempt on the unscaled data
summary(mtcars) # the features are on very different scales, so we should
# standardise them before clustering to treat them more equally.
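# A quick check of the column spreads supports this: disp and hp are orders of
# magnitude larger than wt or drat, so they would dominate the Euclidean
# distances that k-means uses (an illustrative check, not part of the original
# solution).
sapply(mtcars, sd)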
scaled.cars <- scale(mtcars)
ratios <- sapply(1:10, function(k) {
  with(kmeans(scaled.cars, k, nstart = 20), betweenss / totss)
})
plot(1:10, ratios) # the between/total SS ratio flattens around 3-4 clusters, so that range looks optimal
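# As a cross-check on the elbow plot above, one could also compare average
# silhouette widths across k (a sketch using cluster::silhouette; not part of
# the original solution).
library(cluster)
sil.widths <- sapply(2:10, function(k) {
  km <- kmeans(scaled.cars, k, nstart = 20)
  mean(silhouette(km$cluster, dist(scaled.cars))[, "sil_width"])
})
plot(2:10, sil.widths, type = "b",
     xlab = "k", ylab = "average silhouette width")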
clusters <- kmeans(scaled.cars, 4, nstart = 20) # nstart = 20 for stable centres, as above
clusters$centers
# Cluster 1: poor mpg, a high number of cylinders, relatively light weight.
# Cluster 2: the worst mpg, the highest number of cylinders and horsepower,
#   the greatest weight, and mostly automatic transmissions.
# Cluster 3: the best mpg, the fewest cylinders, the least horsepower, the
#   lightest weight, and mostly manual transmissions.
# Cluster 4: roughly average on everything.
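# To ground these interpretations, it helps to list which cars land in each
# cluster and to compare the cluster means on the original (unscaled) scale
# (a small illustrative addition to the original solution).
split(rownames(mtcars), clusters$cluster)
aggregate(mtcars, by = list(cluster = clusters$cluster), FUN = mean)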
# ------------------------------------------------------------------
# EXERCISE 2
# Use the animals data from the cluster package to perform hierarchical clustering.
# Try different distance measures and select the one that makes the most sense.
# Check ?dist to ensure that you understand how missing values are treated.
# ------------------------------------------------------------------
library(cluster)
animals # note that some entries are NA
rownames(animals) # the animal names are abbreviated
animals <- na.omit(animals) # drop rows with missing values
kmeans(animals, 2) # a quick k-means pass with 2 clusters, for comparison
animals.dist <- dist(animals, method = "euclidean")
(animals.hclust <- hclust(animals.dist))
plot(animals.hclust)
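# The exercise asks for different distance measures. A sketch comparing a few
# of them (note that ?dist says rows with NAs are not dropped: missing pairs
# are excluded and the remaining contributions are scaled up, so na.omit()
# above is optional):
data(animals)                                       # reload the full data, NAs included
animals.man <- dist(animals, method = "manhattan")
animals.gow <- daisy(animals - 1, metric = "gower") # recode 1/2 as 0/1 binary traits
plot(hclust(animals.man), main = "Manhattan distance")
plot(hclust(animals.gow), main = "Gower dissimilarity")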
# ------------------------------------------------------------------
# EXERCISE 3
# Perform hierarchical clustering on the mtcars data. Plot the dendrogram. Does
# the structure make sense?
# ------------------------------------------------------------------
scaled.cars # the standardised mtcars data from Exercise 1
cars.dist <- dist(scaled.cars, method = "euclidean")
(cars.hclust <- hclust(cars.dist))
plot(cars.hclust)
# I think so: the Toyota Corolla and the Volvo fall in the same cluster, and
# another cluster groups the luxury sports cars together.
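# One way to check whether the structure makes sense is to cut the tree into
# four groups and cross-tabulate against the k-means solution from Exercise 1
# (an illustrative cross-check, assuming `clusters` is still in scope).
cars.groups <- cutree(cars.hclust, k = 4)
table(hclust = cars.groups, kmeans = clusters$cluster)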
# ------------------------------------------------------------------
# EXERCISE 4
# Perform hierarchical clustering on the USArrests data. Try all possible linkage
# methods and see what effect they have on the results.
# ------------------------------------------------------------------
scaled.arrests <- scale(USArrests)
arrests.dist <- dist(scaled.arrests, method = "euclidean")
(arrests.hclust <- hclust(arrests.dist, method = "complete"))
plot(arrests.hclust)
# In complete-linkage clustering, the similarity of two clusters is the
# similarity of their most dissimilar members (i.e. the cluster distance is the
# maximum pairwise distance).
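# The exercise asks for all possible linkage methods; a sketch looping over the
# methods hclust() supports and plotting each dendrogram for comparison:
linkages <- c("single", "complete", "average", "ward.D", "ward.D2",
              "mcquitty", "median", "centroid")
par(mfrow = c(2, 4))
for (m in linkages) {
  plot(hclust(arrests.dist, method = m), main = m, labels = FALSE)
}
par(mfrow = c(1, 1))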