primaryobjects · October 13, 2015 14:59
diff --git a/mmds-q5b.R b/mmds-q5b.R
 #
 # Mining Massive Datasets Quiz 5B
 #

 # Q1
 # We wish to cluster the following set of points: into 10 clusters. We initially choose each of the green points (25,125), (44,105), (29,97), (35,63), (55,63), (42,57), (23,40), (64,37), (33,22), and (55,20) as a centroid. Assign each of the gold points to their nearest centroid. (Note: the scales of the horizontal and vertical axes differ, so you really need to apply the formula for distance of points; you can't just "eyeball" it.) Then, recompute the centroids of each of the clusters. Do any of the points then get reassigned to a new cluster on the next round? Identify the true statement in the list below. Each statement refers either to a centroid AFTER recomputation of centroids (precise to one decimal place) or to a point that gets reclassified.

 # Setup data.
 centroids <- t(data.frame(c(25,125), c(44,105), c(29,97), c(35,63), c(55,63), c(42,57), c(23,40), c(64,37), c(33,22), c(55,20)))
 points <- t(data.frame(c(28,145), c(65,140), c(50,130), c(55,118), c(38,115), c(50,90), c(63,88), c(43,83), c(50,60), c(50,30)))
 points <- rbind(points, centroids)

 # Run k-means for 1 iteration.
 cl <- kmeans(points, centroids, 1, algorithm = 'Lloyd')

 cl$centers

 # Q2
 # When performing a k-means clustering, success depends very much on the initially chosen points. Suppose that we choose two centroids (a,b) = (5,10) and (c,d) = (20,5), and the data truly belongs to two rectangular clusters, as suggested by the following diagram:
 # Under what circumstances will the initial clustering be successful? That is, under what conditions will all the yellow points be assigned to the centroid (5,10), while all of the blue points are assigned to cluster (20,5))? Identify in the list below, a pair of rectangles (described by their upper left corner, UL, and their lower-right corner LR) that are successfully clustered.

 # Calculate euclidian distance from each point to centroid.
 # Centroids
 ab <- c(5, 10)
 cd <- c(20, 5)

 # Goes to yellow
 dist(t(data.frame(c(3,15), ab)))
 dist(t(data.frame(c(3,15), cd)))
 # Goes to blue
 dist(t(data.frame(c(13,7), ab)))
 dist(t(data.frame(c(13,7), cd)))


 # Goes to yellow
 dist(t(data.frame(c(6,7), ab)))
 dist(t(data.frame(c(6,7), cd)))
 # Goes to yellow
 dist(t(data.frame(c(11,4), ab)))
 dist(t(data.frame(c(11,4), cd)))
 # Goes to blue
 dist(t(data.frame(c(14,10), ab)))
 dist(t(data.frame(c(14,10), cd)))
 # Goes to blue
 dist(t(data.frame(c(23,6), ab)))
 dist(t(data.frame(c(23,6), cd)))


 # Goes to yellow
 dist(t(data.frame(c(7,8), ab)))
 dist(t(data.frame(c(7,8), cd)))
 # Goes to blue
 dist(t(data.frame(c(12,5), ab)))
 dist(t(data.frame(c(12,5), cd)))


 # Goes to yellow
 dist(t(data.frame(c(6,7), ab)))
 dist(t(data.frame(c(6,7), cd)))
 # Goes to yellow
 dist(t(data.frame(c(11,4), ab)))
 dist(t(data.frame(c(11,4), cd)))
 # Goes to yellow
 dist(t(data.frame(c(11,5), ab)))
 dist(t(data.frame(c(11,5), cd)))
 # Goes to blue
 dist(t(data.frame(c(17,2), ab)))
 dist(t(data.frame(c(17,2), cd)))

 # Q3
 # Suppose we apply the BALANCE algorithm with bids of 0 or 1 only, to a situation where advertiser A bids on query words x and y, while advertiser B bids on query words x and z. Both have a budget of $2. Identify in the list below a sequence of four queries that will certainly be handled optimally by the algorithm.

 # Bid goes to bidder with most money reminaing. Default to A. Go through eacy query until queries are filled and both bidders have $0 left.
 # xyzx


 # Q5
 # This bipartite graph:
 # Has several perfect matchings. Find all the perfect matchings and then identify, in the list below, a pair of edges that can appear together in a perfect matching.

 #a0/b0
 #a1/b2
 #a2/b4
 #a3/b1
 #a4/b3

 #a0/b1
 #a1/b3
 #a2/b0
 #a3/b2
 #a4/b4
	#
	# Mining Massive Datasets Quiz 5B
	#

	# Q1
	# We wish to cluster the following set of points: into 10 clusters. We initially choose each of the green points (25,125), (44,105), (29,97), (35,63), (55,63), (42,57), (23,40), (64,37), (33,22), and (55,20) as a centroid. Assign each of the gold points to their nearest centroid. (Note: the scales of the horizontal and vertical axes differ, so you really need to apply the formula for distance of points; you can't just "eyeball" it.) Then, recompute the centroids of each of the clusters. Do any of the points then get reassigned to a new cluster on the next round? Identify the true statement in the list below. Each statement refers either to a centroid AFTER recomputation of centroids (precise to one decimal place) or to a point that gets reclassified.

	# Setup data.
	centroids <- t(data.frame(c(25,125), c(44,105), c(29,97), c(35,63), c(55,63), c(42,57), c(23,40), c(64,37), c(33,22), c(55,20)))
	points <- t(data.frame(c(28,145), c(65,140), c(50,130), c(55,118), c(38,115), c(50,90), c(63,88), c(43,83), c(50,60), c(50,30)))
	points <- rbind(points, centroids)

	# Run k-means for 1 iteration.
	cl <- kmeans(points, centroids, 1, algorithm = 'Lloyd')

	cl$centers

	# Q2
	# When performing a k-means clustering, success depends very much on the initially chosen points. Suppose that we choose two centroids (a,b) = (5,10) and (c,d) = (20,5), and the data truly belongs to two rectangular clusters, as suggested by the following diagram:
	# Under what circumstances will the initial clustering be successful? That is, under what conditions will all the yellow points be assigned to the centroid (5,10), while all of the blue points are assigned to cluster (20,5))? Identify in the list below, a pair of rectangles (described by their upper left corner, UL, and their lower-right corner LR) that are successfully clustered.

	# Calculate euclidian distance from each point to centroid.
	# Centroids
	ab <- c(5, 10)
	cd <- c(20, 5)

	# Goes to yellow
	dist(t(data.frame(c(3,15), ab)))
	dist(t(data.frame(c(3,15), cd)))
	# Goes to blue
	dist(t(data.frame(c(13,7), ab)))
	dist(t(data.frame(c(13,7), cd)))


	# Goes to yellow
	dist(t(data.frame(c(6,7), ab)))
	dist(t(data.frame(c(6,7), cd)))
	# Goes to yellow
	dist(t(data.frame(c(11,4), ab)))
	dist(t(data.frame(c(11,4), cd)))
	# Goes to blue
	dist(t(data.frame(c(14,10), ab)))
	dist(t(data.frame(c(14,10), cd)))
	# Goes to blue
	dist(t(data.frame(c(23,6), ab)))
	dist(t(data.frame(c(23,6), cd)))


	# Goes to yellow
	dist(t(data.frame(c(7,8), ab)))
	dist(t(data.frame(c(7,8), cd)))
	# Goes to blue
	dist(t(data.frame(c(12,5), ab)))
	dist(t(data.frame(c(12,5), cd)))


	# Goes to yellow
	dist(t(data.frame(c(6,7), ab)))
	dist(t(data.frame(c(6,7), cd)))
	# Goes to yellow
	dist(t(data.frame(c(11,4), ab)))
	dist(t(data.frame(c(11,4), cd)))
	# Goes to yellow
	dist(t(data.frame(c(11,5), ab)))
	dist(t(data.frame(c(11,5), cd)))
	# Goes to blue
	dist(t(data.frame(c(17,2), ab)))
	dist(t(data.frame(c(17,2), cd)))

	# Q3
	# Suppose we apply the BALANCE algorithm with bids of 0 or 1 only, to a situation where advertiser A bids on query words x and y, while advertiser B bids on query words x and z. Both have a budget of $2. Identify in the list below a sequence of four queries that will certainly be handled optimally by the algorithm.

	# Bid goes to bidder with most money reminaing. Default to A. Go through eacy query until queries are filled and both bidders have $0 left.
	# xyzx


	# Q5
	# This bipartite graph:
	# Has several perfect matchings. Find all the perfect matchings and then identify, in the list below, a pair of edges that can appear together in a perfect matching.

	#a0/b0
	#a1/b2
	#a2/b4
	#a3/b1
	#a4/b3

	#a0/b1
	#a1/b3
	#a2/b0
	#a3/b2
	#a4/b4