Skip to content

Instantly share code, notes, and snippets.

@primaryobjects
Created May 4, 2016 19:37
Show Gist options
  • Select an option

  • Save primaryobjects/df43c1341730948c74bbd3f3b7b1dce2 to your computer and use it in GitHub Desktop.

Select an option

Save primaryobjects/df43c1341730948c74bbd3f3b7b1dce2 to your computer and use it in GitHub Desktop.
Predicting insurance claim buckets with D2Hawkeye dataset and CART models.
# Packages: caTools for stratified splitting, rpart/rpart.plot for CART.
library(caTools)
library(rpart)
library(rpart.plot)

# Load the D2Hawkeye insurance claims data.
claims <- read.csv("claimsdata.csv")

# Distribution of observations across the 2009 cost buckets.
table(claims$bucket2009) / nrow(claims)

# 60/40 train/test split, stratified on the 2009 bucket.
set.seed(88)
spl <- sample.split(claims$bucket2009, SplitRatio = 0.6)
train <- subset(claims, spl)
test <- subset(claims, !spl)

# Quick exploration: average patient age and proportion with diabetes.
mean(claims$age)
table(claims$diabetes)[2] / nrow(claims)
# Baseline method: predict that the 2009 bucket will be the same as the
# 2008 bucket for every observation.
baselineTable <- table(test$bucket2009, test$bucket2008)
baselineTable
# Rows are the actual 2009 bucket, columns the predicted (2008) bucket:
# 1 2 3 4 5
# 1 110138 7787 3427 1452 174
# 2 16000 10721 4629 2931 559
# 3 7006 4629 2774 1621 360
# 4 2688 1943 1415 1539 352
# 5 293 191 160 309 104
# Baseline accuracy: 0.6838135.
# Computed from the diagonal of the confusion matrix rather than from
# hand-typed counts, so the figure stays correct if the data changes.
sum(diag(baselineTable)) / nrow(test)
# Penalty matrix for the D2Hawkeye system: rows are the actual bucket,
# columns the predicted bucket. Under-predicting cost (lower-left
# triangle) is penalized more heavily than over-predicting (upper-right).
penaltyMatrix <- matrix(c(0, 1, 2, 3, 4,
                          2, 0, 1, 2, 3,
                          4, 2, 0, 1, 2,
                          6, 4, 2, 0, 1,
                          8, 6, 4, 2, 0),
                        byrow = TRUE, nrow = 5)
# Penalty error of the same-as-2008 baseline: 0.7386055.
# Build the confusion matrix once instead of recomputing the table for
# the display and again for the sum.
baselineMatrix <- as.matrix(table(test$bucket2009, test$bucket2008))
baselineMatrix * penaltyMatrix
sum(baselineMatrix * penaltyMatrix) / nrow(test)
# Goal: produce a model that beats the baseline accuracy of 0.68 with a
# penalty error lower than 0.74.
# Alternative baseline: predict the most frequent outcome (bucket 1) for
# all observations.
altTable <- table(test$bucket2009, rep(1, length(test$bucket2009)))
altTable
# 1
# 1 122978
# 2 34840
# 3 16390
# 4 7937
# 5 1057
# Accuracy: 0.67127 -- the proportion of observations actually in
# bucket 1, taken from the table rather than a hand-typed count.
altTable[1] / nrow(test)
# Pad columns 2-5 with zeros (no bucket other than 1 is ever predicted)
# so the table conforms to the 5x5 penalty matrix. One cbind of a zero
# matrix replaces the original four repeated cbind calls.
df <- cbind(as.matrix(altTable), matrix(0, nrow = 5, ncol = 4))
# Penalty error of the all-bucket-1 baseline: 0.5221504.
sum(df * penaltyMatrix) / nrow(test)
# CART model on demographics, chronic conditions, and 2008 cost data.
# method = "class" for classification; the very small cp allows a large
# tree to be grown.
ClaimsTree <- rpart(bucket2009 ~ age + arthritis + alzheimers + cancer +
                      copd + depression + diabetes + heart.failure + ihd +
                      kidney + osteoporosis + stroke + bucket2008 +
                      reimbursement2008,
                    data = train, method = "class", cp = 0.00005)
prp(ClaimsTree)

predictTest <- predict(ClaimsTree, newdata = test, type = "class")
cartTable <- table(test$bucket2009, predictTest)
cartTable
# 1 2 3 4 5
# 1 114141 8610 124 103 0
# 2 18409 16102 187 142 0
# 3 8027 8146 118 99 0
# 4 3099 4584 53 201 0
# 5 351 657 4 45 0
# Accuracy: 0.7126669 -- computed from the diagonal rather than from
# hand-typed counts.
sum(diag(cartTable)) / nrow(test)
# Penalty error: 0.7578902.
sum(as.matrix(cartTable) * penaltyMatrix) / nrow(test)
# So accuracy improved over the baseline, but the penalty error got worse.
# Refit with the penalty matrix as rpart's loss matrix so the tree is
# built to minimize the penalty error, possibly at a slight cost in
# accuracy.
ClaimsTree2 <- rpart(bucket2009 ~ age + arthritis + alzheimers + cancer +
                       copd + depression + diabetes + heart.failure + ihd +
                       kidney + osteoporosis + stroke + bucket2008 +
                       reimbursement2008,
                     data = train, method = "class", cp = 0.00005,
                     parms = list(loss = penaltyMatrix))
predictTest2 <- predict(ClaimsTree2, newdata = test, type = "class")
cartTable2 <- table(test$bucket2009, predictTest2)
cartTable2
# 1 2 3 4 5
# 1 94310 25295 3087 286 0
# 2 7176 18942 8079 643 0
# 3 3590 7706 4692 401 1
# 4 1304 3193 2803 636 1
# 5 135 356 408 156 2
# Accuracy: 0.6472746.
sum(diag(cartTable2)) / nrow(test)
# Penalty error: 0.6418161 -- lower accuracy than the first tree, but
# the penalty is also reduced.
sum(as.matrix(cartTable2) * penaltyMatrix) / nrow(test)
# Question: the first CART model (no loss matrix) predicted bucket 1 for
# 78.6% of test observations. With the loss matrix the model predicts
# bucket 1 for fewer observations -- 0.5814074 -- because classifying a
# genuinely high-cost patient into bucket 1 now carries a large penalty.
# Computed as the column-1 total of the confusion matrix rather than a
# hand-typed sum of five counts.
sum(cartTable2[, 1]) / nrow(test)
# Note: in these tables the predicted bucket runs along the top and the
# actual bucket down the left side.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment