Skip to content

Instantly share code, notes, and snippets.

@primaryobjects
Created May 4, 2016 19:37
Show Gist options
  • Select an option

  • Save primaryobjects/df43c1341730948c74bbd3f3b7b1dce2 to your computer and use it in GitHub Desktop.

Select an option

Save primaryobjects/df43c1341730948c74bbd3f3b7b1dce2 to your computer and use it in GitHub Desktop.
Predicting insurance claim buckets with D2Hawkeye dataset and CART models.
# Packages: caTools for stratified splitting, rpart/rpart.plot for CART.
library(caTools)
library(rpart)
library(rpart.plot)

# Load the D2Hawkeye insurance claims data.
claims <- read.csv("claimsdata.csv")

# Distribution of observations across the 2009 cost buckets.
table(claims$bucket2009) / nrow(claims)

# 60/40 train/test split, stratified on the 2009 bucket.
set.seed(88)
spl <- sample.split(claims$bucket2009, SplitRatio = 0.6)
train <- subset(claims, spl)
test <- subset(claims, !spl)

# Quick exploration: average patient age and proportion with diabetes.
mean(claims$age)
table(claims$diabetes)[2] / nrow(claims)
# Baseline method: predict that the 2009 bucket will be the same as the
# 2008 bucket for every observation.
baselineTable <- table(test$bucket2009, test$bucket2008)
baselineTable
# Rows are the actual 2009 bucket, columns the predicted (2008) bucket:
# 1 2 3 4 5
# 1 110138 7787 3427 1452 174
# 2 16000 10721 4629 2931 559
# 3 7006 4629 2774 1621 360
# 4 2688 1943 1415 1539 352
# 5 293 191 160 309 104
# Baseline accuracy: 0.6838135.
# Computed from the diagonal of the confusion matrix rather than from
# hand-typed counts, so the figure stays correct if the data changes.
sum(diag(baselineTable)) / nrow(test)
# Penalty matrix for the D2Hawkeye system: rows are the actual bucket,
# columns the predicted bucket. Under-predicting cost (lower-left
# triangle) is penalized more heavily than over-predicting (upper-right).
penaltyMatrix <- matrix(c(0, 1, 2, 3, 4,
                          2, 0, 1, 2, 3,
                          4, 2, 0, 1, 2,
                          6, 4, 2, 0, 1,
                          8, 6, 4, 2, 0),
                        byrow = TRUE, nrow = 5)
# Penalty error of the same-as-2008 baseline: 0.7386055.
# Build the confusion matrix once instead of recomputing the table for
# the display and again for the sum.
baselineMatrix <- as.matrix(table(test$bucket2009, test$bucket2008))
baselineMatrix * penaltyMatrix
sum(baselineMatrix * penaltyMatrix) / nrow(test)
# Goal: produce a model that beats the baseline accuracy of 0.68 with a
# penalty error lower than 0.74.
# Alternative baseline: predict the most frequent outcome (bucket 1) for
# all observations.
altTable <- table(test$bucket2009, rep(1, length(test$bucket2009)))
altTable
# 1
# 1 122978
# 2 34840
# 3 16390
# 4 7937
# 5 1057
# Accuracy: 0.67127 -- the proportion of observations actually in
# bucket 1, taken from the table rather than a hand-typed count.
altTable[1] / nrow(test)
# Pad columns 2-5 with zeros (no bucket other than 1 is ever predicted)
# so the table conforms to the 5x5 penalty matrix. One cbind of a zero
# matrix replaces the original four repeated cbind calls.
df <- cbind(as.matrix(altTable), matrix(0, nrow = 5, ncol = 4))
# Penalty error of the all-bucket-1 baseline: 0.5221504.
sum(df * penaltyMatrix) / nrow(test)
# CART model on demographics, chronic conditions, and 2008 cost data.
# method = "class" for classification; the very small cp allows a large
# tree to be grown.
ClaimsTree <- rpart(bucket2009 ~ age + arthritis + alzheimers + cancer +
                      copd + depression + diabetes + heart.failure + ihd +
                      kidney + osteoporosis + stroke + bucket2008 +
                      reimbursement2008,
                    data = train, method = "class", cp = 0.00005)
prp(ClaimsTree)

predictTest <- predict(ClaimsTree, newdata = test, type = "class")
cartTable <- table(test$bucket2009, predictTest)
cartTable
# 1 2 3 4 5
# 1 114141 8610 124 103 0
# 2 18409 16102 187 142 0
# 3 8027 8146 118 99 0
# 4 3099 4584 53 201 0
# 5 351 657 4 45 0
# Accuracy: 0.7126669 -- computed from the diagonal rather than from
# hand-typed counts.
sum(diag(cartTable)) / nrow(test)
# Penalty error: 0.7578902.
sum(as.matrix(cartTable) * penaltyMatrix) / nrow(test)
# So accuracy improved over the baseline, but the penalty error got worse.
# Refit with the penalty matrix as rpart's loss matrix so the tree is
# built to minimize the penalty error, possibly at a slight cost in
# accuracy.
ClaimsTree2 <- rpart(bucket2009 ~ age + arthritis + alzheimers + cancer +
                       copd + depression + diabetes + heart.failure + ihd +
                       kidney + osteoporosis + stroke + bucket2008 +
                       reimbursement2008,
                     data = train, method = "class", cp = 0.00005,
                     parms = list(loss = penaltyMatrix))
predictTest2 <- predict(ClaimsTree2, newdata = test, type = "class")
cartTable2 <- table(test$bucket2009, predictTest2)
cartTable2
# 1 2 3 4 5
# 1 94310 25295 3087 286 0
# 2 7176 18942 8079 643 0
# 3 3590 7706 4692 401 1
# 4 1304 3193 2803 636 1
# 5 135 356 408 156 2
# Accuracy: 0.6472746.
sum(diag(cartTable2)) / nrow(test)
# Penalty error: 0.6418161 -- lower accuracy than the first tree, but
# the penalty is also reduced.
sum(as.matrix(cartTable2) * penaltyMatrix) / nrow(test)
# Question: the first CART model (no loss matrix) predicted bucket 1 for
# 78.6% of test observations. With the loss matrix the model predicts
# bucket 1 for fewer observations -- 0.5814074 -- because classifying a
# genuinely high-cost patient into bucket 1 now carries a large penalty.
# Computed as the column-1 total of the confusion matrix rather than a
# hand-typed sum of five counts.
sum(cartTable2[, 1]) / nrow(test)
# Note: in these tables the predicted bucket runs along the top and the
# actual bucket down the left side.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment