Created
May 4, 2016 19:37
-
-
Save primaryobjects/df43c1341730948c74bbd3f3b7b1dce2 to your computer and use it in GitHub Desktop.
Predicting insurance claim buckets with D2Hawkeye dataset and CART models.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load libraries: caTools for stratified splitting, rpart/rpart.plot for CART.
library(caTools)
library(rpart)
library(rpart.plot)

# D2Hawkeye healthcare claims: 2008 predictors, 2009 cost bucket as outcome.
claims <- read.csv('claimsdata.csv')

# Share of observations falling in each 2009 cost bucket.
table(claims$bucket2009) / nrow(claims)

# Reproducible 60/40 split, stratified on the outcome so bucket proportions
# match between the training and test sets.
set.seed(88)
spl <- sample.split(claims$bucket2009, SplitRatio = 0.6)
train <- subset(claims, spl)
test <- subset(claims, !spl)

# Quick descriptive stats: mean patient age and proportion with diabetes.
mean(claims$age)
table(claims$diabetes)[2] / nrow(claims)
# Baseline model: predict that each patient's 2009 bucket is simply the same
# as their 2008 bucket, and evaluate it on the test set.
baselineConfusion <- table(test$bucket2009, test$bucket2008)
baselineConfusion
# Expected counts (actual 2009 bucket in rows, 2008 bucket = prediction in columns):
#        1     2    3    4   5
# 1 110138  7787 3427 1452 174
# 2  16000 10721 4629 2931 559
# 3   7006  4629 2774 1621 360
# 4   2688  1943 1415 1539 352
# 5    293   191  160  309 104
# Accuracy: 0.6838135. Correct predictions sit on the diagonal; summing the
# diagonal of the table (instead of hard-coding the counts) keeps this
# computation correct under a different seed or data file.
sum(diag(baselineConfusion)) / nrow(test)
# Penalty matrix for the D2Hawkeye healthcare system (rows = actual bucket,
# columns = predicted bucket). Under-predicting cost is twice as bad as
# over-predicting: penalty = max(predicted - actual, 2 * (actual - predicted)),
# so the diagonal is 0, over-prediction costs 1 per bucket, under-prediction 2.
penaltyMatrix <- outer(1:5, 1:5,
                       function(actual, predicted)
                         pmax(predicted - actual, 2 * (actual - predicted)))
# Penalty error of the bucket2008 baseline: 0.7386055.
# Weight each cell of the confusion matrix by its penalty, then average the
# total penalty over the test observations.
weightedErrors <- as.matrix(table(test$bucket2009, test$bucket2008)) * penaltyMatrix
weightedErrors
sum(weightedErrors) / nrow(test)
# Goal: build a model that beats the baseline accuracy of 0.68 with a
# penalty error lower than 0.74.
# Alternative baseline: predict the most frequent outcome (bucket 1) for
# every test observation.
table(test$bucket2009, rep(1, length(test$bucket2009)))
# Expected counts (single predicted column, bucket 1):
#        1
# 1 122978
# 2  34840
# 3  16390
# 4   7937
# 5   1057
# Accuracy: 0.67127 -- the share of test patients whose true 2009 bucket is 1,
# computed from the data instead of a hard-coded count so it survives a
# different split or seed.
sum(test$bucket2009 == 1) / nrow(test)

# Build the full 5x5 confusion matrix: every prediction lands in column 1,
# so pad with four zero columns for the never-predicted buckets 2-5.
# A single matrix(0, ...) replaces the original four repeated
# cbind(..., rep(0, 5)) calls; 'altConfusion' also avoids shadowing stats::df.
altConfusion <- cbind(as.matrix(table(test$bucket2009, rep(1, length(test$bucket2009)))),
                      matrix(0, nrow = 5, ncol = 4))

# Penalty of this baseline: 0.5221504 -- lower than 0.7386 simply because the
# vast majority of test patients truly are in bucket 1 (penalty 0 for them).
sum(altConfusion * penaltyMatrix) / nrow(test)
# First CART model: predict the 2009 bucket from age, the chronic-condition
# flags, the 2008 bucket, and 2008 reimbursement. The very small cp lets
# rpart grow a large tree.
ClaimsTree <- rpart(
  bucket2009 ~ age + arthritis + alzheimers + cancer + copd + depression +
    diabetes + heart.failure + ihd + kidney + osteoporosis + stroke +
    bucket2008 + reimbursement2008,
  data = train, method = 'class', cp = 0.00005)
prp(ClaimsTree)

# Out-of-sample class predictions and confusion matrix (actual in rows,
# predicted in columns).
predictTest <- predict(ClaimsTree, newdata = test, type = 'class')
cartConfusion <- table(test$bucket2009, predictTest)
cartConfusion
# Expected counts:
#        1     2   3   4 5
# 1 114141  8610 124 103 0
# 2  18409 16102 187 142 0
# 3   8027  8146 118  99 0
# 4   3099  4584  53 201 0
# 5    351   657   4  45 0
# Accuracy: 0.7126669 -- the diagonal sum, computed from the table rather
# than hard-coded counts so it stays correct under a different split.
sum(diag(cartConfusion)) / nrow(test)
# Penalty error: 0.7578902
sum(as.matrix(cartConfusion) * penaltyMatrix) / nrow(test)
# Accuracy improved over the 0.6838 baseline, but the penalty error got worse.
# Second CART model: pass the penalty matrix as rpart's loss matrix so the
# tree is grown to minimize expected penalty rather than raw
# misclassification error -- trading a little accuracy for a lower penalty.
ClaimsTree2 <- rpart(
  bucket2009 ~ age + arthritis + alzheimers + cancer + copd + depression +
    diabetes + heart.failure + ihd + kidney + osteoporosis + stroke +
    bucket2008 + reimbursement2008,
  data = train, method = 'class', cp = 0.00005,
  parms = list(loss = penaltyMatrix))
predictTest2 <- predict(ClaimsTree2, newdata = test, type = 'class')
lossConfusion <- table(test$bucket2009, predictTest2)
lossConfusion
# Expected counts:
#       1     2    3   4 5
# 1 94310 25295 3087 286 0
# 2  7176 18942 8079 643 0
# 3  3590  7706 4692 401 1
# 4  1304  3193 2803 636 1
# 5   135   356  408 156 2
# Accuracy: 0.6472746 -- computed from the table, not hard-coded counts.
sum(diag(lossConfusion)) / nrow(test)
# Penalty error: 0.6418161 -- accuracy is lower than the first model's, but
# the penalty dropped below the 0.7386 baseline, which was the goal.
sum(as.matrix(lossConfusion) * penaltyMatrix) / nrow(test)
# Quiz: the first CART model (no loss matrix) predicted bucket 1 for 78.6% of
# test observations. This model predicts bucket 1 for only 0.5814074 of them
# -- fewer, because predicting a low bucket for a truly expensive patient now
# carries a large loss, pushing borderline leaves toward higher buckets.
# Computed from the predictions themselves rather than hard-coded counts.
sum(predictTest2 == 1) / nrow(test)
# Note: in both the confusion tables and the penalty matrix, actual buckets
# run down the left (rows) and predicted buckets run along the top (columns).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment