###########################################################
#
# Introduction to Text Mining
# ccampbell
# 2017-11-09
# Script to learn text mining basics
#
###########################################################
###########################################################
# text mining
###########################################################
library(readr)
library(dplyr)
library(SnowballC)
library(stringr)
library(text2vec)
library(glmnet)
###########################################################
# business question
# - which treatment types are poorly reimbursed by Medicare
###########################################################
# actions
# - advise staff to prefer equivalent treatments with higher reimbursement
# - focus cost savings on poorly reimbursed treatments
###########################################################
# analytic question
# - does billing more than 2 * the allowed amount
#   increase the proportion of the submitted charge that is paid
# - which treatment keywords have a high/low proportion funded
# Obtain data
# https://data.cms.gov/Medicare-Physician-Supplier/Medicare-Provider-Utilization-and-Payment-Data-Phy/sk9b-znav/data
# downsampled data
costs <- read_csv("http://bit.ly/MedicareManR17")
dim(costs)
# funded_prop near 0 means little of the submitted charge was reimbursed
costs <- costs %>%
    mutate(large_bill = `Average Submitted Charge Amount` > (2 * `Average Medicare Allowed Amount`),
        funded_prop = `Average Medicare Payment Amount` / `Average Submitted Charge Amount`)
# funded_prop is NaN/Inf where the submitted charge is 0,
# so restrict all summaries to finite values
costs %>%
    group_by(large_bill) %>%
    summarize(
        n = n(),
        av = mean(funded_prop[is.finite(funded_prop)]),
        med = median(funded_prop[is.finite(funded_prop)]),
        q05 = quantile(funded_prop[is.finite(funded_prop)], probs = 0.05),
        q95 = quantile(funded_prop[is.finite(funded_prop)], probs = 0.95))
hist(log(costs$funded_prop))
costs <- filter(costs, is.finite(funded_prop))
# train and test
set.seed(62877)
isTest <- as.logical(rbinom(n = nrow(costs), size = 1, prob = 0.3))
test <- filter(costs, isTest)
train <- filter(costs, !isTest)
dim(train)
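# sanity check: the realized split fraction should be near the 30% target
# (rbinom draws vary around prob, so the exact value depends on the seed)
mean(isTest)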
# Extract data
# frameworks:
# * tm - create document corpus
# * tidytext
# * text2vec - process data frames directly
# preprocess text
sample(x = train$`HCPCS Description`, size = 3)
# stemming
# reduce number of terms
wordStem(c("treats", "treatments", "treating", "tree", "treat", "treated"))
table(wordStem(c("treats", "treatments", "treating", "tree", "treat", "treated")))
# numeric() creates a numeric vector
numVec <- numeric(length = 3)
numVec
mode(numVec)
class(numVec)
# call a function on the vector
mean(numVec)
# mode gives the basic data type
mode(mean)
# class gives the special label used for method dispatch
class(mean)
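# mode and class can differ: mode reports the underlying storage type,
# while class drives dispatch
# e.g. a data frame is stored as a list but dispatches as "data.frame"
df <- data.frame(x = 1:3)
mode(df)   # "list"
class(df)  # "data.frame"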
# function() creates a function:
# assign the result to a name; the arguments listed become the
# function's arguments, and the body is the algorithm applied to them
addFun <- function(a, b) {
    a + b
}
addFun(a = 1, b = 2)
# arithmetic is vectorized, so the scalar recycles against the vector
addFun(a = 1, b = 2:6)
# preprocessing generally requires an analysis-specific preprocessor
# e.g. clean using regular expressions
?regex
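# quick illustration of the "[[:punct:]]" character class used below:
# it matches any single punctuation character
str_replace_all(string = "Initial hospital inpatient care, typically 30 minutes",
    pattern = "[[:punct:]]", replacement = " ")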
# an example preprocessor
# roxygen2 style markup
#' @title Custom text preprocessor
#' @description removes punctuation, unifies case, consolidates
#'     duration phrases, and stems words
#' @param stri character vector
#' @return character vector
#' @examples
#' prep(c("Hospital observation care typically 50 minutes",
#'     "Initial hospital inpatient care, typically 30 minutes per day"))
prep <- function(stri) {
    # replace punctuation with spaces
    stri <- str_replace_all(string = stri,
        pattern = "[[:punct:]]",
        replacement = " ")
    # unify case
    stri <- tolower(stri)
    # consolidate number/day duration phrases
    # the dataset is regularized, so these are common patterns
    stri <- str_replace_all(string = stri,
        pattern = "[0-9,\\.]+ minute(s|) per day", replacement = "daily_mins_")
    stri <- str_replace_all(string = stri,
        pattern = "[0-9,\\.]+ minute(s|)", replacement = "mins_")
    # break each element into a vector of words
    stri <- str_split(string = stri, pattern = " +")
    # stem each word, then stick the words back into sentences
    stri <- vapply(stri,
        FUN = function(x) { paste(wordStem(x), collapse = " ") },
        FUN.VALUE = character(1))
    # vapply already returns a character vector
    return(stri)
}
# check behaviour
# a unit test framework such as testthat lets you automate
# comparing function output with expected output (see sketch below)
prep(c("Hospital observation care typically 50 minutes",
    "Initial hospital inpatient care, typically 30 minutes per day"))
###########################################################
# text2vec
# create iterator
# performs tokenization (splits sentences into words)
itTrain <- itoken(train$`HCPCS Description`,
    preprocessor = prep,
    tokenizer = word_tokenizer,
    progressbar = FALSE)
# create vocabulary
vocab <- create_vocabulary(itTrain,
    stopwords = c("typically", "or", "of", "and"))
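# in recent text2vec versions the vocabulary behaves as a data frame with
# term, term_count and doc_count columns (an assumption about the installed
# version); the most frequent terms are a quick preprocessing sanity check
head(vocab[order(vocab$term_count, decreasing = TRUE), ])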
# create vectorizer
vectorizer <- vocab_vectorizer(vocab)
# create document-term matrix
dtmTrain <- create_dtm(itTrain,
    vectorizer = vectorizer)
dim(dtmTrain)
# sparse matrix
# takes up less memory than nrow * ncol
class(dtmTrain)
dtmTrain[1:6, 1:5]
# every column is a unique word
head(colnames(dtmTrain))
# this word occurs once, in a single record
dtmTrain[dtmTrain[, "fulvestr"] != 0, "fulvestr"]
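# quantify the sparsity claim above: the fraction of zero entries
# (nnzero comes from the Matrix package, a dependency of text2vec)
1 - Matrix::nnzero(dtmTrain) / prod(dim(dtmTrain))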
###########################################################
# glmnet
# fit a penalized regression model
glmnetClassifier <- glmnet(
    x = dtmTrain,
    # dependent variable
    y = train$funded_prop,
    # describes the distribution of the response
    family = "gaussian")
# glmnet object
names(glmnetClassifier)
# glmnet fits the model for a vector of penalization parameters;
# increasing the penalty reduces the number of parameters included
# use cv.glmnet to find the optimal penalty
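# a sketch of penalty selection by cross-validation: cv.glmnet fits the
# model over a lambda sequence and reports out-of-fold error, so the
# penalty is chosen from the data rather than by hand
cvClassifier <- cv.glmnet(x = dtmTrain,
    y = train$funded_prop,
    family = "gaussian")
# penalty minimizing cross-validated error
cvClassifier$lambda.min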
glmnetClassifier$a0
# glmnet parameter values depend on the
# penalization parameter
plot(glmnetClassifier)
# model coefficients for a specified penalization
coefS4 <- coef(glmnetClassifier, s = 4)
# which parameters are non-zero
coefS4[coefS4[, 1] != 0, , drop = FALSE]
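# ranking the non-zero coefficients links back to the analytic question:
# strongly positive terms are associated with a higher funded proportion,
# strongly negative terms with a lower one
nonZero <- coefS4[coefS4[, 1] != 0, , drop = FALSE]
nonZero[order(nonZero[, 1]), , drop = FALSE]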
# self predictions
# predict() returns a one-column matrix, so coerce to a numeric vector
train <- mutate(train,
    glmnetPreds = as.numeric(predict(glmnetClassifier, newx = dtmTrain, s = 4)))
# assess fit
# Dependent variable ~ Predictions
#
# data points should be closely
# and uniformly distributed around the line of unity
plot(funded_prop ~ glmnetPreds,
    data = train)
abline(coef = c(0, 1), lty = 2)
# Residuals ~ Predictions
#
# the residuals should be scattered uniformly around 0
plot(funded_prop - glmnetPreds ~ glmnetPreds,
    data = train)
abline(coef = c(0, 0), lty = 2)
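# condense fit quality to a single number: root mean squared error of the
# self predictions (optimistic, because these are the training data)
sqrt(mean((train$funded_prop - train$glmnetPreds)^2))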
# not ideal, but still maybe useful
# next steps,
# * experiment with changing preprocessing
# * experiment with tuning parameters
#   (e.g. force certain parameters to be included)
# once happy with the model, create the test DTM as above,
# then predict using the final model (see sketch below)
# do not peek at the test set until model development is complete!
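# a sketch of that final step: build the test DTM with the *same* prep and
# vectorizer fitted on the training data so the columns line up, then
# predict (s = 4 reuses the penalty explored above)
itTest <- itoken(test$`HCPCS Description`,
    preprocessor = prep,
    tokenizer = word_tokenizer,
    progressbar = FALSE)
dtmTest <- create_dtm(itTest, vectorizer = vectorizer)
test <- mutate(test,
    glmnetPreds = as.numeric(predict(glmnetClassifier, newx = dtmTest, s = 4)))
# held-out RMSE for the final model
sqrt(mean((test$funded_prop - test$glmnetPreds)^2))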