###########################################################
#
# Introduction to Text Mining
# ccampbell
# 2017-11-09
# Script to learn text mining basics
#
###########################################################
###########################################################
# text mining
###########################################################
library(readr)
library(dplyr)
library(SnowballC)
library(stringr)
library(text2vec)
library(glmnet)
###########################################################
# business question
# - which treatment types are poorly reimbursed by Medicare
###########################################################
# actions
# - advice for staff to prefer certain equivalent treatments with higher reimbursement
# - focus cost savings on poorly reimbursed treatments
###########################################################
# analytic question
# - does billing more than 2 * allowed amount
# increase the proportion of allowed amount paid
# - which treatment keywords have a high/low proportion funded
# Obtain data
# https://data.cms.gov/Medicare-Physician-Supplier/Medicare-Provider-Utilization-and-Payment-Data-Phy/sk9b-znav/data
# downsampled data
costs <- read_csv("http://bit.ly/MedicareManR17")
dim(costs)
# small values of funded_prop mean little of the submitted charge was reimbursed
costs <- costs %>%
    mutate(large_bill = `Average Submitted Charge Amount` > (2 * `Average Medicare Allowed Amount`),
        funded_prop = `Average Medicare Payment Amount` / `Average Submitted Charge Amount`)
costs %>%
    group_by(large_bill) %>%
    summarize(
        n = n(),
        av = mean(funded_prop[is.finite(funded_prop)]),
        med = median(funded_prop[is.finite(funded_prop)]),
        q05 = quantile(funded_prop[is.finite(funded_prop)], probs = 0.05),
        q95 = quantile(funded_prop[is.finite(funded_prop)], probs = 0.95))
hist(log(costs$funded_prop))
costs <- filter(costs, is.finite(funded_prop))
# train and test
set.seed(62877)
isTest <- as.logical(rbinom(n = nrow(costs), size = 1, prob = 0.3))
test <- filter(costs, isTest)
train <- filter(costs, !isTest)
dim(train)
# Extract data
# frameworks:
# * tm - create document corpus
# * tidytext
# * text2vec - process data frames directly
# preprocess text
sample(x = train$`HCPCS Description`, size = 3)
# stemming
# reduce number of terms
wordStem(c("treats", "treatments", "treating", "tree", "treat", "treated"))
table(wordStem(c("treats", "treatments", "treating", "tree", "treat", "treated")))
# numeric function creates numeric vector
numVec <- numeric(length = 3)
numVec
mode(numVec)
class(numVec)
# call function on vec
mean(numVec)
# basic data type
mode(mean)
# special label
class(mean)
# the function function creates a function
# assign it to a name; its arguments are the arguments the new function will take
# the following expression is the algorithm applied to those arguments
addFun <- function(a, b) {
    a + b
}
addFun(a = 1, b = 2)
addFun(a = 1, b = 2:6)
# preprocessing generally requires an analysis-specific preprocessor
# e.g. clean using regular expressions
?regex
# an example preprocessor
# roxygen2 style markup
#' @title Custom text preprocessor
#' @description removes punctuation
#' @param stri character vector
#' @return character vector
#' @examples
#' prep(c("Hospital observation care typically 50 minutes",
#' "Initial hospital inpatient care, typically 30 minutes per day"))
prep <- function(stri) {
    # replace punctuation with spaces
    stri <- str_replace_all(string = stri,
        pattern = "[[:punct:]]",
        replacement = " ")
    # unify case
    stri <- tolower(stri)
    # consolidate number/days
    # dataset is regularized, so these are common patterns
    stri <- str_replace_all(string = stri,
        pattern = "[0-9,\\.]+ minute(s|) per day", replacement = "daily_mins_")
    stri <- str_replace_all(string = stri,
        pattern = "[0-9,\\.]+ minute(s|)", replacement = "mins_")
    # break each element into a vector of words
    stri <- str_split(string = stri, pattern = " +")
    # stem each word, then stick back into sentences
    stri <- vapply(stri,
        FUN = function(x) { paste(wordStem(x), collapse = " ") },
        FUN.VALUE = character(1))
    return(unlist(stri))
}
# check behaviour
# a unit test framework, e.g. testthat, lets you automate comparing
# function output with expected output (see the sketch below)
prep(c("Hospital observation care typically 50 minutes",
    "Initial hospital inpatient care, typically 30 minutes per day"))
###########################################################
# text2vec
# create iterator
# performs tokenization (splits sentences into words)
itTrain <- itoken(train$`HCPCS Description`,
    preprocessor = prep,
    tokenizer = word_tokenizer,
    progressbar = FALSE)
# create vocabulary
vocab <- create_vocabulary(itTrain,
    stopwords = c("typically", "or", "of", "and"))
# create vectorizer
vectorizer <- vocab_vectorizer(vocab)
# create document text matrix
dtmTrain <- create_dtm(itTrain,
    vectorizer = vectorizer)
dim(dtmTrain)
# sparse matrix
# takes up much less memory than a dense nrow * ncol matrix
class(dtmTrain)
dtmTrain[1:6, 1:5]
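# a quick illustration of the memory saving (assuming the dense copy
# fits in memory for this downsampled data)
object.size(dtmTrain)
object.size(as.matrix(dtmTrain))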
# every column is unique word
head(colnames(dtmTrain))
# word occurs once in one record
dtmTrain[dtmTrain[, "fulvestr"] != 0, "fulvestr"]
###########################################################
# glmnet
# create classifier
glmnetClassifier <- glmnet(
    x = dtmTrain,
    # dependent variable
    y = train$funded_prop,
    # describes distribution of response
    family = "gaussian")
# glmnet object
names(glmnetClassifier)
# glmnet fits the model over a vector of penalization parameters
# increasing the penalty reduces the number of parameters included
# use cv.glmnet to find the optimal penalty (see the sketch below)
glmnetClassifier$a0
# glmnet models have parameter values dependent on the
# penalization parameter
plot(glmnetClassifier)
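# as noted above, cv.glmnet chooses the penalty by cross-validation
# rather than fixing s by hand (a sketch using the glmnet defaults)
cvClassifier <- cv.glmnet(x = dtmTrain, y = train$funded_prop,
    family = "gaussian")
plot(cvClassifier)
# penalty minimising the cross-validated error
cvClassifier$lambda.min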
# model coefficients for specified penalization
coef3 <- coef(glmnetClassifier, s = 4)
# which parameters are non-zero
coef3[coef3[, 1] != 0, , drop = FALSE]
# self predictions
train <- mutate(train,
    glmnetPreds = predict(glmnetClassifier, newx = dtmTrain, s = 4))
# assess fit
# Dependent variable ~ Predictions
#
# data points should be closely and uniformly
# distributed around the line of unity
plot(funded_prop ~ glmnetPreds,
    data = train)
abline(coef = c(0, 1), lty = 2)
# Residuals ~ Predictions
#
# The residuals should be scattered uniformly around 0.
plot(funded_prop - glmnetPreds ~ glmnetPreds,
    data = train)
abline(coef = c(0, 0), lty = 2)
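# one simple numeric summary of the self-fit (an illustrative check,
# not part of the original workflow)
with(train, sqrt(mean((funded_prop - glmnetPreds)^2)))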
# not ideal, but still potentially useful
# next steps:
# * experiment with changing the preprocessing
# * experiment with tuning parameters
#   (e.g. force certain parameters to be included)
# once happy with the model, create the test DTM as above,
# then predict using the final model (a sketch follows below)
# do not peek at the test set until model development is complete!
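# a sketch of that final step: build the test DTM with the same
# preprocessor and vectorizer, then predict (s = 4 is carried over from
# above; in practice choose the penalty with cv.glmnet)
itTest <- itoken(test$`HCPCS Description`,
    preprocessor = prep,
    tokenizer = word_tokenizer,
    progressbar = FALSE)
dtmTest <- create_dtm(itTest, vectorizer = vectorizer)
test <- mutate(test,
    glmnetPreds = predict(glmnetClassifier, newx = dtmTest, s = 4))
plot(funded_prop ~ glmnetPreds, data = test)
abline(coef = c(0, 1), lty = 2)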