###########################################################
#
# Introduction to Text Mining
# ccampbell
# 2017-11-09
# Script to learn text mining basics
#
###########################################################
###########################################################
# text mining
###########################################################
library(readr)
library(dplyr)
library(SnowballC)
library(stringr)
library(text2vec)
library(glmnet)
###########################################################
# business question
# - which treatment types are poorly reimbursed by Medicare
###########################################################
# actions
# - advise staff to prefer equivalent treatments with higher reimbursement
# - focus cost savings on poorly reimbursed treatments
###########################################################
# analytic question
# - does billing more than 2 * the allowed amount
#   increase the proportion of the submitted charge that is paid
# - which treatment keywords have a high/low proportion funded
# Obtain data
# https://data.cms.gov/Medicare-Physician-Supplier/Medicare-Provider-Utilization-and-Payment-Data-Phy/sk9b-znav/data
# downsampled data
costs <- read_csv("http://bit.ly/MedicareManR17")
dim(costs)
# funded_prop near 0 means little of the submitted charge was reimbursed
costs <- costs %>%
    mutate(large_bill = `Average Submitted Charge Amount` > (2 * `Average Medicare Allowed Amount`),
        funded_prop = `Average Medicare Payment Amount` / `Average Submitted Charge Amount`)
# funded_prop is NaN/Inf where the submitted charge is 0,
# so restrict all summaries to finite values
costs %>%
    group_by(large_bill) %>%
    summarize(
        n = n(),
        av = mean(funded_prop[is.finite(funded_prop)]),
        med = median(funded_prop[is.finite(funded_prop)]),
        q05 = quantile(funded_prop[is.finite(funded_prop)], probs = 0.05),
        q95 = quantile(funded_prop[is.finite(funded_prop)], probs = 0.95))
hist(log(costs$funded_prop))
costs <- filter(costs, is.finite(funded_prop))
# train and test
set.seed(62877)
isTest <- as.logical(rbinom(n = nrow(costs), size = 1, prob = 0.3))
test <- filter(costs, isTest)
train <- filter(costs, !isTest)
dim(train)
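# sanity check: the realized split fraction should be near the 30% target
# (rbinom draws vary around prob, so the exact value depends on the seed)
mean(isTest)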
# Extract data
# frameworks:
# * tm - create document corpus
# * tidytext
# * text2vec - process data frames directly
# preprocess text
sample(x = train$`HCPCS Description`, size = 3)
# stemming
# reduce number of terms
wordStem(c("treats", "treatments", "treating", "tree", "treat", "treated"))
table(wordStem(c("treats", "treatments", "treating", "tree", "treat", "treated")))
# numeric() creates a numeric vector
numVec <- numeric(length = 3)
numVec
mode(numVec)
class(numVec)
# call a function on the vector
mean(numVec)
# mode gives the basic data type
mode(mean)
# class gives the special label used for method dispatch
class(mean)
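# mode and class can differ: mode reports the underlying storage type,
# while class drives dispatch
# e.g. a data frame is stored as a list but dispatches as "data.frame"
df <- data.frame(x = 1:3)
mode(df)   # "list"
class(df)  # "data.frame"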
# function() creates a function:
# assign the result to a name; the arguments listed become the
# function's arguments, and the body is the algorithm applied to them
addFun <- function(a, b) {
    a + b
}
addFun(a = 1, b = 2)
# arithmetic is vectorized, so the scalar recycles against the vector
addFun(a = 1, b = 2:6)
# preprocessing generally requires an analysis-specific preprocessor
# e.g. clean using regular expressions
?regex
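# quick illustration of the "[[:punct:]]" character class used below:
# it matches any single punctuation character
str_replace_all(string = "Initial hospital inpatient care, typically 30 minutes",
    pattern = "[[:punct:]]", replacement = " ")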
# an example preprocessor
# roxygen2 style markup
#' @title Custom text preprocessor
#' @description removes punctuation, unifies case, consolidates
#'     duration phrases, and stems words
#' @param stri character vector
#' @return character vector
#' @examples
#' prep(c("Hospital observation care typically 50 minutes",
#'     "Initial hospital inpatient care, typically 30 minutes per day"))
prep <- function(stri) {
    # replace punctuation with spaces
    stri <- str_replace_all(string = stri,
        pattern = "[[:punct:]]",
        replacement = " ")
    # unify case
    stri <- tolower(stri)
    # consolidate number/day duration phrases
    # the dataset is regularized, so these are common patterns
    stri <- str_replace_all(string = stri,
        pattern = "[0-9,\\.]+ minute(s|) per day", replacement = "daily_mins_")
    stri <- str_replace_all(string = stri,
        pattern = "[0-9,\\.]+ minute(s|)", replacement = "mins_")
    # break each element into a vector of words
    stri <- str_split(string = stri, pattern = " +")
    # stem each word, then stick the words back into sentences
    stri <- vapply(stri,
        FUN = function(x) { paste(wordStem(x), collapse = " ") },
        FUN.VALUE = character(1))
    # vapply already returns a character vector
    return(stri)
}
# check behaviour
# a unit test framework such as testthat lets you automate
# comparing function output with expected output (see sketch below)
prep(c("Hospital observation care typically 50 minutes",
    "Initial hospital inpatient care, typically 30 minutes per day"))
###########################################################
# text2vec
# create iterator
# performs tokenization (splits sentences into words)
itTrain <- itoken(train$`HCPCS Description`,
    preprocessor = prep,
    tokenizer = word_tokenizer,
    progressbar = FALSE)
# create vocabulary
vocab <- create_vocabulary(itTrain,
    stopwords = c("typically", "or", "of", "and"))
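# in recent text2vec versions the vocabulary behaves as a data frame with
# term, term_count and doc_count columns (an assumption about the installed
# version); the most frequent terms are a quick preprocessing sanity check
head(vocab[order(vocab$term_count, decreasing = TRUE), ])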
# create vectorizer
vectorizer <- vocab_vectorizer(vocab)
# create document-term matrix
dtmTrain <- create_dtm(itTrain,
    vectorizer = vectorizer)
dim(dtmTrain)
# sparse matrix
# takes up less memory than nrow * ncol
class(dtmTrain)
dtmTrain[1:6, 1:5]
# every column is a unique word
head(colnames(dtmTrain))
# this word occurs once, in a single record
dtmTrain[dtmTrain[, "fulvestr"] != 0, "fulvestr"]
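# quantify the sparsity claim above: the fraction of zero entries
# (nnzero comes from the Matrix package, a dependency of text2vec)
1 - Matrix::nnzero(dtmTrain) / prod(dim(dtmTrain))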
###########################################################
# glmnet
# fit a penalized regression model
glmnetClassifier <- glmnet(
    x = dtmTrain,
    # dependent variable
    y = train$funded_prop,
    # describes the distribution of the response
    family = "gaussian")
# glmnet object
names(glmnetClassifier)
# glmnet fits the model for a vector of penalization parameters;
# increasing the penalty reduces the number of parameters included
# use cv.glmnet to find the optimal penalty
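# a sketch of penalty selection by cross-validation: cv.glmnet fits the
# model over a lambda sequence and reports out-of-fold error, so the
# penalty is chosen from the data rather than by hand
cvClassifier <- cv.glmnet(x = dtmTrain,
    y = train$funded_prop,
    family = "gaussian")
# penalty minimizing cross-validated error
cvClassifier$lambda.min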
glmnetClassifier$a0
# glmnet parameter values depend on the
# penalization parameter
plot(glmnetClassifier)
# model coefficients for a specified penalization
coefS4 <- coef(glmnetClassifier, s = 4)
# which parameters are non-zero
coefS4[coefS4[, 1] != 0, , drop = FALSE]
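# ranking the non-zero coefficients links back to the analytic question:
# strongly positive terms are associated with a higher funded proportion,
# strongly negative terms with a lower one
nonZero <- coefS4[coefS4[, 1] != 0, , drop = FALSE]
nonZero[order(nonZero[, 1]), , drop = FALSE]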
# self predictions
# predict() returns a one-column matrix, so coerce to a numeric vector
train <- mutate(train,
    glmnetPreds = as.numeric(predict(glmnetClassifier, newx = dtmTrain, s = 4)))
# assess fit
# Dependent variable ~ Predictions
#
# data points should be closely
# and uniformly distributed around the line of unity
plot(funded_prop ~ glmnetPreds,
    data = train)
abline(coef = c(0, 1), lty = 2)
# Residuals ~ Predictions
#
# the residuals should be scattered uniformly around 0
plot(funded_prop - glmnetPreds ~ glmnetPreds,
    data = train)
abline(coef = c(0, 0), lty = 2)
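# condense fit quality to a single number: root mean squared error of the
# self predictions (optimistic, because these are the training data)
sqrt(mean((train$funded_prop - train$glmnetPreds)^2))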
# not ideal, but still maybe useful
# next steps,
# * experiment with changing preprocessing
# * experiment with tuning parameters
#   (e.g. force certain parameters to be included)
# once happy with the model, create the test DTM as above,
# then predict using the final model (see sketch below)
# do not peek at the test set until model development is complete!
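# a sketch of that final step: build the test DTM with the *same* prep and
# vectorizer fitted on the training data so the columns line up, then
# predict (s = 4 reuses the penalty explored above)
itTest <- itoken(test$`HCPCS Description`,
    preprocessor = prep,
    tokenizer = word_tokenizer,
    progressbar = FALSE)
dtmTest <- create_dtm(itTest, vectorizer = vectorizer)
test <- mutate(test,
    glmnetPreds = as.numeric(predict(glmnetClassifier, newx = dtmTest, s = 4)))
# held-out RMSE for the final model
sqrt(mean((test$funded_prop - test$glmnetPreds)^2))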