###############################################################################
## SDP Fall Workshop Predictive Analytics
## Advanced / Additional Code Snippets for Working with PA Data and Models
## Author: Jared E. Knowles
## Date: 09/14/2018
## You do not need to use all or even any of this code. The code does not need to
## be run together. This is just a survey of some additional techniques/tricks you
## can do in R to make explaining predictive models and complex data easier.
## As always - your needs and approaches may differ.
################################################################################
# Load the data
load("data/montucky.rda")

########################################################################
# Build and plot a regression tree
########################################################################
library(rpart.plot)
library(rpart)

# Basic partition tree model to plot
binary.model <- rpart(ontime_grad ~ scale_score_7_math +
                        pct_days_absent_7 + ell_7 + iep_7 + frpl_7 + male +
                        race_ethnicity +
                        sch_g7_lep_per + sch_g7_gifted_per,
                      data = sea_data,
                      control = rpart.control(cp = .005))

# Pass the model object to the plot function
# You can look at ?rpart.plot to identify customizations you wish to use
rpart.plot(binary.model)

# To improve the labels, you can recode the values into shorter labels
sea_data$short_race_label <- NA
sea_data$short_race_label[sea_data$race_ethnicity == "Hispan..."] <- "H"
sea_data$short_race_label[sea_data$race_ethnicity == "White"] <- "W"
sea_data$short_race_label[sea_data$race_ethnicity == "Asian"] <- "A"
sea_data$short_race_label[sea_data$race_ethnicity == "Americ..."] <- "AI"
sea_data$short_race_label[sea_data$race_ethnicity == "Demogg..."] <- "M"
sea_data$short_race_label[sea_data$race_ethnicity == "Native..."] <- "HA"
sea_data$short_race_label[sea_data$race_ethnicity == "Black ..."] <- "B"

# Fit a classification model, using the same formula interface as a
# logistic regression
binary.model <- rpart(ontime_grad ~ scale_score_7_math +
                        pct_days_absent_7 + ell_7 + iep_7 + frpl_7 + male +
                        short_race_label +
                        sch_g7_lep_per + sch_g7_gifted_per,
                      data = sea_data,
                      control = rpart.control(cp = .005))

# Make a plot showing the tree
rpart.plot(binary.model)
########################################################################
## Variable importance from a caret/train model object
########################################################################
library(caret)

## Fit a simple model just to show the syntax. You should be fitting a model
## split on train and test data - substitute your own model here.
# Unfortunately, we have to remove NAs first, because
# caret won't do it for us
train_data <- na.omit(sea_data[, c("ontime_grad", "scale_score_7_math",
                                   "pct_days_absent_7", "ell_7", "iep_7",
                                   "male", "frpl_7", "race_ethnicity",
                                   "sch_g7_lep_per", "sch_g7_gifted_per")])

# Outcome must be a factor in R to work with caret, so we recode.
# Avoid 0/1 factor labels - these cause problems when fitting many model types.
train_data$ontime_grad <- ifelse(train_data$ontime_grad == 1, "Grad", "Nongrad")
train_data$ontime_grad <- factor(train_data$ontime_grad)

# Fit the model
caret_mod <- train(ontime_grad ~ .,
                   method = "rpart",
                   data = train_data,
                   metric = "AUC",
                   trControl = trainControl(summaryFunction = prSummary,
                                            classProbs = TRUE,
                                            method = "cv"))

# Get variable importance; not available for all methods
caret::varImp(caret_mod)

# Plot variable importance
plot(caret::varImp(caret_mod))
########################################################################
## Probability accuracy
## Plot the relationship between predicted probabilities and observed
## graduation rates to explore thresholds.
########################################################################
library(yardstick)
library(dplyr)

# Get the predicted classifications from the caret model above
class_vec <- predict(caret_mod, newdata = train_data)
# Get the predicted probabilities from the model above
## Note that caret insists on giving probabilities for both classes, we
## need to store only one, in this case, the first one
prob_vec <- predict(caret_mod, newdata = train_data, type = "prob")[[1]] # only need first column

# Combine the true values, the estimated class, and the class probability
# into one dataframe for plotting
estimates_tbl <- data.frame(
  truth = as.factor(train_data$ontime_grad),
  estimate = as.factor(class_vec),
  class_prob = prob_vec
)

# Using this structure we can use the `yardstick` package to nicely
# compute a variety of performance metrics
## Confusion matrix
estimates_tbl %>% yardstick::conf_mat(truth, estimate)
# Accuracy
estimates_tbl %>% yardstick::metrics(truth, estimate)
# AUC
estimates_tbl %>% yardstick::roc_auc(truth, class_prob)

# ROC graph
# Plots the ROC curve (the ROC at all possible threshold values for the
# probability cutoff)
library(pROC)
roc(estimates_tbl$truth, estimates_tbl$class_prob) %>% plot

# Save the confusion matrix as an R object
conf_mat <- estimates_tbl %>% yardstick::conf_mat(truth, estimate)

# Plot the confusion matrix if you like
library(vcd)
labs <- round(prop.table(conf_mat$table), 2)
# Can change the margin to change the labels
mosaic(conf_mat$table, pop = FALSE)
labeling_cells(text = labs, margin = 0)(conf_mat$table)
################################################################################
## Probability vs. outcome plot for predicted probabilities
################################################################################
plotdf <- estimates_tbl %>%
  mutate(prob_cut = percent_rank(class_prob)) %>%
  # depending on how many unique probabilities your model produces you can try
  # mutate(prob_cut = ntile(class_prob, 50)) %>%
  group_by(prob_cut) %>%
  summarize(
    avg_prob = mean(class_prob),
    prob_grad = sum(truth == "Grad") / length(truth),
    count = n())

library(ggplot2)
library(ggalt) # for fun lollipop charts

ggplot(plotdf, aes(x = avg_prob, y = prob_grad)) +
  ggalt::geom_lollipop() + theme_classic() +
  coord_cartesian(xlim = c(0, 1), ylim = c(0, 1), expand = FALSE) +
  geom_smooth(se = FALSE)

################################################################################
## Advanced: Make a Bowers plot
################################################################################
# Get the data for comparable points
url <- "https://raw.githubusercontent.com/jknowles/DEWSatDoGoodData2016/master/data/BowersEWSReviewData.csv"
ews <- read.csv(url)

# Clean up the coding of the data with better labels
# This is optional, but it highlights some reference
# models
ews$flag <- "Other EWI"
ews$flag[ews$id == 1 | ews$id == 2] <- "Chicago On-Track"
ews$flag[ews$id > 3 & ews$id < 14] <- "Balfanz ABC"
ews$flag[ews$id == 85] <- "Muthén Math GMM"
ews$flag[ews$id == 19] <- "Bowers GPA GMM"
ews$flag <- factor(ews$flag)
ews$flag <- relevel(ews$flag, ref = "Other EWI")

mycol <- c("Other EWI" = "gray70", "Chicago On-Track" = "blue",
           "Balfanz ABC" = "purple",
           "Muthén Math GMM" = "orange",
           "Bowers GPA GMM" = "dark red")
# The big block of plotting code: you only need to modify the last
# couple of lines; the rest adds annotations and labels which you
# can switch on and off if you like
ggplot(ews) + aes(x = 1 - specificity, y = sensitivity,
                  shape = flag, size = I(4), color = flag) +
  geom_point() +
  # Label the shape legend
  scale_shape("EWI Type") +
  # Label the color legend and customize it
  scale_color_manual("EWI Type", values = mycol) +
  # Add a reference 45 degree line representing random chance
  geom_abline(intercept = 0, slope = 1, linetype = 2) +
  # Set the scales of the chart to not distort distances
  coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) +
  # Add a simple theme to avoid clutter
  theme_bw() +
  # Label the axes
  labs(x = "False Alarm Proportion", y = "True Positive Proportion",
       title = "ROC Accuracy of Early Warning Indicators") +
  # Place the legend in the bottom right
  theme(legend.position = c(0.8, 0.2),
        legend.background = element_rect(fill = NULL,
                                         color = "black")) +
  # Add arrows to annotate the plot
  annotate(geom = "segment", x = 0.55, y = 0.625,
           yend = 0.785, xend = 0.4,
           arrow = arrow(length = unit(0.5, "cm"))) +
  # Label the arrow
  annotate(geom = "text", x = .365, y = .81, label = "Better Prediction") +
  annotate(geom = "segment", x = 0.65, y = 0.625, yend = 0.5,
           xend = 0.75, arrow = arrow(length = unit(0.5, "cm"))) +
  annotate(geom = "text", x = .75, y = .48, label = "Worse Prediction") +
  annotate(geom = "text", x = .75, y = .78, angle = 37, label = "Random Guess") +
  # Add your annotation here:
  annotate(geom = "point",
           x = 1 - estimates_tbl %>% yardstick::spec(truth, estimate),
           y = estimates_tbl %>% yardstick::sens(truth, estimate),
           size = 5, color = "red")

# Use a custom probability threshold
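# A minimal sketch (illustrative only): classify at your own cutoff instead of
# the default 0.5 used by predict(). The 0.30 value below is a placeholder -
# choose a threshold with stakeholders based on the tolerance for false alarms
# versus missed students.
custom_thresh <- 0.30
estimates_tbl$estimate_custom <- factor(
  ifelse(estimates_tbl$class_prob >= custom_thresh, "Grad", "Nongrad"),
  levels = levels(estimates_tbl$truth))
# Re-compute the confusion matrix at the new threshold and compare it to the
# default-threshold matrix above
estimates_tbl %>% yardstick::conf_mat(truth, estimate_custom)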
---
title: "Predictive Analytics Tutorial"
author: "OpenSDP"
date: "September 9, 2017"
output:
  word_document:
    reference_docx: reference.docx
    toc: true
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, error = FALSE,
                      results = "hide", fig.show = "hide")
```

## Introduction

This file provides guidance and R syntax examples for the hands-on
predictive analytics session during the Fall 2018 Cohort 9 Strategic Data
Project Workshop in Denver.

During the workshop, we'll ask you to work with stakeholders to develop a dropout
early warning system for 7th grade students for a simulated state education
agency (the state of Montucky) using student data. This guide will provide you
with a baseline model to get you started and introduce you to the main concepts
of predictive analytics. From there, you will work together with stakeholders to
incorporate their values into the model, secure their support and adoption of
the model, and practice an inclusive and iterative design process to building
analytics.

As a data analyst, your role in this process is to measure the accuracy of the
model and communicate the tradeoffs of different models in terms of accuracy
and usability. Together with stakeholders, you might find that the most accurate
model does not reflect the values of the end users. Perhaps the most accurate
model requires variables not available for all students, or predicts all students
in certain categories to be at-risk. These are issues you will have to work
together with stakeholders to address.

Logistic regression is one tool you can use, and we'll demonstrate it here.
There are many other techniques of increasing complexity. (Many of the best
predictive analytics packages are written in the R programming language.) But
for a binary outcome variable, most data scientists start with logistic
regressions, and those are very straightforward to do in R.

Here are the steps:

1. explore the data, especially college enrollment predictors and outcomes
2. examine the relationship between predictors and outcomes
3. evaluate the predictive power of different variables and select predictors for your model
4. make predictions using logistic regression
5. convert the predicted probabilities into a 0/1 indicator
6. look at the effect of different probability cutoffs on prediction accuracy (develop a "confusion matrix")

This guide will take you through these steps using a baseline model for predicting
graduation using student grade 7 data. During the workshop you will iterate on
this baseline model using the process above and find an improved model that
meets stakeholder needs.

The commands in this script won't tell you everything you need to do to develop
your model, but they will give you command syntax that you should be able to
adjust and adapt to get the project done.

You should also consider non-model approaches like the "checklist" approach
described in the pre-reading created by the Consortium on Chicago School
Research (CCSR). With that "checklist" approach, you experiment with different
thresholds for your predictor variables, and combine them to directly predict
the outcome. This approach has the advantage of being easy to explain and
implement, but it might not yield the most accurate predictions. We won't
demonstrate that approach in depth here, but if you want to try it you can draw
on the syntax examples in this guide to develop it, as in the sketch below.
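
As a rough illustration only (not the CCSR method itself), a checklist-style
flag might combine simple thresholds on a few predictors. The cutoffs below -
10 percent absence and a below-median math score - are placeholder assumptions
you would tune with stakeholders.

```{r checklistSketch, eval=FALSE}
# Flag a student if they cross either placeholder threshold; students with
# missing values on either predictor will get an NA flag and need a rule of
# their own
math_cutoff <- median(sea_data$scale_score_7_math, na.rm = TRUE)
sea_data$checklist_flag <- as.numeric(
  sea_data$pct_days_absent_7 >= 10 |
    sea_data$scale_score_7_math < math_cutoff
)
# Compare the checklist flag to the observed graduation outcome
table(flag = sea_data$checklist_flag, grad = sea_data$ontime_grad)
```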

Before you get started, you need to think about variables, time, and datasets.
The sooner in a student's academic trajectory you can make a prediction, the
sooner you can intervene--but the less accurate your predictions, and hence your
intervention targeting, are likely to be. What data, and specifically which
variables, do you have available to make predictions? What outcome are you
trying to predict?

In the case of the workshop, we are limiting our focus to using data available
at the end of grade 7 to predict outcomes at the end of high school. A critical
step in developing a predictive model is to identify the time points that
different measures are collected during the process you are predicting -- you
can't use data from the future to make predictions. If you're planning to use
your model to make predictions for students at the end of 11th grade, for
instance, and if most students take AP classes as seniors, you can't use data
about AP coursetaking collected during senior year to predict the likelihood of
college enrollment, even if you have that data available for past groups of
students.

In terms of datasets, you can develop your model and then test its accuracy on
the dataset you used to develop the model, but that is bad practice--in the real
world, your model is only as good as its predictions on different, out of sample
datasets. It's good practice to split your data into three parts: one part for
developing your model, one for repeatedly testing different versions of your
model, and a third to use for a final out of sample test.
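
A minimal sketch of such a three-way split follows; the 60/20/20 proportions
are an assumption for illustration, not a rule. Later in this guide the split
is done by cohort year instead, which is often more realistic for an EWS.

```{r threeWaySplit, eval=FALSE}
# Randomly assign each row to a development, validation, or final test set
set.seed(2018) # make the split reproducible
n <- nrow(sea_data)
split <- sample(c("dev", "valid", "test"), size = n,
                replace = TRUE, prob = c(0.6, 0.2, 0.2))
dev_data        <- sea_data[split == "dev", ]
valid_data      <- sea_data[split == "valid", ]
final_test_data <- sea_data[split == "test", ]
```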

We're using multiple cohorts of middle-school students for the predictive analytics
task--students who were seventh graders between 2007 and 2011. These are the
cohorts for which we have access to reliable information on their high school
graduation status (late graduate, on-time graduate, dropout, transfer out,
disappeared). For this workshop you are being given the entire set of data so
that you can explore different ways of organizing the data across cohorts. For
example, you may choose to explore your models and data using the earlier
cohorts, and evaluate their performance on more recent cohorts.

One last point -- even though the data is synthetic, we have simulated missing data
for you. In the real world, you'll need to make predictions for every
student, even if you're missing data for that student which your model needs in
order to run. Just making predictions using a logistic regression won't be
enough. You'll need to use decision rules based on good data exploration and
your best judgment to predict and fill in outcomes for students where you have
insufficient data.

## Getting Started

If you're using the `Rmd` file version of these materials, start by saving a new
version of the file, so you can edit it without worrying about overwriting the
original. Then work through the file in RStudio by highlighting one or a few
command lines at a time, clicking the "execute" icon (or pressing
control-enter), and then looking at the results in the R console. Edit or add
commands as you wish. If you're using a paper or PDF version of these
materials, just read on--the R output appears below each section of commands.

This guide is built using the data you will be working with at the workshop.
This dataset includes simulated data for multiple cohorts of 7th graders along
with their corresponding high school outcomes. Each observation (row) is a student,
and includes associated information about the student's demographics, academic
performance in 7th grade, associated school and district (in grade 7), last
high school attended, and high school completion.

To work through this script, you need to put the
`montucky.rda` data file on your computer in a working folder of
your choice, and then edit the commands below to
tell R where to look for the data. If you have trouble doing this, ask for
help from other members of your group.

## Setup

To prepare for this project you will need to ensure that your R installation
has the necessary add-on packages and that you can read in the training data.

```{r installPackages, eval=FALSE}
# Install add-on packages needed
install.packages("dplyr") # this will update your installed version to align with
install.packages("pROC") # those in the tutorial
install.packages("devtools")
install.packages("caret") # for machine learning
install.packages("future") # for multicore processing on Windows/Mac/Linux
```

```{r loadWorkspace}
# Load the packages you need
library(dplyr)
library(pROC)
library(devtools)
library(future)

# Load the helper functions not in packages
devtools::source_gist("ed47cd156462a9900df1f77a000f4a52",
                      filename = "helper_funcs.R")

# Read in the data
# This command assumes that the data is in a folder called data, below your
# current working directory. You can check your working directory with the
# getwd() command, and you can set your working directory using the RStudio
# environment, or the setwd() command.
load("data/montucky.rda")
```

### Validate the data

Ensure that the data imported correctly.

First, ensure that the data is unique by student ID.

```{r}
nrow(sea_data) == n_distinct(sea_data$sid)
```

Wait, what is the issue here?

```{r}
table(sea_data$sid == sea_data$sid[[1]]) # test how many times the first sid appears
```

Why might our IDs be repeated 45 times? Let's look at how many LEAs we have in
our SEA dataset:

```{r}
length(unique(sea_data$sch_g7_lea_id)) # test how many LEAs are in our data
```

We see that student IDs are only unique within an LEA, not across LEAs. That's
an easy enough fix.

```{r}
nrow(sea_data) == n_distinct(sea_data$sid, sea_data$sch_g7_lea_id)
```

Let's append the LEA ID onto the student ID to make student IDs truly unique:

```{r}
sea_data$sid <- paste(sea_data$sid, sea_data$sch_g7_lea_id, sep = "-")
```

```{r}
nrow(sea_data) == n_distinct(sea_data$sid)
```

## Explore the Data

A key initial task in building an EWS is to identify the cohort membership of
students. When we build a predictive model we need to identify two timepoints -
when we will be making the prediction, and when the outcome we are predicting
will be observed. In this case, we will be making the prediction upon receiving
the 7th grade data on students (so at the completion of 7th grade), and we will
be predicting their completion of high school.

Let's focus first on identifying the 7th grade year for each student. We have
three year variables; what is their relationship?

```{r}
table(sea_data$year)
table(sea_data$cohort_year)
table(sea_data$cohort_grad_year)
```

From the data dictionary we know that the first year variable is the year
that corresponds with the student entering 7th grade (`year`). The `cohort_year`
variable defines the 9th grade cohort a student belongs to for measuring ontime
graduation. Finally, the `cohort_grad_year` is the year by which the cohort a
student is a member of should graduate to be considered "on-time".

If a student graduates, their year of graduation is recorded as `year_of_graduation`.

The definition of graduation types is an important design decision for our
predictive analytic system. We have to set a time by which students must have
graduated so we know whether to count them as completing or not completing. The
state of Montucky uses a 4-year cohort graduation rate for most reporting,
defining ontime graduation as graduation within 4 years of entering high school.
This variable is defined as:

```{r}
table(sea_data$year_of_graduation == sea_data$cohort_grad_year)
table(sea_data$ontime_grad)
```

This is an example of a business rule - a restriction on the definition of the
data that we make so that we can use the data consistently. In this case, it
is necessary so we can definitively group students for the purposes of predicting
their outcomes. You could consider alternative graduation timelines, for example:

```{r}
table(sea_data$year_of_graduation <= sea_data$cohort_grad_year + 1)
```

What does this rule say? How is it different than the definition of on-time above?

Now that we have a sense of the time structure of the data, let's look at
geography. How many high schools and how many districts are there? What are
those regional education service coops?

We are going to be building a model for an entire state, but stakeholders
may have questions about how the model works for particular schools, districts,
or regions. Let's practice exploring the data by these different geographies.

```{r}
length(unique(sea_data$first_hs_name))
length(unique(sea_data$first_hs_lea_id))
table(sea_data$coop_name_g7, useNA = "always")
```

For this exercise, districts in Montucky are organized into cooperative regions.
Cooperative regions are just groups of LEAs. It may be helpful to compare how
our model performs in a given school, LEA, or coop region to the rest of the
data in the state. As an example of this kind of analysis, select a specific
coop region and explore its data, drawing comparisons to the full dataset. Which
districts are part of this coop region and how many students do they have?
Substitute different abbreviation codes for different coops and then replace the
`my_coop` variable below.

```{r}
my_coop <- sea_data$coop_name_g7[50] # select the coop for the 50th observation
# Which districts are in this coop and how many 7th graders do we have for each?
table(sea_data$sch_g7_lea_id[sea_data$coop_name_g7 == my_coop],
      useNA = "always")
# which schools?
table(sea_data$sch_g7_name[sea_data$coop_name_g7 == my_coop],
      useNA = "always")
```

What student subgroups are we interested in? Let's start by looking at the
subgroup indicators, beginning with whether a student is male.

```{r}
table(sea_data$male, useNA = "always")
```

Here's a short for loop to look at one-way tabs of a lot of variables at once.

```{r, eval=FALSE}
for(i in c("male", "race_ethnicity", "frpl_7", "iep_7", "ell_7",
           "gifted_7")){
  print(i)
  print(table(sea_data[, i], useNA = "always"))
}
```

Let's examine the distribution of student subgroups by geography. For this
command, we'll use the same looping syntax from above, which lets you avoid
repetition by applying commands to multiple variables at once. You can type
`?"for"` into the R console if you want to learn more about how to use loops in R.

```{r, eval=FALSE}
for(var in c("male", "race_ethnicity", "frpl_7", "iep_7", "ell_7",
             "gifted_7")){
  print(var)
  print( # have to call print inside a loop
    round( # round the result
      prop.table( # convert table to percentages
        table(sea_data$coop_name_g7, sea_data[, var], # build the table
              useNA = "always"),
        margin = 1), # calculate percentages by row; change to 2 for column
      digits = 3) # round off at 3 digits
    * 100) # put on percentage instead of proportion scale
}
```

Now, let's look at high school outcomes. We won't examine them all, but you should.
Here's the ontime high school graduation outcome variable we looked at above:

```{r}
table(sea_data$ontime_grad, useNA = "always")
```

Wait! What if the data includes students who transferred out of state? That
might bias the graduation rate and make it too low, because those 7th graders
might show up as having dropped out.

```{r}
table(sea_data$transferout, useNA = "always")
table(transfer = sea_data$transferout, grad = sea_data$ontime_grad, useNA = "always")
```

This is another case where we may want to consider a business rule. How should
students who transfer out be treated? We don't know whether they graduated
or not. Should they be excluded from the analysis? Coded as not completing?
The decision is yours, but it is important to consider all the possible high
school outcomes when building a model and how the model will treat them. Two
illustrative ways to encode such a rule are sketched below.
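
Both options are easy to express in code. This sketch assumes `transferout` is
a 0/1 indicator, as the tables above suggest; the choice between the rules is a
policy decision, not a technical one.

```{r transferRules, eval=FALSE}
# Option 1: exclude students who transferred out from the analysis entirely
analysis_data <- sea_data[sea_data$transferout == 0, ]
# Option 2: keep them, but count them as not completing on time
sea_data$ontime_grad_alt <- ifelse(sea_data$transferout == 1, 0,
                                   sea_data$ontime_grad)
table(sea_data$ontime_grad_alt, useNA = "always")
```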

Let's look at the distribution of another outcome variable, `any_grad`, which
includes late graduation and ontime graduation, by both geography and subgroup.

```{r crosstab_grad, eval=FALSE}
round(
  prop.table(
    table(sea_data$coop_name_g7, sea_data$any_grad, useNA = "always"),
    margin = 1
  ),
  digits = 2)

for(var in c("male", "race_ethnicity", "frpl_7", "iep_7", "ell_7",
             "gifted_7")){
  print(var)
  print(
    prop.table(
      table(grad = sea_data$any_grad, var = sea_data[, var],
            useNA = "always"),
      margin = 1)
  )
}
```

### Review Existing Indicator

Now that we are oriented to the data, let's turn our attention to the model
predictions provided by the vendor. First, let's check the format:

```{r coll_ready_tab}
summary(sea_data$vendor_ews_score)
```

Instead of a classification, each student receives a predicted probability
of graduating. We need a way to convert this into a classification measure.
One way to do this is to pick a threshold and declare all values greater than
that threshold as being of the positive (graduation) class, and all values
below as being members of the negative class.

```{r}
set_thresh <- 0.5
conf_count <- table(observed = sea_data$ontime_grad,
                    pred = sea_data$vendor_ews_score > set_thresh)
conf_count
prop.table(conf_count)
```

This matrix tells us, for the threshold we have selected, how many graduates
we identified successfully, how many non-graduates we identified successfully,
and how many mistakes we made in each direction (this is known as a confusion
matrix and will be discussed at length at the workshop!).

How does this compare with the vendor's marketing material and stated accuracy?
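
One way to answer that question is to turn the counts above into summary rates.
This sketch computes overall accuracy, sensitivity, and specificity directly
from `conf_count`, assuming `ontime_grad` is coded 0/1 as above:

```{r vendorAccuracy}
# Overall accuracy: the share of students classified correctly
sum(diag(conf_count)) / sum(conf_count)
# Sensitivity: the share of actual graduates predicted to graduate
conf_count["1", "TRUE"] / sum(conf_count["1", ])
# Specificity: the share of actual non-graduates predicted not to graduate
conf_count["0", "FALSE"] / sum(conf_count["0", ])
```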

Now, let's turn to identifying the predictors available for building an alternative
model. Let's examine the performance and behavioral variables that you can
use as predictors. These are mostly numerical variables, so you should use the
summary, histogram, and table commands to explore them. Here's some syntax for
examining 7th grade math scores. You can replicate and edit it to examine other
potential predictors and their distributions by different subgroups.

```{r}
summary(sea_data$scale_score_7_math)
hist(sea_data$scale_score_7_math)
```

```{r mean_score_by_demog}
by(sea_data$scale_score_7_math, sea_data$coop_name_g7, FUN = mean,
   na.rm = TRUE)
by(sea_data$scale_score_7_math, sea_data$frpl_7, FUN = mean,
   na.rm = TRUE)
```

Finally, here's some sample code you can use to look at missingness patterns in
the data. Note we use the `is.na()` function to test whether a value is missing.

```{r missingness_checks}
for(var in c("coop_name_g7", "male", "race_ethnicity")){
  print(var)
  print(
    prop.table(table(sea_data[, var],
                     "missing_absence" = is.na(sea_data$pct_days_absent_7)), 1)
  )
}
```

Handling missing values is another case where business rules will come into play.

Did you see any outlier or impossible values while you were exploring the data?
If so, you might want to truncate them or change them to missing. Here's how you
can replace a numeric variable with a missing value if it is larger than a
certain number (in this case, 100 percent).

```{r abs_trunc}
hist(sea_data$pct_days_absent_7)
sea_data$pct_days_absent_7[sea_data$pct_days_absent_7 > 100] <- NA
hist(sea_data$pct_days_absent_7)
```

Trimming the data in this way is another example of a business rule. You
may wish to trim the absences even further in this data. You may also wish to
assign a value other than missing to unusual values, such as the mean or the
median; a sketch of that alternative follows.
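
This sketch substitutes the median of the valid values for impossible ones
rather than dropping them; the 100 percent cutoff mirrors the rule above, and
it should be run instead of (not after) that rule.

```{r absMedianSub, eval=FALSE}
# Replace impossible absence values with the median of the valid values
valid_median <- median(
  sea_data$pct_days_absent_7[sea_data$pct_days_absent_7 <= 100],
  na.rm = TRUE)
sea_data$pct_days_absent_7[sea_data$pct_days_absent_7 > 100] <- valid_median
```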

Now that you've explored the data, you can start to examine the relationship
between predictor and outcome variables. Here we'll continue to look at the high
school graduation outcome, and we'll restrict the predictors to just two: 7th
grade math scores and percent of enrolled days absent through 7th grade. For
your model, you can of course use more and different predictor
variables. First, check the correlation between outcome and predictors.

```{r}
cor(sea_data[, c("ontime_grad", "scale_score_7_math", "pct_days_absent_7")],
    use = "pairwise.complete.obs")
```

These correlations do not look very promising! But remember, a correlation
may not tell the whole story if other factors are involved, and correlations
between binary variables and continuous variables are noisy indicators.

It would be nice to have a better idea of the overall relationship between
outcomes and predictors. But you can't make a meaningful scatterplot when the
dependent, or y, value is a binary outcome variable (try it!). Let's look at a
technique to identify the relationship between a continuous variable and a
binary outcome.

The idea behind this code is to show the mean of the outcome variable for each
value of the predictor, or for categories of the predictor variable if it has
too many values. First, define categories (in this case, round to the nearest
percentage) of the percent absent variable, and then truncate the variable so that
low-frequency values are grouped together.

```{r}
sea_data$pct_absent_cat <- round(sea_data$pct_days_absent_7, digits = 0)
table(sea_data$pct_absent_cat)
sea_data$pct_absent_cat[sea_data$pct_absent_cat >= 30] <- 30
```

Next, define a variable which is the average ontime graduation rate for each
absence category, and then make a scatter plot of average graduation rates by
absence percent.

```{r}
sea_data <- sea_data %>%
  group_by(pct_absent_cat) %>% # perform the operation for each value
  mutate(abs_ontime_grad = mean(ontime_grad, na.rm = TRUE)) # add a new variable

plot(sea_data$pct_absent_cat, sea_data$abs_ontime_grad)
```

You can do the same thing for 7th grade test scores. First look at the math
test score and notice that some scores appear to be outliers.

```{r}
hist(sea_data$scale_score_7_math)
sea_data$scale_score_7_math[sea_data$scale_score_7_math < 0] <- NA
hist(sea_data$scale_score_7_math)
```

You can do the same plot as above now by modifying the `group_by()`
command.

```{r}
sea_data <- sea_data %>%
  mutate(math_7_cut = ntile(scale_score_7_math, n = 100)) %>%
  group_by(math_7_cut) %>% # perform the operation for each value
  mutate(math_7_ontime_grad = mean(ontime_grad, na.rm = TRUE)) # add a new variable

plot(sea_data$math_7_cut, sea_data$math_7_ontime_grad)
```

This is a neat trick you can use to communicate your model predictions as well.
This is a neat trick you can use to communicate your model predictions as well. | |
## Model | |
Now we're ready to call on the `glm` command to compute a logistic regression | |
for the relationship between our binary outcome variable and our predictor | |
variables. When you run a logistic regression, R calculates the parameters of an | |
equation that fits the relationship between the predictor variables and the | |
outcome. A regression model typically won't be able to explain all of the | |
variation in an outcome variable--any variation that is left over is treated as | |
unexplained noise in the data, or error, even if there are additional variables | |
not in the model which could explain more of the variation. | |
Once you've run a logit regression, you can have R generate a variable with new, | |
predicted outcomes for each observation in your data with the `predict` command. | |
The predictions are calculated using the model equation and ignore the | |
unexplained noise in the data. For logit regressions, the predicted outcomes | |
take the form of a probability ranging 0 and 1. To start with, let's do a | |
regession of ontime graduation on eighth grade math scores. | |
```{r} | |
math_model <- glm(ontime_grad ~ scale_score_7_math, data = sea_data, | |
family = "binomial") # family tells R we want to fit a logistic | |
``` | |

The default summary output for logistic regression in R is not very helpful.

```{r}
summary(math_model)
```

Even before you use the predict command, you can use the logit output to learn
something about the relationship between the predictor and the outcome variable.
The Pseudo $R^{2}$ (read R-squared) is a proxy for the share of variation in the
outcome variable that is explained by the predictor. Statisticians don't like it
when you take the pseudo $R^{2}$ too seriously, but it can be useful in predictive
exercises to quickly get a sense of the explanatory power of variables in a
logit model.

```{r}
logit_rsquared(math_model)
```
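
If you don't have the helper gist loaded, one common variant - McFadden's
pseudo $R^{2}$, which is presumably close to what `logit_rsquared()` computes -
can be calculated directly from the deviances stored on the fitted model:

```{r mcfaddenSketch, eval=FALSE}
# McFadden's pseudo R-squared: 1 minus the ratio of the fitted model's
# deviance to the null (intercept-only) model's deviance
1 - (math_model$deviance / math_model$null.deviance)
```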

Does adding polynomial terms increase the pseudo $R^{2}$? You can use the formula
interface in R to add functional transformations of predictors without generating
new variables and find out.

```{r}
math_model2 <- glm(ontime_grad ~ scale_score_7_math +
                     I(scale_score_7_math^2) + I(scale_score_7_math^3),
                   data = sea_data,
                   family = "binomial") # family tells R we want to fit a logistic
logit_rsquared(math_model2)
```

The model did not improve very much. Any time you add predictors to a model,
the $R^{2}$ will increase, even if the variables are fairly meaningless, so it's
best to focus on including predictors that add meaningful explanatory power.

Now take a look at the $R^{2}$ for the absence variable.

```{r}
absence_model <- glm(ontime_grad ~ pct_days_absent_7, data = sea_data,
                     family = "binomial")
summary(absence_model)
logit_rsquared(absence_model)
```

Let's combine our two predictors and test their combined power.

```{r}
combined_model <- glm(ontime_grad ~ pct_days_absent_7 + scale_score_7_math,
                      data = sea_data, family = "binomial")
summary(combined_model)
logit_rsquared(combined_model)
```

Using this combined model, let's use the predict command to make our first
predictions.

```{r}
sea_data$grad_pred <- predict(combined_model, newdata = sea_data,
                              type = "response") # tells R to give us a probability
```

This generates a new variable with the probability of ontime high school
graduation, according to the model. But if you look at the number of
observations with predictions, you'll see that it is smaller than the total
number of students. This is because R doesn't use observations that have
missing data for any of the variables in the model.

```{r}
table(is.na(sea_data$grad_pred))
```

Let's convert this probability to a 0/1 indicator for whether or not a student
is likely to graduate ontime. A good rule of thumb when starting out is to set
the probability cutoff at the mean of the outcome variable. In this example,
we can store this value as a variable:

```{r}
basic_thresh <- mean(sea_data$ontime_grad)
basic_thresh
```

If the probability in the model is greater than this threshold, we'll say the
student is likely to graduate.

```{r}
sea_data$grad_indicator <- ifelse(sea_data$grad_pred > basic_thresh, 1, 0)
table(sea_data$grad_indicator, useNA = "always")
```

You can also plot the relationship between the probability and the outcome.
Ideally, you should see the proportion of graduates steadily increase for each
percentile of the probabilities. What does this relationship tell you?

```{r}
sea_data <- sea_data %>%
  mutate(grad_pred_cut = ntile(grad_pred, n = 100)) %>%
  group_by(grad_pred_cut) %>% # perform the operation for each value
  mutate(grad_pred_cut_grad = mean(ontime_grad, na.rm = TRUE)) # add a new variable

plot(sea_data$grad_pred_cut, sea_data$grad_pred_cut_grad)
```

Let's evaluate the accuracy of the model by comparing the predictions to the
actual graduation outcomes for the students for whom we have predictions. This
type of crosstab is called a "confusion matrix." The observations in the upper
left corner, where the indicator and the actual outcome are both 0, are true
negatives. The observations in the lower right corner, where the indicator and
the outcome are both 1, are true positives. The upper right corner contains
false positives, and the lower left corner contains false negatives. Overall, if
you add up the cell percentages for true positives and true negatives, the model
got 56.1 percent of the predictions right.

```{r}
prop.table(
  table(
    grad = sea_data$ontime_grad, pred = sea_data$grad_indicator)) %>% # shorthand way to round
  round(3)
```

However, almost all of the wrong predictions are false negatives--these are
students who have been flagged as dropout risks even though they
did graduate ontime. If you want your indicator system to have fewer false
negatives, you can lower the probability cutoff. The lower cutoff below yields
a higher share of false positives and a lower share of false negatives, with a
somewhat lower share of correct predictions.

```{r}
new_thresh <- basic_thresh - 0.05
prop.table(table(Observed = sea_data$ontime_grad,
                 Predicted = sea_data$grad_pred > new_thresh)) %>%
  round(3)
```

Note that this table only includes the complete cases. To look at missing values
as well:

```{r}
prop.table(table(Observed = sea_data$ontime_grad,
                 Predicted = sea_data$grad_pred > new_thresh,
                 useNA = "always")) %>% round(3)
```

## Missing Data

Another key business rule is how we will handle students with missing data. A
predictive analytics system is more useful if it makes an actionable prediction
for every student. It is good to check, where the information is available, the
graduation rates for students with missing data:

```{r}
table(Grad = sea_data$ontime_grad,
      miss_math = is.na(sea_data$scale_score_7_math))
table(Grad = sea_data$ontime_grad,
      miss_abs = is.na(sea_data$pct_days_absent_7))
```

There are a number of options at this point. One is to run a model with fewer
variables for only those students, and then use that model to fill in the
missing indicators.

```{r}
absence_model <- glm(ontime_grad ~ pct_days_absent_7,
                     data = sea_data[is.na(sea_data$scale_score_7_math), ],
                     family = "binomial")
```

```{r}
sea_data$grad_pred_2 <- predict(absence_model, newdata = sea_data,
                                type = "response")
summary(absence_model)
```

```{r}
table(sea_data$grad_indicator, useNA = "always")
sea_data$grad_indicator[is.na(sea_data$grad_pred) &
                          sea_data$grad_pred_2 < new_thresh] <- 0
sea_data$grad_indicator[is.na(sea_data$grad_pred) &
                          sea_data$grad_pred_2 >= new_thresh] <- 1
table(sea_data$grad_indicator, useNA = "always")
```

We now have predictions for all but a very small share of students, and those
students are split between graduates and non-graduates. We have to apply a rule
or a model to make predictions for them--we can't use information from the
future, except to develop the prediction system. We'll arbitrarily decide to
flag them as potential non-graduates, since students with lots of missing data
might merit some extra attention.

```{r}
table(sea_data$grad_indicator, sea_data$ontime_grad, useNA = "always")
sea_data$grad_indicator[is.na(sea_data$grad_indicator)] <- 0
```

## Evaluate Fit

Now we have a complete set of predictions from our simple models. How well does
the prediction system work? Can we do better?

```{r}
table(Observed = sea_data$ontime_grad, Predicted = sea_data$grad_indicator) %>%
  prop.table %>% round(4)
```

A confusion matrix is one way to evaluate the success of a model and evaluate
tradeoffs as you are developing prediction systems, but there are others. We
will cover these more in the workshop, but in cases where we have an uneven
proportion of cases in each class (e.g. we have many more graduates than non-graduates),
it can be helpful to look at a metric like the AUC, which stands for "area under the
curve." You'll learn more about ways to evaluate a prediction system, including
the AUC metric, during Day 1 of the workshop, but here's a sneak peek.

First, use the `roc` command to plot the true positive rate (sensitivity in
the graph) against the false positive rate (1-specificity in the graph).

```{r calculateROC}
roc(sea_data$ontime_grad, sea_data$grad_indicator) %>% plot
```

You can also calculate the ROC on the continuous predictor as well, to help you
determine the threshold:

```{r calculateROC2}
roc(sea_data$ontime_grad, sea_data$grad_pred) %>% plot
```

You can also calculate the numeric summaries instead of just the graphs. To
do this let's use the `caret` package:

```{r}
# We must wrap each of these in calls to factor because of how this function
# expects the data to be formatted
caret::confusionMatrix(data = factor(sea_data$grad_indicator),
                       reference = factor(sea_data$ontime_grad), positive = "1")
```

A couple of last thoughts and notes. First, note that so far we haven't done any
out-of-sample testing. We know from the pre-reading that we should never trust
our model fit measures on data the model was fit to -- statistical models are
overly confident. To combat this, you should subdivide your dataset. There are
many strategies you can choose from depending on how much data you have and the
nature of your problem - for the EWS case, we can use the earlier cohorts to
build our models and the later cohorts to evaluate that fit.

Here is some code you can use to do that:

```{r}
# In R we can define an index of rows so we do not have to copy our data
train_idx <- row.names(sea_data[sea_data$year %in% c(2003, 2004, 2005), ])
test_idx <- !row.names(sea_data) %in% train_idx

fit_model <- glm(ontime_grad ~ scale_score_7_math + frpl_7,
                 data = sea_data[train_idx, ], family = "binomial")

sea_data$grad_pred_3 <- predict(fit_model, newdata = sea_data, type = "response")
summary(sea_data$grad_pred_3)
# Check the test index only
summary(sea_data$grad_pred_3[test_idx])
# Calculate the confusion matrix for the test index only
table(Observed = sea_data$ontime_grad[test_idx],
      Predicted = sea_data$grad_pred_3[test_idx] > new_thresh) %>%
  prop.table() %>% round(4)
```

Second, should we use subgroup membership variables (such as demographics or
school of enrollment) to make predictions, if they improve the accuracy of
predictions? This is more a policy question than a technical question, and you
should consider it when you are developing your models. You'll also want to
check how accurate your model is for different subgroups, as in the sketch below.
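
This sketch loops over one subgroup variable and prints a confusion matrix for
each group; the choice of `race_ethnicity` is just an example, and the same
pattern works for any subgroup column.

```{r subgroupAccuracy, eval=FALSE}
# Compare prediction accuracy across the levels of a subgroup variable
for (grp in unique(as.character(sea_data$race_ethnicity))) {
  idx <- sea_data$race_ethnicity == grp
  print(grp)
  print(
    prop.table(
      table(Observed = sea_data$ontime_grad[idx],
            Predicted = sea_data$grad_indicator[idx])) %>%
      round(3)
  )
}
```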

## Bonus

The above code will be more than enough to get you started. But, if you want
to reach further into predictive analytics, the next section provides some bonus
syntax and advice for building more complex statistical models.

Logistic regression is where you should start. It is fast to compute, easier to
interpret, and usually does a great job. However, there are a number of alternative
algorithms available to you, and R provides a common interface to them through the
`caret` package. The `train` function (as in, training the models) is the
workhorse of the `caret` package. It has an extensive set of user-controlled
options.

When switching away from logistic regression, it can be advantageous to transform
predictors to be centered at 0 with a standard deviation of 1. This helps
put binary and continuous indicators on a more similar scale and helps avoid
problems associated with rounding, large numbers, and the optimization algorithms
used to evaluate model fit. Here is an example of how you can do this in R:

```{r}
library(caret) # machine learning workhorse in R
sea_data <- as.data.frame(sea_data)

# To use caret we have to manually expand our categorical variables. caret
# can use formulas like lm() or glm() but becomes very slow and unreliable
# for some model/algorithm methods when doing so. Better to do it ourselves.

# Expand categorical variables
# First declare any variable we want to be treated as a category as a factor
sea_data$frpl_7 <- factor(sea_data$frpl_7)
sea_data$male <- factor(sea_data$male)

# Use the dummyVars function to build an expansion function to expand our data
expand_factors <- caret::dummyVars(~ race_ethnicity + frpl_7 + male,
                                   data = sea_data)
# Create a matrix of indicator variables by using the predict method for the
# expand_factors object we just created. There are other ways you can do this, but
# the advantage is we can use the `expand_factors` object on future/different
# datasets to preserve factor levels that may not be present in those (like our
# test set perhaps)
fact_vars <- predict(expand_factors, sea_data)
# Get the column names of our categorical dummy variables
categ_vars <- colnames(fact_vars)
# Combine the new dummy variables with our original dataset
sea_data <- cbind(sea_data, fact_vars)
# Drop the matrix of dummy variables
rm(fact_vars)

# Define the numeric/continuous predictors we want to use
continuous_x_vars <- c('scale_score_7_math', 'scale_score_7_read',
                       'pct_days_absent_7', 'sch_g7_frpl_per')

# caret does not gracefully handle missing data, so we have to do some additional
# work when we define our training/test data split. You can choose additional
# ways to address missing data (imputation, substituting mean values, etc.) -
# here we opt for the simple but aggressive strategy of excluding any row
# with missing values for any predictor from being in the training data
train_idx <- row.names( # get the row.names to define our index
  na.omit(sea_data[sea_data$year %in% c(2003, 2004, 2005),
                   # reduce the data to rows of specific years and not missing data
                   c(continuous_x_vars, categ_vars)])
  # only select the predictors we are interested in
)
test_idx <- !row.names(sea_data[sea_data$year > 2005, ]) %in% train_idx

# All of our algorithms will run much faster and more smoothly if we center
# our continuous measures at 0 with a standard deviation of 1. We have
# skipped the important step of identifying and removing outliers, which we
# should make sure to do, but is left up to you!
pre_proc <- caret::preProcess(sea_data[train_idx, continuous_x_vars],
                              method = c("scale", "center"))

# We now have a pre-processing object which will scale and center our variables
# for us. It will ignore any variables that are not defined within it, so we
# can pass all of our continuous and dummy variables to it to produce our
# final data frame of predictors.
preds <- predict(pre_proc,
                 sea_data[train_idx, # keep only the training rows
                          c(continuous_x_vars, categ_vars)]
) # keep only the columns of dummy and continuous variables
```

Once we have defined our predictor variables, we need to tell `train` how we want
to test our models. Most of the algorithms offered through `caret` have "tuning
parameters", user-controlled values that are not estimated from the data. Our
goal is to experiment with these values and find the values that fit the data
the best. To do this, we must tell `train` which values to try, and how to
evaluate their performance.

Luckily, `train()` has a number of sensible defaults that largely automate this
process for us. For the purpose of this exercise, a good set of defaults is to
use the `prSummary()` model evaluation function (which reports the area under
the precision-recall curve along with precision, recall, and F-measure) and to
use cross-validation.

```{r}
# Take advantage of all the processing power on your machine
library(doFuture)
plan(multiprocess(workers = 8)) # define the number of cpus to use
registerDoFuture() # register them with R

# For binary classification, caret works best when the outcome is coded as a
# factor with alphabetical, non-numeric labels. In this case, we recode 0/1
# to be nongrad, grad.
yvar <- sea_data[train_idx, "ontime_grad"] # save only training observations
yvar <- ifelse(yvar == 1, "grad", "nongrad")
yvar <- factor(yvar)

# On a standard desktop/laptop it can be necessary to decrease the sample size
# to train in a reasonable amount of time. For the prototype and getting feedback
# it's a good idea to stick with a reasonable sample size of under 30,000 rows.
# Let's do that here:
train_idx_small <- sample(1:nrow(preds), 2e4)

# Caret has a number of complex options; you can read about them under ?trainControl
# Here we set some sensible defaults
example_control <- trainControl(
  method = "cv", # we cross-validate our model to avoid overfitting
  classProbs = TRUE, # we want to be able to predict probabilities, not just binary outcomes
  returnData = TRUE, # we want to store the model data to allow for postestimation
  summaryFunction = prSummary, # we want to use the prSummary for better two-class accuracy measures
  trim = TRUE, # we want to reduce the size of the final model object to save RAM
  savePredictions = "final", # we want to store the predictions from the final model
  returnResamp = "final", # we want to store the resampling code to directly compare other methods
  allowParallel = TRUE # we want to use all the processors on our computer if possible
)

# This will take quite some time:
set.seed(2532) # set seed so models are comparable
rpart_model <- train(
  y = yvar[train_idx_small], # specify the outcome, here subset
  # to the smaller number of observations
  x = preds[train_idx_small, ], # specify the matrix of predictors, again subset
  method = "rpart", # choose the algorithm - here a regression tree
  tuneLength = 24, # choose how many values of tuning parameters to test
  trControl = example_control, # set up the conditions for the test (defined above)
  metric = "AUC" # select the metric to choose the best model based on
)

set.seed(2532)
# Repeat above but just change the `method` argument to nb for naiveBayes
nb_model <- train(y = yvar[train_idx_small],
                  x = preds[train_idx_small, ],
                  method = "nb", tuneLength = 24,
                  trControl = example_control,
                  metric = "AUC")

# Construct a list of model performance comparing the two models directly
resamps <- resamples(list(RPART = rpart_model,
                          NB = nb_model))
# Summarize it
summary(resamps)
# Plot it; see ?resamples for more options
dotplot(resamps, metric = "AUC")
```

## Test Your Results

Now we need to evaluate our performance on our hold-out set of data.

```{r}
# We use the pre-processing we defined on the training data, but this time
# we create test_data with these values:
test_data <- predict(pre_proc,
                     sea_data[test_idx, # specify the rows in the test set
                              c(continuous_x_vars, categ_vars)]) # keep the same vars

# Create a vector of outcomes for the test set
test_y <- sea_data[test_idx, "ontime_grad"]
test_y <- ifelse(test_y == 1, "grad", "nongrad")
test_y <- factor(test_y)

confusionMatrix(reference = test_y, data = predict(rpart_model, test_data))
# Note that making predictions from the nb classifier can take a long time;
# consider alternative models or making fewer predictions if this is a bottleneck
confusionMatrix(reference = test_y, data = predict(nb_model, test_data))
```

We can also make ROC plots of our test-set, out of sample, prediction accuracy.

```{r}
# ROC plots
yhat <- predict(rpart_model, test_data, type = "prob")
yhat <- yhat$grad
roc(response = test_y, predictor = yhat) %>% plot

# NB ROC plot
# Note that making predictions from the nb classifier can take a long time;
# consider alternative models or making fewer predictions if this is a bottleneck
yhat <- predict(nb_model, test_data, type = "prob")
yhat <- yhat$grad
roc(response = test_y, predictor = yhat) %>% plot
```

## References and More

The `caret` package has extensive documentation which you can use to find
new models, learn about algorithms, and explore many other important aspects
of building predictive analytics. You can learn more here:
https://topepo.github.io/caret/