Created
          September 16, 2018 17:24 
        
      - 
      
- 
        Save jknowles/26217964de2d2f145be50b701382b024 to your computer and use it in GitHub Desktop. 
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | ############################################################################### | |
| ## SDP Fall Workshop Predictive Analytics | |
| ## Advanced / Additional Code Snippets for Working with PA Data and Models | |
| ## Author: Jared E. Knowles | |
| ## Date: 09/14/2018 | |
| ## You do not need to use all or even any of this code. The code does not need to | |
| ## be run together. This is just a survey of some additional techniques/tricks you | |
| ## can do in R to make explaining predictive models and complex data easier. | |
| ## As always - your needs and approaches may different. | |
| ################################################################################ | |
| # Load the data | |
| load("data/montucky.rda") | |
| ######################################################################## | |
| # Build and plot a regression tree | |
| ######################################################################## | |
| library(rpart.plot) | |
| library(rpart) | |
| # Basic partition tree model to plot | |
| binary.model <- rpart(ontime_grad ~ scale_score_7_math + | |
| pct_days_absent_7 + ell_7 + iep_7 + frpl_7 + male + | |
| race_ethnicity + | |
| sch_g7_lep_per + sch_g7_gifted_per, | |
| data = sea_data, | |
| control = rpart.control(cp = .005)) | |
| # Pass the model object to the plot function | |
| # You can look at ?rpart.plot to identify customizations you wish to use | |
| rpart.plot(binary.model) | |
| # To improve the labels, you can rename the variables in the data | |
| sea_data$short_race_label <- NA | |
| sea_data$short_race_label[sea_data$race_ethnicity == "Hispan..."] <- "H" | |
| sea_data$short_race_label[sea_data$race_ethnicity == "White"] <- "W" | |
| sea_data$short_race_label[sea_data$race_ethnicity == "Asian"] <- "A" | |
| sea_data$short_race_label[sea_data$race_ethnicity == "Americ..."] <- "AI" | |
| sea_data$short_race_label[sea_data$race_ethnicity == "Demogg..."] <- "M" | |
| sea_data$short_race_label[sea_data$race_ethnicity == "Native..."] <- "HA" | |
| sea_data$short_race_label[sea_data$race_ethnicity == "Black ..."] <- "B" | |
| # Fit a classification model, just like a logistic regression formula interface | |
| binary.model <- rpart(ontime_grad ~ scale_score_7_math + | |
| pct_days_absent_7 + ell_7 + iep_7 + frpl_7 + male + | |
| short_race_label + | |
| sch_g7_lep_per + sch_g7_gifted_per, | |
| data = sea_data, | |
| control = rpart.control(cp = .005)) | |
| # Make a plot showing the tree | |
| rpart.plot(binary.model) | |
| ######################################################################## | |
| ## Variable importance from a caret/train model object | |
| ######################################################################## | |
| library(caret) | |
| ## Fit a simple model just to show the syntax, you should be fitting a model | |
| ## split on train and test data, substitute your own model here. | |
| # Unfortunately, we have to remove NAs first, because | |
| # caret won't do it for us | |
| train_data <- na.omit(sea_data[, c("ontime_grad", "scale_score_7_math", | |
| "pct_days_absent_7", "ell_7", "iep_7", | |
| "male", "frpl_7", "race_ethnicity", | |
| "sch_g7_lep_per", "sch_g7_gifted_per")]) | |
| # Outcome must be a factor in R to work with caret, so we recode | |
| # Avoid 0/1 factor labels - this causes problems in fitting many model types. | |
| train_data$ontime_grad <- ifelse(train_data$ontime_grad == 1, "Grad", "Nongrad") | |
| train_data$ontime_grad <- factor(train_data$ontime_grad) | |
| # Fit the model | |
| caret_mod <- train(ontime_grad ~ ., | |
| method = "rpart", | |
| data = train_data, | |
| metric = "AUC", | |
| trControl = trainControl(summaryFunction = prSummary, | |
| classProbs = TRUE, | |
| method = "cv")) | |
| # Get variable importance, not available for all methods | |
| caret::varImp(caret_mod) | |
| # Plot variable importance | |
| plot(caret::varImp(caret_mod)) | |
| ######################################################################## | |
| ## Probablity accuracy | |
| ## Plot the relationship between predicted probabilities and observed | |
| ## graduation rates to explore thresholds. | |
| ######################################################################## | |
| library(yardstick) | |
| library(dplyr) | |
| # Get the predicted classifications from the caret model above | |
| class_vec <- predict(caret_mod, newdata = train_data) | |
| # Get the predicted probabilities from the model above | |
| ## Note that caret insists on giving probabilities for both classes, we | |
| ## need to store only one, in this case, the first one | |
| prob_vec <- predict(caret_mod, newdata = train_data, type = "prob")[[1]] # only need first column | |
| # Combine the true values, the estimated class, and the class probability | |
| # into one dataframe for plotting | |
| estimates_tbl <- data.frame( | |
| truth = as.factor(train_data$ontime_grad), | |
| estimate = as.factor(class_vec), | |
| class_prob = prob_vec | |
| ) | |
| # Using this struture we can use the `yardstick` package to nicely | |
| # compute a variety of performanc emetrics | |
| ## Confusion matrix | |
| estimates_tbl %>% yardstick::conf_mat(truth, estimate) | |
| # Accuracy | |
| estimates_tbl %>% yardstick::metrics(truth, estimate) | |
| # AUC | |
| estimates_tbl %>% yardstick::roc_auc(truth, class_prob) | |
| # ROC graph | |
| # Plots the ROC curve (the ROC at all possible threshold values for the | |
| # probability cutoff) | |
| library(pROC) | |
| roc(estimates_tbl$truth, estimates_tbl$class_prob) %>% plot | |
| # Save the confusion matrix as an R object | |
| conf_mat <- estimates_tbl %>% yardstick::conf_mat(truth, estimate) | |
| # Plot the confusion matrix if you like | |
| library(vcd) | |
| labs <- round(prop.table(conf_mat$table), 2) | |
| # Can change the margin to change the labels | |
| mosaic(conf_mat$table, pop=FALSE) | |
| labeling_cells(text = labs, margin = 0)(conf_mat$table) | |
| ################################################################################ | |
| ## Probability vs. outcome plot for predicted probabilities | |
| ################################################################################ | |
| plotdf <- estimates_tbl %>% | |
| mutate(prob_cut = percent_rank(class_prob)) %>% | |
| # depending on how many unique probabilities your model produces you can try | |
| # mutate(prob_cut = ntile(class_prob, 50)) %>% | |
| group_by(prob_cut) %>% | |
| summarize( | |
| avg_prob = mean(class_prob), | |
| prob_grad = sum(truth == "Grad") / length(truth), | |
| count = n()) | |
| library(ggplot2) | |
| library(ggalt) # for fun lollipop charts | |
| ggplot(plotdf, aes(x = avg_prob, y = prob_grad)) + | |
| ggalt::geom_lollipop() + theme_classic() + | |
| coord_cartesian(xlim = c(0, 1), ylim = c(0, 1), expand=FALSE) + | |
| geom_smooth(se=FALSE) | |
| ################################################################################ | |
| ## Advanced: Make a Bowers plot | |
| ################################################################################# | |
| # Get the data for comparable points | |
| url <- "https://raw.githubusercontent.com/jknowles/DEWSatDoGoodData2016/master/data/BowersEWSReviewData.csv" | |
| ews <- read.csv(url) | |
| # Clean up the coding of the data with better labels | |
| # This is optional, but it highlights some reference | |
| # models | |
| ews$flag <- "Other EWI" | |
| ews$flag[ews$id==1 | ews$id==2] <- "Chicago On-Track" | |
| ews$flag[ews$id > 3 & ews$id <14] <- "Balfanz ABC" | |
| ews$flag[ews$id ==85] <- "Muth?n Math GMM" | |
| ews$flag[ews$id==19] <- "Bowers GPA GMM" | |
| ews$flag <- factor(ews$flag) | |
| ews$flag <- relevel(ews$flag, ref="Other EWI") | |
| mycol <- c("Other EWI" = "gray70", "Chicago On-Track" = "blue", | |
| "Balfanz ABC" = "purple", | |
| "Muth?n Math GMM" = "orange", | |
| "Bowers GPA GMM" = "dark red") | |
| # The big block of plotting code, you only need to modify the last | |
| # couple of lines, the rest adds annotations and labels which you | |
| # can switch on and off if you like | |
| ggplot(ews) + aes(x = 1-specificity, y = sensitivity, | |
| shape = flag, size = I(4), color = flag) + | |
| geom_point() + | |
| # Label the shape legend | |
| scale_shape("EWI Type") + | |
| # Label the color legend and customize it | |
| scale_color_manual("EWI Type", values = mycol) + | |
| # Add a reference 45 degree line representing random chance | |
| geom_abline(intercept = 0, slope = 1, linetype = 2) + | |
| # Set the scales of the chart to not distort distances | |
| coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) + | |
| # Add a simple theme to avoid clutter | |
| theme_bw() + | |
| # Label the axes | |
| labs(x="False Alarm Proportion", y="True Positive Proportion", | |
| title = "ROC Accuracy of Early Warning Indicators") + | |
| # Place the legend in the bottom right | |
| theme(legend.position = c(0.8, 0.2), | |
| legend.background = element_rect(fill = NULL, | |
| color = "black")) + | |
| # Add arrows to annotate the plot | |
| annotate(geom="segment", x = 0.55, y = 0.625, | |
| yend = 0.785, xend = 0.4, | |
| arrow = arrow(length = unit(0.5, "cm"))) + | |
| # Label the arrow | |
| annotate(geom="text", x = .365, y = .81, label="Better Prediction") + | |
| annotate(geom="segment", x = 0.65, y = 0.625, yend = 0.5, | |
| xend = 0.75, arrow = arrow(length = unit(0.5, "cm"))) + | |
| annotate(geom="text", x = .75, y = .48, label="Worse Prediction") + | |
| annotate(geom="text", x = .75, y = .78, angle = 37, label="Random Guess") + | |
| # Add your annotation here: | |
| annotate(geom="point", | |
| x = 1-estimates_tbl %>% yardstick::spec(truth, estimate), | |
| y = estimates_tbl %>% yardstick::sens(truth, estimate), | |
| size = 5, color = "red") | |
| # Use a custom probability threshold | |
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | --- | |
| title: "Predictive Analytics Tutorial" | |
| author: "OpenSDP" | |
| date: "September 9, 2017" | |
| output: | |
| word_document: | |
| reference_docx: reference.docx | |
| toc: true | |
| --- | |
| ```{r setup, include=FALSE} | |
| knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE, error = FALSE, | |
| results = "hide", fig.show="hide") | |
| ``` | |
| ## Introduction | |
| This file provides guidance and R syntax examples for the hands-on | |
| predictive analytics session during the Fall 2018 Cohort 9 Strategic Data | |
| Project Workshop in Denver. | |
| During the workshop, we'll ask you work with stakeholders to develop a dropout | |
| early warning system for 7th grade students for a simulated state education | |
| agency (the state of Montucky) using student data. This guide will provide you | |
| with a baseline model to get you started and introduce you to the main concepts | |
| of predictive anlaytics. From there, you will work together with stakeholders to | |
| incorporate their values into the model, secure their support and adoption of | |
| the model, and practice an inclusive and iterative design process to building | |
| analytics. | |
| As a data analyst, your role in this process is to measure the accuracy of the | |
| model and communicate the tradeoffs of different models in terms of accuracy | |
| and usability. Together with stakeholders, you might find that the most accurate | |
| model does not reflect the values of the end the users. Perhaps the most accurate | |
| model requires variables not available for all students, or predicts all students | |
| in certain categories to be at-risk. These are issues you will have to work | |
| together with stakeholders to address. | |
| Logistic regression is one tool you can use, and we'll demonstrate it here. | |
| There are many other techniques of increasing complexity. (Many of the best | |
| predictive analytics packages are written in the R programming language.) But | |
| for a binary outcome variable, most data scientists start with logistic | |
| regressions, and those are very straightforward to do in R. | |
| Here are the steps: | |
| 1. explore the data, especially college enrollment predictors and outcomes | |
| 2. examine the relationship between predictors and outcomes | |
| 3. evaluate the predictive power of different variables and select predictors for your model | |
| 4. make predictions using logistic regression | |
| 5. convert the predicted probabilities into a 0/1 indicator | |
| 6. look at the effect of different probability cutoffs on prediction accuracy (develop a "confusion matrix") | |
| This guide will take you through these steps using a baseline model for predicting | |
| graduation using student grade 7 data. During the workshop you will iterate on | |
| this baseline model using the process above and find an improved model that | |
| meets stakeholder needs. | |
| The commands in this script won't tell you everything you need to do to develop | |
| your model, but they will give you command | |
| syntax that you should be able to adjust and adapt to get the project done. | |
| You should also consider non-model approaches like the "checklist" approaches | |
| described in the pre-reading created at the Chicago Consortium of School | |
| Research (CCSR). With that "checklist" approach, you experiment with different | |
| thresholds for your predictor variables, and combine them to directly predict | |
| the outcome. This approach has the advantage of being easy to explain and | |
| implement, but it might not yield the most accurate predictions. We won't | |
| demonstrate that approach here, but if you want to try it you can draw on the | |
| syntax examples here to develop it. | |
| Before you get started, you need to think about variables, time, and datasets. | |
| The sooner in a student's academic trajectory you can make a prediction, the | |
| sooner you can intervene--but the less accurate your predictions, and hence your | |
| intervention targeting, is likely to be. What data, and specifically which | |
| variables, do you have available to make predictions? What outcome are you | |
| trying to predict? | |
| In the case of the workshop, we are limiting our focus to using data available | |
| at the end of grade 7 to predict outcomes at the end of high school. A critical | |
| step in developing a predictive model is to identify the time points that | |
| different measures are collected during the process you are predicting -- you | |
| can't use data fromthe future to make predictions. If you're planning to use | |
| your model to make predictions for students at the end of 11th grade, for | |
| instance, and if most students take AP classes as seniors, you can't use data | |
| about AP coursetaking collected during senior year to predict the likelihood of | |
| college enrollment, even if you have that data available for past groups of | |
| students. | |
| In terms of datasets, you can develop your model and then test its accuracy on | |
| the dataset you used to develop the model, but that is bad practice--in the real | |
| world, your model is only as good as its predictions on different, out of sample | |
| datasets. It's good practice to split your data into three parts: one part for | |
| developing your model, one for repeatedly testing different versions of your | |
| model, and a third to use for a final out of sample test. | |
| We're using multiple cohorts of middle-school students for the predictive analytics | |
| task--students who were seventh graders between 2007 and in 2011. These are the | |
| cohorts for which we have access to reliable information on their high school | |
| graduation status (late graduate, on time graduate, dropout, transferout, | |
| disappeared). For this workshop you are being given the entire set of data so | |
| that you can explore different ways of organizing the data across cohorts. | |
| An example is that you may choose to explore your models and data using the | |
| earlier cohort, and evaluate their performance on more recent cohorts. | |
| One last point -- even though the data is synthetic, we have simulated missing data | |
| for you. In the real world, you'll need to make predictions for every | |
| student, even if you're missing data for that student which your model needs in | |
| order to run. Just making predictions using a logistic regression won't be | |
| enough. You'll need to use decision rules based on good data exploration and | |
| your best judgment to predict and fill in outcomes for students where you have | |
| insufficient data. | |
| ## Getting Started | |
| If you're using the `Rmd` file version of these materials, start by saving a new | |
| version of the file, so you can edit it without worrying about overwriting the | |
| original. Then work through the file inRStudio by highlighting one or a few | |
| command lines at a time, clicking the "execute" icon (or pressing | |
| control-enter), and then looking at the results in the R console. Edit or add | |
| commands as you wish. If you're using a paper or PDF version of these | |
| materials, just read on--the R output appears below each section of commands. | |
| This guide is built using the data you will be working with on the workshop. | |
| This dataset includes simulated data for multiple cohorts of 7th graders along | |
| with their corresponding high school outcomes. Each observation (row) is a student, | |
| and includes associated information about the student's demographics, academic | |
| performance in 7th grade, associated school and district (in grade 7), last | |
| high school attended, and high school completion. | |
| To work through this script, you need to put the | |
| `montucky.csv` data file on your computer in a working folder of | |
| your choice, and then edit the commands below to | |
| tell R where to look for the data. If you have trouble doing this, ask for | |
| help from other members of your group. | |
| ## Setup | |
| To prepare for this project you will need to ensure that your R installation | |
| has the necessary add-on packages and that you can read in the training data. | |
| ```{r installPackages, eval=FALSE} | |
| # Install add-on packages needed | |
| install.packages("dplyr") # this will update your installed version to align with | |
| install.packages("pROC") # those in the tutorial | |
| install.packages("devtools") | |
| install.packages("caret") # for machine learning | |
| install.packages("future") # for multicore processing on Windows/Mac/Linux | |
| ``` | |
| ```{r loadWorkspace} | |
| # Load the packages you need | |
| library(dplyr) | |
| library(pROC) | |
| library(devtools) | |
| library(future) | |
| # Load the helper functions not in packages | |
| devtools::source_gist("ed47cd156462a9900df1f77a000f4a52", | |
| filename = "helper_funcs.R") | |
| # Read in the data | |
| # This command assumes that the data is in a folder called data, below your | |
| # current working directory. You can check your working directory with the | |
| # getwd() command, and you can set your working directory using the RStudio | |
| # environment, or the setwd() command. | |
| load("data/montucky.rda") | |
| ``` | |
| ### Validate the data | |
| Ensure that the data imported correctly. | |
| First, ensure that the data is unique by student ID. | |
| ```{r} | |
| nrow(sea_data) == n_distinct(sea_data$sid) | |
| ``` | |
| Wait, what is the issue here? | |
| ```{r} | |
| table(sea_data$sid == sea_data$sid[[1]]) # test how many times the first sid appears | |
| ``` | |
| Why might our IDs be repeated 45 times? Let's look at how many LEAs we have in | |
| our SEA dataset: | |
| ```{r} | |
| length(unique(sea_data$sch_g7_lea_id)) # test how many LEAs are in our data | |
| ``` | |
| We see that our student IDs are not unique by LEA. That's an easy enough | |
| fix. | |
| ```{r} | |
| nrow(sea_data) == n_distinct(sea_data$sid, sea_data$sch_g7_lea_id) | |
| ``` | |
| Let's append the LEA ID onto the student ID to make student IDs truly unique: | |
| ```{r} | |
| sea_data$sid <- paste(sea_data$sid, sea_data$sch_g7_lea_id, sep = "-") | |
| ``` | |
| ```{r} | |
| nrow(sea_data) == n_distinct(sea_data$sid) | |
| ``` | |
| ## Explore the Data | |
| A key initial task in building an EWS is to identify the cohort membership of | |
| students. When we build a predictive model need to identify two timepoints - | |
| when we will be making the prediction, and when the outcome we are predicting | |
| will be observed. In this case, we will be making the prediction upon receiving | |
| the 7th grade data on students (so the completion of 7th grade), and we will | |
| be predicting their completion of high school. | |
| Let's focus first on identifying the 7th grade year for each student. We have | |
| three year variables, what is their relationship: | |
| ```{r} | |
| table(sea_data$year) | |
| table(sea_data$cohort_year) | |
| table(sea_data$cohort_grad_year) | |
| ``` | |
| From the data dictionary we know that the first year variable is the year | |
| that corresponds with the student entering 7th grade (`year`). The `cohort_year` | |
| variable defines the 9th grade cohort a student belongs to for measuring ontime | |
| graduation. Finally, the `cohort_grad_year` is the year that the cohort a | |
| student is a member of should graduate to be considered "on-time". | |
| If a student graduates, their year of graduation is recorded as `year_of_graduation`. | |
| The definition of graduation types is an important design decision for our | |
| predictive analytic system. We have to set a time by which students are graduated | |
| so we know whether to count them as graduating or not completing. The state of | |
| Montucky uses a 4-year cohort graduation rate for most reporting, defining | |
| ontime graduation as graduation within 4 years of entering high school. This | |
| variable is defined as: | |
| ```{r} | |
| table(sea_data$year_of_graduation == sea_data$cohort_grad_year) | |
| table(sea_data$ontime_grad) | |
| ``` | |
| This is an example of a business rule - it's a restriction on the definition | |
| of the data we make so that we can consistently use the data. In this case, it | |
| is necessary so we can definitively group students for the purposes of predicting | |
| their outcomes. You could consider alternative graduation timelines, for example: | |
| ```{r} | |
| table(sea_data$year_of_graduation <= sea_data$cohort_grad_year + 1) | |
| ``` | |
| What does this rule say? How is it different than the definition of on-time above? | |
| Now that we have a sense of the time structure of the data, let's look at | |
| geography. How many high schools and how many districts are? What are those | |
| regional education services coops? | |
| We are going to be building a model for the an entire state, but stakeholders | |
| may have questions about how the model works for particular schools, districts, | |
| or regions. Let's practice exploring the data by these differerent geographies. | |
| ```{r} | |
| length(unique(sea_data$first_hs_name)) | |
| length(unique(sea_data$first_hs_lea_id)) | |
| table(sea_data$coop_name_g7, useNA = "always") | |
| ``` | |
| For this exercise, districts in Montucky are organized into cooperative regions. | |
| Cooperative regions are just groups of LEAs. It may be helpful to compare how | |
| our model performs in a given school, LEA, or coop region to the rest of the | |
| data in the state. As an example of this kind of analysis, select a specific | |
| coop region and explore its data, drawing comparisons to the full dataset. Which | |
| districts are part of this coop region and how many students do they have? | |
| Substitute different abbreviation codes for different coops and then replace the | |
| `my_coop` variable below. | |
| ```{r} | |
| my_coop <- sea_data$coop_name_g7[50] # select the coop for the 50th observation | |
| # Which districts are in this coop and how many 7th graders do we have for each? | |
| table(sea_data$sch_g7_lea_id[sea_data$coop_name_g7 == my_coop], | |
| useNA = "always") | |
| # which schools? | |
| table(sea_data$sch_g7_name[sea_data$coop_name_g7 == my_coop], | |
| useNA = "always") | |
| ``` | |
| What student subgroups are we interested in? Let's start by looking at student subgroups. | |
| Here's whether a student is male. | |
| ```{r} | |
| table(sea_data$male, useNA="always") | |
| ``` | |
| Here's a short for loop to look at one-way tabs of a lot of variables at once. | |
| ```{r, eval=FALSE} | |
| for(i in c("male", "race_ethnicity", "frpl_7", "iep_7", "ell_7", | |
| "gifted_7")){ | |
| print(i) | |
| print(table(sea_data[, i], useNA="always")) | |
| } | |
| ``` | |
| Let's examine the distribution of student subgroups by geography. For this | |
| command, we'll use the same looping syntax from above, which lets you avoid | |
| repetition by applying commands to multiple variables at once. You can type | |
| `?for` into the R console if you want to learn more about how to use loops in R. | |
| ```{r, eval=FALSE} | |
| for(var in c("male", "race_ethnicity", "frpl_7", "iep_7", "ell_7", | |
| "gifted_7")){ | |
| print(var) | |
| print( # have to call print inside a loop | |
| round( # round the result | |
| prop.table( # convert table to percentages | |
| table(sea_data$coop_name_g7, sea_data[, var], # build the table | |
| useNA = "always"), | |
| margin = 1), # calculate percentages by column, change to 1 for row | |
| digits = 3) # round off at 3 digits | |
| *100 ) # put on percentage instead of proportion scale | |
| } | |
| ``` | |
| Now, let's look at high school outcomes. We won't examine them all, but you should. | |
| Here's the ontime high school graduation outcome variable we looked at above: | |
| ```{r} | |
| table(sea_data$ontime_grad, useNA = "always") | |
| ``` | |
| Wait! What if the data includes students who transferred out of state? That | |
| might bias the graduation rate and make it too low, because those 7th graders | |
| might show up as having dropped out. | |
| ```{r} | |
| table(sea_data$transferout, useNA = "always") | |
| table(transfer = sea_data$transferout, grad = sea_data$ontime_grad, useNA = "always") | |
| ``` | |
| This is another case where we may want to consider a business rule. How should | |
| students who transfer out be treated? We don't know whether they graduated | |
| or not. Should they be excluded from the analysis? Coded as not completing? | |
| The decision is yours, but it is important to consider all the possible high | |
| school outcomes when building a model and how the model will treat them. | |
| Let's look at the distribution of another outcome variable, `any_grad`, which | |
| includes late graduation and ontime graduation by both geography and by | |
| subgroup. | |
| ```{r crosstab_grad, eval=FALSE} | |
| round( | |
| prop.table( | |
| table(sea_data$coop_name_g7, sea_data$any_grad, useNA="always"), | |
| margin = 1 | |
| ), | |
| digits = 2) | |
| for(var in c("male", "race_ethnicity", "frpl_7", "iep_7", "ell_7", | |
| "gifted_7")){ | |
| print(var) | |
| print( | |
| prop.table( | |
| table(grad = sea_data$any_grad, var = sea_data[, var], | |
| useNA = "always"), | |
| margin = 1) | |
| ) | |
| } | |
| ``` | |
| ### Review Existing Indicator | |
| Now that we are oriented to the data, let's turn our attention to the model | |
| predictions provided by the vendor. First, let's check the format: | |
| ```{r coll_ready_tab} | |
| summary(sea_data$vendor_ews_score) | |
| ``` | |
| Instead of classifying students, each student receives a predicted probability | |
| for graduating. We need a way to convert this into a classification measure. | |
| One way to do this is to pick a threshold and declare all values greater than | |
| that threshold as being of the positive (graduation) class, and all values | |
| below as being members of the negative class. | |
| ```{r} | |
| set_thresh <- 0.5 | |
| conf_count <- table(observed = sea_data$ontime_grad, | |
| pred = sea_data$vendor_ews_score > set_thresh) | |
| conf_count | |
| prop.table(conf_count) | |
| ``` | |
| This matrix tells us, for the threshold we have selected, how many graduates | |
| we identified successfully, how many non-graduates we identified successfully, | |
| and how many mistakes we made in each direction (this is known as a confusion | |
| matrix and will be discussed at length at the workshop!). | |
| How does this compare with the vendor's marketing material and stated accuracy? | |
| Now, let's turn to identifying the predictors available for building an alternative | |
| model. Let's examine the performance and behavioral variables that you can | |
| use as predictors. These are mostly numerical variables, so you should use the | |
| summary, histogram, and table commands to explore them. Here's some syntax for | |
| examining 7th grade math scores. You can replicate and edit it to examine other | |
| potential predictors and their distributions by different subgroups. | |
| ```{r} | |
| summary(sea_data$scale_score_7_math) | |
| hist(sea_data$scale_score_7_math) | |
| ``` | |
| ```{r mean_score_by_demog} | |
| by(sea_data$scale_score_7_math, sea_data$coop_name_g7, FUN = mean, | |
| na.rm=TRUE) | |
| by(sea_data$scale_score_7_math, sea_data$frpl_7, FUN = mean, | |
| na.rm=TRUE) | |
| ``` | |
| Finally, here's some sample code you can use to look at missingness patterns in | |
| the data. Note we use the `is.na()` function to test whether a value is missing. | |
| ```{r missingness_checks} | |
| for(var in c("coop_name_g7", "male", "race_ethnicity")){ | |
| print(var) | |
| print( | |
| prop.table(table(sea_data[, var], | |
| "missing_math" = is.na(sea_data$pct_days_absent_7)), 1) | |
| ) | |
| } | |
| ``` | |
| Handling missing values is another case where business rules will come into play. | |
| Did you see any outlier or impossible values while you were exploring the data? | |
| If so, you might want to truncate them or change them to missing. Here's how you | |
| can replace a numeric variable with a missing value if it is larger than a | |
| certain number (in this case, 100 percent). | |
| ```{r abs_trunc} | |
| hist(sea_data$pct_days_absent_7) | |
| sea_data$pct_days_absent_7[sea_data$pct_days_absent_7 > 100] <- NA | |
| hist(sea_data$pct_days_absent_7) | |
| ``` | |
| Trimming the data in this way is another example of a business rule. You | |
| may wish to trim the absences even further in this data. You may also wish to | |
| assign a different value other than missing for unusual values - such as the | |
| mean or median value. | |
| Now that you've explored the data, you can start to examine the relationship | |
| between predictor and outcome variables. Here we'll continue to look at the high | |
| school graduation outcome, and we'll restrict the predictors to just two: 7th | |
| grade math scores and percent of enrolled days absent through 7th grade. For | |
| your model, you can of course use more and different predictor | |
| variables. First, check the correlation between outcome and predictors. | |
| ```{r} | |
| cor(sea_data[, c("ontime_grad", "scale_score_7_math", "pct_days_absent_7")], | |
| use = "pairwise.complete.obs") | |
| ``` | |
| These correlations do not look very promising! But remember, a correlation | |
| may not tell the whole story if other factors are involved, and correlations | |
| between binary variables and continuous variables are noisy indicators. | |
| It would be nice to have a better idea of | |
| the overall relationship between outcomes and predictors. | |
| But you can't make a meaningful scatterplot when the independent, or y value, is | |
| a binary outcome variable (try it!). Let's look at a technique to identify | |
| the relationship between a continuous variable and a binary outcome. | |
| The idea behind this code is to show the mean of the outcome variable for each | |
| value of the predictor, or for categories of the predictor variable if it has | |
| too many values. First, define categories (in this case, round to the nearest | |
| percentage) of the percent absent variable, and then truncate the variable so that | |
| low-frequency values are grouped together. | |
| ```{r} | |
| sea_data$pct_absent_cat <- round(sea_data$pct_days_absent_7, digits = 0) | |
| table(sea_data$pct_absent_cat) | |
| sea_data$pct_absent_cat[sea_data$pct_absenct_cat >= 30] <- 30 | |
| ``` | |
| Next, define a variable which is the average ontime graduation rate for each | |
| absence category, and then make a scatter plot of average graduation rates by | |
| absence percent. | |
| ```{r} | |
| sea_data <- sea_data %>% | |
| group_by(pct_absent_cat) %>% # perform the operation for each value | |
| mutate(abs_ontime_grad = mean(ontime_grad, na.rm = TRUE)) # add a new variable | |
| plot(sea_data$pct_absent_cat, sea_data$abs_ontime_grad) | |
| ``` | |
| You can do the same thing for 7th grade test scores. First look at the math | |
| test score and notice that some scores appear to be outliers. | |
| ```{r} | |
| hist(sea_data$scale_score_7_math) | |
| sea_data$scale_score_7_math[sea_data$scale_score_7_math < 0] <- NA | |
| hist(sea_data$scale_score_7_math) | |
| ``` | |
| You can do the same plot as above now by modifying the `group_by()` | |
| command. | |
| ```{r} | |
| sea_data <- sea_data %>% | |
| mutate(math_7_cut = ntile(scale_score_7_math, n = 100)) %>% | |
| group_by(math_7_cut) %>% # perform the operation for each value | |
| mutate(math_7_ontime_grad = mean(ontime_grad, na.rm=TRUE)) # add a new variable | |
| plot(sea_data$math_7_cut, sea_data$math_7_ontime_grad) | |
| ``` | |
| This is a neat trick you can use to communicate your model predictions as well. | |
| ## Model | |
| Now we're ready to call on the `glm` command to compute a logistic regression | |
| for the relationship between our binary outcome variable and our predictor | |
| variables. When you run a logistic regression, R calculates the parameters of an | |
| equation that fits the relationship between the predictor variables and the | |
| outcome. A regression model typically won't be able to explain all of the | |
| variation in an outcome variable--any variation that is left over is treated as | |
| unexplained noise in the data, or error, even if there are additional variables | |
| not in the model which could explain more of the variation. | |
| Once you've run a logit regression, you can have R generate a variable with new, | |
| predicted outcomes for each observation in your data with the `predict` command. | |
| The predictions are calculated using the model equation and ignore the | |
| unexplained noise in the data. For logit regressions, the predicted outcomes | |
| take the form of a probability ranging 0 and 1. To start with, let's do a | |
| regession of ontime graduation on eighth grade math scores. | |
| ```{r} | |
| math_model <- glm(ontime_grad ~ scale_score_7_math, data = sea_data, | |
| family = "binomial") # family tells R we want to fit a logistic | |
| ``` | |
| The default summary output for logistic regression in R is not very helpful. | |
| ```{r} | |
| summary(math_model) | |
| ``` | |
| Even before you use the predict command, you can use the logit output to learn | |
| something about the relationship between the predictor and the outcome variable. | |
| The Pseudo $R^{2}$ (read R-squared) is a proxy for the share of variation in the | |
| outcome variable that is explained by the predictor. Statisticians don't like it | |
| when you take the pseudo $R^{2}$ too seriously, but it can be useful in predictive | |
| exercises to quickly get a sense of the explanatory power of variables in a | |
| logit model. | |
| ```{r} | |
| logit_rsquared(math_model) | |
| ``` | |
| Does adding polynomial terms increase the pseudo $R^{2}$? You can use the formula | |
| interface in R to add functional transformations of predictors without generating | |
| new variables and find out. | |
| ```{r} | |
| math_model2 <- glm(ontime_grad ~ scale_score_7_math + | |
| I(scale_score_7_math^2) + I(scale_score_7_math^3), | |
| data = sea_data, | |
| family = "binomial") # family tells R we want to fit a logistic | |
| logit_rsquared(math_model2) | |
| ``` | |
| The model did not improve very much. Any time you add predictors to a model, | |
| the $R^{2}$ will increase, even if the variables are fairly meaningless, so it's | |
| best to focus on including predictors that add meaningful explanatory power. | |
| Now take a look at the $R^{2}$ for the absence variable. | |
| ```{r} | |
| absence_model <- glm(ontime_grad ~ pct_days_absent_7, data = sea_data, | |
| family = "binomial") | |
| summary(absence_model) | |
| logit_rsquared(absence_model) | |
| ``` | |
| Let's combine our two predictors and test their combined power. | |
| ```{r} | |
| combined_model <- glm(ontime_grad ~ pct_days_absent_7 + scale_score_7_math, | |
| data = sea_data, family = "binomial") | |
| summary(combined_model) | |
| logit_rsquared(combined_model) | |
| ``` | |
| Using this combined model, let's use the predict command to make our first | |
| predictions. | |
| ```{r} | |
| sea_data$grad_pred <- predict(combined_model, newdata = sea_data, | |
| type = "response") # this tells R to give us a probability | |
| ``` | |
| This generates a new variable with the probability of ontime high school | |
| graduation, according to the model. But if you look at the number of | |
| observations with predictions, you'll see that it is smaller than the total | |
| number of students. This is because R doesn't use observations that have | |
| missing data for any of the variables in the model. | |
| ```{r} | |
| table(is.na(sea_data$grad_pred)) | |
| ``` | |
| Let's convert this probability to a 0/1 indicator for whether or not a student | |
| is likely to graduate ontime. A good rule of thumb when starting out is to set | |
| the probability cutoff at the mean of the outcome variable. In this example, | |
| we can store this value as a variable: | |
| ```{r} | |
| basic_thresh <- mean(sea_data$ontime_grad) | |
| basic_thresh | |
| ``` | |
| If the probability in the model is equal to or | |
| greater than this threshold, we'll say the student is likely to graduate. | |
| ```{r} | |
| sea_data$grad_indicator <- ifelse(sea_data$grad_pred > basic_thresh, 1, 0) | |
| table(sea_data$grad_indicator, useNA = "always") | |
| ``` | |
| You can also plot the relationship between the probability and the outcome. | |
| Ideally, you should see the proportion of graduates steadily increase for each | |
| percentile of the probabilities. What does this relationship tell you? | |
| ```{r} | |
| sea_data <- sea_data %>% | |
| mutate(grad_pred_cut = ntile(grad_pred, n = 100)) %>% | |
| group_by(grad_pred_cut) %>% # perform the operation for each value | |
| mutate(grad_pred_cut_grad = mean(ontime_grad, na.rm=TRUE)) # add a new variable | |
| plot(sea_data$grad_pred_cut, sea_data$grad_pred_cut_grad) | |
| ``` | |
| Lets evaluate the accuracy of the model by comparing the predictions to the | |
| actual graduation outcomes for the students for whom we have predictions. This | |
| type of crosstab is called a "confusion matrix." The observations in the upper | |
| right corner, where the indicator and the actual outcome are both 0, are true | |
| negatives. The observations in the lower right corner, where the indicator and | |
| the outcome are both 1, are true positives. The upper right corner contains | |
| false positives, and the lower left corner contains false negatives. Overall, if | |
| you add up the cell percentages for true positives and true negatives, the model | |
| got 56.1 percent of the predictions right. | |
| ```{r} | |
| prop.table( | |
| table( | |
| grad = sea_data$ontime_grad, pred = sea_data$grad_indicator)) %>% # shorthand way to round | |
| round(3) | |
| ``` | |
| However, almost all of the wrong predictions are false negatives--these are | |
| students who have been flagged as dropout risks even though they | |
| did graduate ontime. If you want your indicator system to be have fewer false | |
| negatives, you can change the probability cutoff. This cutoff has a lower share | |
| of false positives and a higher share of false negatives, with a somewhat lower | |
| share of correct predictions. | |
| ```{r} | |
| new_thresh <- basic_thresh - 0.05 | |
| prop.table(table(Observed = sea_data$ontime_grad, | |
| Predicted = sea_data$grad_pred > new_thresh)) %>% | |
| round(3) | |
| ``` | |
| Note that this table only includes the complete cases. To look at missing values | |
| as well: | |
| ```{r} | |
| prop.table(table(Observed = sea_data$ontime_grad, | |
| Predicted = sea_data$grad_pred > new_thresh, | |
| useNA="always")) %>% round(3) | |
| ``` | |
| ## Missing Data | |
| Another key business rule is how we will handle students with missing data. A | |
| predictive analytics system is more useful if it makes an actionable prediction | |
| for every student. It is good to check, if it is available, the graduation rates | |
| for students with missing data: | |
| ```{r} | |
| table(Grad = sea_data$ontime_grad, | |
| miss_math = is.na(sea_data$scale_score_7_math)) | |
| table(Grad = sea_data$ontime_grad, | |
| miss_abs = is.na(sea_data$pct_days_absent_7)) | |
| ``` | |
| There are a number of options at this point. One is to run a model with fewer | |
| variables for only those students, and then use that model to fill in the | |
| missing indicators. | |
| ```{r} | |
| absence_model <- glm(ontime_grad ~ pct_days_absent_7, | |
| data = sea_data[is.na(sea_data$scale_score_7_math),], | |
| family = "binomial") | |
| ``` | |
| ```{r} | |
| sea_data$grad_pred_2 <- predict(absence_model, newdata = sea_data, | |
| type = "response") | |
| summary(absence_model) | |
| ``` | |
| ```{r} | |
| table(sea_data$grad_indicator, useNA="always") | |
| sea_data$grad_indicator[is.na(sea_data$grad_pred) & | |
| sea_data$grad_pred_2 < new_thresh] <- 0 | |
| sea_data$grad_indicator[is.na(sea_data$grad_pred) & | |
| sea_data$grad_pred_2 >= new_thresh] <- 1 | |
| table(sea_data$grad_indicator, useNA="always") | |
| ``` | |
| We now have predictions for all but a very small share of students, and those | |
| students are split between graduates and non-graduates. We have to apply a rule | |
| or a model to make predictions for them--we can't use information from the | |
| future, except to develop the prediction system. We'll arbitrarily decide to | |
| flag them as potential non-graduates, since students with lots of missing data | |
| might merit some extra attention. | |
| ```{r} | |
| table(sea_data$grad_indicator, sea_data$ontime_grad, useNA = "always") | |
| sea_data$grad_indicator[is.na(sea_data$grad_indicator)] <- 0 | |
| ``` | |
| ## Evaluate Fit | |
| Now we have a complete set of predictions from our simple models. How well does | |
| the prediction system work? Can we do better? | |
| ```{r} | |
| table(Observed = sea_data$ontime_grad, Predicted = sea_data$grad_indicator) %>% | |
| prop.table %>% round(4) | |
| ``` | |
| A confusion matrix is one way to evaluate the success of a model and evaluate | |
| tradeoffs as you are developing prediction systems, but there are others. We | |
| will cover these more in the workshop, but in cases where we have an uneven | |
| proportion of cases in each class (e.g. we have many more graduates than non-graduates), | |
| it can be helpful to look at a metric like the AUC, which stands for "area under the | |
| curve." You'll learn more about ways to evaluate a prediction system, including | |
| the AUC metric, during Day 1 of the workshop, but here's a sneak peak. | |
| First, use the `roc` command to plot the true positive rate (sensitivity in | |
| the graph) against the false positive rate (1-specificity in the graph). | |
| ```{r calculateROC} | |
| roc(sea_data$ontime_grad, sea_data$grad_indicator) %>% plot | |
| ``` | |
| You can also calculate ROC on the continuouse predictor as well, to help you | |
| determine the threshold: | |
| ```{r calculateROC2} | |
| roc(sea_data$ontime_grad, sea_data$grad_pred) %>% plot | |
| ``` | |
| You can also calculate the numeric summaries instead of just the graphs. To | |
| do this let's use the `caret` package: | |
| ```{r} | |
| # We must wrap these each in calls to factor because of how this function expects | |
| # the data to be formatted | |
| caret::confusionMatrix(data = factor(sea_data$grad_indicator), | |
| reference =factor(sea_data$ontime_grad), positive = "1") | |
| ``` | |
| A couple of last thoughts and notes. First, note that so far we haven't done any | |
| out-of-sample testing. We know from the pre-reading that we should never trust | |
| our model fit measures on data the model was fit to -- statistical models are | |
| overly confident. To combat this, you should subdivide your dataset. There are | |
| many strategies you can choose from depending on how much data you have and the | |
| nature of your problem - for the EWS case, we can use the first two cohorts to | |
| build our models and the latter two cohorts to evaluate that fit. | |
| Here is some code you can use to do that: | |
| ```{r} | |
| # In R we can define an index of rows so we do not have to copy our data | |
| train_idx <- row.names(sea_data[sea_data$year %in% c(2003, 2004, 2005),]) | |
| test_idx <- !row.names(sea_data) %in% train_idx | |
| fit_model <- glm(ontime_grad ~ scale_score_7_math + frpl_7, | |
| data = sea_data[train_idx, ], family = "binomial") | |
| sea_data$grad_pred_3 <- predict(fit_model, newdata = sea_data, type = "response") | |
| summary(sea_data$grad_pred_3) | |
| # Check the test index only | |
| summary(sea_data$grad_pred_3[test_idx]) | |
| # calculate matrix of the test_index | |
| table(Observed = sea_data$ontime_grad[test_idx], | |
| Predicted = sea_data$grad_pred_3[test_idx] > new_thresh) %>% | |
| prop.table() %>% round(4) | |
| ``` | |
| Second, should we use subgroup membership variables (such as demographics or | |
| school of enrollment?) to make predictions, if they improve the accuracy of | |
| predictions? This is more a policy question than a technical question, and you | |
| should consider it when you are developing your models. You'll also want to | |
| check to see how accurate your model is for different subgroups. | |
| ## Bonus | |
| The above code will be more than enough to get you started. But, if you want | |
| to reach further into predictive analytics, the next section provides some bonus | |
| syntax and advice for building more complex statistical models. | |
| Logistic regression is where you should start. It is fast to compute, easier to | |
| interpret, and usually does a great job. However, there are a number of alternative | |
| algorithms available to you and R provides a common interface to them through the | |
| `caret` package. The `train` function (as in, training the models) is the | |
| workhorse of the `caret` package. It has an extensive set of user-controlled | |
| options. | |
| When switching away from logistic regression, it can be advantageous to transform | |
| predictors to be centered at 0 with a standard deviation of 1. This helps | |
| put binary and continuous indicators on a more similar scale and helps avoid | |
| problems associated with rounding, large numbers, and optimization algorithms | |
| used to evaluate model fit. Here is an example of how you can do this in R: | |
| ```{r} | |
| library(caret) # machine learning workhorse in R | |
| sea_data <- as.data.frame(sea_data) | |
| # To use caret we have to manually expand our categorical variables. caret | |
| # can use formulas like lm() or glm() but becomes very slow and unreliable | |
| # for some model/algorithm methods when doing so. Better to do it ourselves. | |
| # Expand categorical variables | |
| # First declare any variable we want to be treated as a category as a factor | |
| sea_data$frpl_7 <- factor(sea_data$frpl_7) | |
| sea_data$male <- factor(sea_data$male) | |
| # Use the dummyVars function to build an expansion function to expand our data | |
| expand_factors <- caret::dummyVars(~ race_ethnicity + frpl_7 + male, | |
| data = sea_data) | |
| # Create a matrix of indicator variables by using the predict method for the | |
| # expand_factors object we just created. There are other ways you can do this, but | |
| # the advantage is we can use the `expand_factors` object on future/different | |
| # datasets to preserve factor levels that may not be present in those (like our | |
| # test set perhaps) | |
| fact_vars <- predict(expand_factors, sea_data) | |
| # Get the column names of our categorical dummy variables | |
| categ_vars <- colnames(fact_vars) | |
| # Combine the new dummy variables with our original dataset | |
| sea_data <- cbind(sea_data, fact_vars) | |
| # Drop the matrix of dummy variables | |
| rm(fact_vars) | |
| # Define the numeric/continuous predictors we want to use | |
| continuous_x_vars <- c('scale_score_7_math', 'scale_score_7_read', | |
| 'pct_days_absent_7', 'sch_g7_frpl_per') | |
| # caret does not gracefully handle missing data, so we have to do some additional | |
| # work when we define our training/test data split. You can choose additional | |
| # ways to address missing data (imputation, substituting mean values, etc.) - | |
| # here we opt for the simple but aggressive strategy of excluding any row | |
| # with missing values for any predictor from being in the training data | |
| train_idx <- row.names( # get the row.names to define our index | |
| na.omit(sea_data[sea_data$year %in% c(2003, 2004, 2005), | |
| # reduce the data to rows of specific years and not missing data | |
| c(continuous_x_vars, categ_vars)]) | |
| # only select the predictors we are interested in | |
| ) | |
| test_idx <- !row.names(sea_data[sea_data$year > 2005,]) %in% train_idx | |
| # All of our algorithms will run much faster and more smoothly if we center | |
| # our continuous measures at 0 with a standard deviation of 1. We have | |
| # skipped the important step of identifying and removing outliers, which we | |
| # should make sure to do, but is left up to you! | |
| pre_proc <- caret::preProcess(sea_data[train_idx, continuous_x_vars], | |
| method = c("scale", "center")) | |
| # We now have a pre-processing object which will scale and center our variables | |
| # for us. It will ignore any variables that are not defined within it, so we | |
| # can pass all of our continuous and dummy variables to it to produce our | |
| # final data frame of predictors. | |
| preds <- predict(pre_proc, | |
| sea_data[train_idx, # keep only the training rows | |
| c(continuous_x_vars, categ_vars)] | |
| ) # keep only the columns of dummy and continuous variables | |
| ``` | |
| Once we have defined our predictor variables, we need to tell `train` how we want | |
| to test our models. Most of the algorithms offered through `caret` have "tuning | |
| parameters", user-controlled values, that are not estimated from the data. Our | |
| goal is to experiment with these values and find the values that fit the data | |
| the best. To do this, we must tell `train` which values to try, and how to | |
| evaluate their performance. | |
| Luckily, `train()` has a number of sensible defaults that largely automate this | |
| process for us. For the purpose of this exercise, a good set of defaults is to | |
| use the `twoClassSummary()` model evaluation function (which tells us the area | |
| under the curve as well as the sensitivity, specificity, and accuracy) and | |
| to use repeated cross-fold validation. | |
| ```{r} | |
| # Take advantage of all the processing power on your machine | |
| library(doFuture) | |
| plan(multiprocess(workers = 8)) # define the number of cpus to use | |
| registerDoFuture() # register them with R | |
| # Caret really really really likes if you do binary classification that you | |
| # code the variables as factors with alphabetical labels. In this case, we | |
| # recode 0/1 to be nongrad, grad. | |
| yvar <- sea_data[train_idx, "ontime_grad"] # save only training observations | |
| yvar <- ifelse(yvar == 1, "grad", "nongrad") | |
| yvar <- factor(yvar) | |
| # On a standard desktop/laptop it can be necessary to decrease the sample size | |
| # to train in a reasonable amount of time. For the prototype and getting feedback | |
| # it's a good idea to stick with a reasonable sample size of under 30,000 rows. | |
| # Let's do that here: | |
| train_idx_small <- sample(1:nrow(preds), 2e4) | |
| # Caret has a number of complex options, you can read about under ?trainControl | |
| # Here we set some sensible defaults | |
| example_control <- trainControl( | |
| method = "cv", # we cross-validate our model to avoid overfitting | |
| classProbs = TRUE, # we want to be able to predict probabilities, not just binary outcomes | |
| returnData = TRUE, # we want to store the model data to allow for postestimation | |
| summaryFunction = prSummary, # we want to use the prSummary for better two-class accuracy measures | |
| trim = TRUE, # we want to reduce the size of the final model object to save RAM | |
| savePredictions = "final", # we want to store the predictions from the final model | |
| returnResamp = "final", # we want to store the resampling code to directly compare other methods | |
| allowParallel = TRUE # we want to use all the processors on our computer if possible | |
| ) | |
| # This will take quite some time: | |
| set.seed(2532) # set seed so models are comparable | |
| rpart_model <- train( | |
| y = yvar[train_idx_small], # specify the outcome, here subset | |
| # to the smaller number of observations | |
| x = preds[train_idx_small,], # specify the matrix of predictors, again subset | |
| method = "rpart", # choose the algorithm - here a regression tree | |
| tuneLength = 24, # choose how many values of tuning parameters to test | |
| trControl = example_control, # set up the conditions for the test (defined above) | |
| metric = "AUC" # select the metric to choose the best model based on | |
| ) | |
| set.seed(2532) | |
| # Repeat above but just change the `method` argument to nb for naiveBayes | |
| nb_model <- train(y = yvar[train_idx_small], | |
| x = preds[train_idx_small,], | |
| method = "nb", tuneLength = 24, | |
| trControl = example_control, | |
| metric = "AUC") | |
| # Construct a list of model performance comparing the two models directly | |
| resamps <- resamples(list(RPART = rpart_model, | |
| NB = nb_model)) | |
| # Summarize it | |
| summary(resamps) | |
| # plot it | |
| # see ?resamples for more options | |
| dotplot(resamps, metric = "AUC") | |
| ``` | |
| ## Test Your Results | |
| Now we need to evaluate our performance on our hold-out set of data. | |
| ```{r} | |
| # We use the pre-processing we defined on the training data, but this time | |
| # we create test_data with these values: | |
| test_data <- predict(pre_proc, | |
| sea_data[test_idx, # specify the rows in the test set | |
| c(continuous_x_vars, categ_vars)]) # keep the same vars | |
| # Create a vector of outcomes for the test set | |
| test_y <- sea_data[test_idx, "ontime_grad"] | |
| test_y <- ifelse(test_y == 1, "grad", "nongrad") | |
| test_y <- factor(test_y) | |
| confusionMatrix(reference = test_y, data = predict(rpart_model, test_data)) | |
| # Note that making predictions from the nb classifier can take a long time | |
| # consider alternative models or making fewer predictions if this is a bottleneck | |
| confusionMatrix(reference = test_y, data = predict(nb_model, test_data)) | |
| ``` | |
| We can also make ROC plots of our test-set, out of sample, prediction accuracy. | |
| ```{r} | |
| # ROC plots | |
| yhat <- predict(rpart_model, test_data, type = "prob") | |
| yhat <- yhat$grad | |
| roc(response = test_y, predictor = yhat <- yhat) %>% plot | |
| # NB ROC plot | |
| # Note that making predictions from the nb classifier can take a long time | |
| # consider alternative models or making fewer predictions if this is a bottleneck | |
| yhat <- predict(nb_model, test_data, type = "prob") | |
| yhat <- yhat$grad | |
| roc(response = test_y, predictor = yhat <- yhat) %>% plot | |
| ``` | |
| ## References and More | |
| The `caret` package has extensive documentation which you can use to find | |
| new models, learn about algorithms, and explore many other important aspects | |
| of building predictive analytics. You can learn more here: | |
| https://topepo.github.io/caret/ | |
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment