Last active
November 15, 2016 15:09
-
-
Save seankross/f8de4cd48fa325e8983c52d6b366ae1b to your computer and use it in GitHub Desktop.
iris ROC
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(dplyr) | |
| library(forcats) | |
| library(ggplot2) | |
| iris.small <- datasets::iris %>% | |
| dplyr::filter(Species != "setosa") %>% | |
| dplyr::mutate(Species = fct_drop(Species)) %>% | |
| dplyr::group_by(Species) %>% | |
| dplyr::summarize(avg_sl = mean(Sepal.Length), | |
| avg_sw = mean(Sepal.Width), | |
| avg_pl = mean(Petal.Length), | |
| avg_pw = mean(Petal.Width)) | |
| set.seed(2016-11-14) | |
| iris.big <- data_frame(Species = as.factor(c(rep("versicolor", 500), rep("virginica", 500))), | |
| sl = c(rnorm(500, iris.small$avg_sl[1]), rnorm(500, iris.small$avg_sl[2])), | |
| sw = c(rnorm(500, iris.small$avg_sw[1]), rnorm(500, iris.small$avg_sw[2])), | |
| pl = c(rnorm(500, iris.small$avg_pl[1]), rnorm(500, iris.small$avg_pl[2])), | |
| pw = c(rnorm(500, iris.small$avg_pw[1]), rnorm(500, iris.small$avg_pw[2]))) | |
| train_fraction <- 0.5 #fraction of data for training purposes | |
| n_obs <- nrow(iris.big) | |
| train_size <- floor(train_fraction * nrow(iris.big)) | |
| train_indices <- sample(n_obs, size=train_size, replace=TRUE) #sample(x, size, replace = FALSE, prob = NULL)x Either a (numeric, complex, character or logical) vector of more than one element from which to choose, or a positive integer.size non-negative integer giving the number of items to choose. replace Should sampling be with replacement? prob A vector of probability weights for obtaining the elements of the vector being sampled | |
| train_data <- iris.big[train_indices, ] | |
| test_data <- iris.big[-train_indices, ] | |
| glm.out.train <- glm(Species ~ sl + sw + pl + pw, data=train_data, family = "binomial") | |
| test_pred <- predict(glm.out.train, test_data, type='response') | |
| calc_ROC <- function(probabilities, known_truth, model.name=NULL) | |
| { | |
| outcome <- as.numeric(factor(known_truth))-1 | |
| pos <- sum(outcome) # total known positives | |
| neg <- sum(1-outcome) # total known negatives | |
| pos_probs <- outcome*probabilities # probabilities for known positives | |
| neg_probs <- (1-outcome)*probabilities # probabilities for known negatives | |
| true_pos <- sapply(probabilities, | |
| function(x) sum(pos_probs>=x)/pos) # true pos. rate | |
| false_pos <- sapply(probabilities, | |
| function(x) sum(neg_probs>=x)/neg) | |
| if (is.null(model.name)) | |
| result <- data.frame(true_pos, false_pos) | |
| else | |
| result <- data.frame(true_pos, false_pos, model.name) | |
| result %>% dplyr::arrange(false_pos, true_pos) | |
| } | |
| ROC.train <- calc_ROC(probabilities=test_pred, known_truth=train_data$Species, model.name="train") | |
| ROC.test <- calc_ROC(probabilities=test_pred, known_truth=test_data$Species, model.name="test") | |
| ROCs <- rbind(ROC.train, ROC.test) | |
| ggplot(ROCs, aes(x=false_pos, y=true_pos, color=model.name)) + geom_line() + xlim(0, 0.25) | |
| ROCs %>% dplyr::group_by(model.name) %>% dplyr::mutate(delta=false_pos-lag(false_pos)) %>% dplyr::summarize(AUC=sum(delta*true_pos, na.rm=T)) %>% dplyr::arrange(desc(AUC)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment