Skip to content

Instantly share code, notes, and snippets.

@seankross
Last active November 15, 2016 15:09
Show Gist options
  • Select an option

  • Save seankross/f8de4cd48fa325e8983c52d6b366ae1b to your computer and use it in GitHub Desktop.

Select an option

Save seankross/f8de4cd48fa325e8983c52d6b366ae1b to your computer and use it in GitHub Desktop.
iris ROC
library(dplyr)
library(forcats)
library(ggplot2)
iris.small <- datasets::iris %>%
dplyr::filter(Species != "setosa") %>%
dplyr::mutate(Species = fct_drop(Species)) %>%
dplyr::group_by(Species) %>%
dplyr::summarize(avg_sl = mean(Sepal.Length),
avg_sw = mean(Sepal.Width),
avg_pl = mean(Petal.Length),
avg_pw = mean(Petal.Width))
set.seed(2016-11-14)
iris.big <- data_frame(Species = as.factor(c(rep("versicolor", 500), rep("virginica", 500))),
sl = c(rnorm(500, iris.small$avg_sl[1]), rnorm(500, iris.small$avg_sl[2])),
sw = c(rnorm(500, iris.small$avg_sw[1]), rnorm(500, iris.small$avg_sw[2])),
pl = c(rnorm(500, iris.small$avg_pl[1]), rnorm(500, iris.small$avg_pl[2])),
pw = c(rnorm(500, iris.small$avg_pw[1]), rnorm(500, iris.small$avg_pw[2])))
train_fraction <- 0.5 #fraction of data for training purposes
n_obs <- nrow(iris.big)
train_size <- floor(train_fraction * nrow(iris.big))
train_indices <- sample(n_obs, size=train_size, replace=TRUE) #sample(x, size, replace = FALSE, prob = NULL)x Either a (numeric, complex, character or logical) vector of more than one element from which to choose, or a positive integer.size non-negative integer giving the number of items to choose. replace Should sampling be with replacement? prob A vector of probability weights for obtaining the elements of the vector being sampled
train_data <- iris.big[train_indices, ]
test_data <- iris.big[-train_indices, ]
glm.out.train <- glm(Species ~ sl + sw + pl + pw, data=train_data, family = "binomial")
test_pred <- predict(glm.out.train, test_data, type='response')
calc_ROC <- function(probabilities, known_truth, model.name=NULL)
{
outcome <- as.numeric(factor(known_truth))-1
pos <- sum(outcome) # total known positives
neg <- sum(1-outcome) # total known negatives
pos_probs <- outcome*probabilities # probabilities for known positives
neg_probs <- (1-outcome)*probabilities # probabilities for known negatives
true_pos <- sapply(probabilities,
function(x) sum(pos_probs>=x)/pos) # true pos. rate
false_pos <- sapply(probabilities,
function(x) sum(neg_probs>=x)/neg)
if (is.null(model.name))
result <- data.frame(true_pos, false_pos)
else
result <- data.frame(true_pos, false_pos, model.name)
result %>% dplyr::arrange(false_pos, true_pos)
}
ROC.train <- calc_ROC(probabilities=test_pred, known_truth=train_data$Species, model.name="train")
ROC.test <- calc_ROC(probabilities=test_pred, known_truth=test_data$Species, model.name="test")
ROCs <- rbind(ROC.train, ROC.test)
ggplot(ROCs, aes(x=false_pos, y=true_pos, color=model.name)) + geom_line() + xlim(0, 0.25)
ROCs %>% dplyr::group_by(model.name) %>% dplyr::mutate(delta=false_pos-lag(false_pos)) %>% dplyr::summarize(AUC=sum(delta*true_pos, na.rm=T)) %>% dplyr::arrange(desc(AUC))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment