seankross · November 15, 2016 15:09
diff --git a/test.R b/test.R
 library(dplyr)
 library(forcats)
 library(ggplot2)

 iris.small <- datasets::iris %>%
  dplyr::filter(Species != "setosa") %>%
  dplyr::mutate(Species = fct_drop(Species)) %>%
  dplyr::group_by(Species) %>%
  dplyr::summarize(avg_sl = mean(Sepal.Length),
            avg_sw = mean(Sepal.Width),
            avg_pl = mean(Petal.Length),
            avg_pw = mean(Petal.Width))

 set.seed(2016-11-14)

 iris.big <- data_frame(Species = as.factor(c(rep("versicolor", 500), rep("virginica", 500))),
                       sl = c(rnorm(500, iris.small$avg_sl[1]), rnorm(500, iris.small$avg_sl[2])),
                       sw = c(rnorm(500, iris.small$avg_sw[1]), rnorm(500, iris.small$avg_sw[2])),
                       pl = c(rnorm(500, iris.small$avg_pl[1]), rnorm(500, iris.small$avg_pl[2])),
                       pw = c(rnorm(500, iris.small$avg_pw[1]), rnorm(500, iris.small$avg_pw[2])))

 train_fraction <- 0.5 #fraction of data for training purposes
 n_obs <- nrow(iris.big)
 train_size <- floor(train_fraction * nrow(iris.big))
 train_indices <- sample(n_obs, size=train_size, replace=TRUE)  #sample(x, size, replace = FALSE, prob = NULL)x	Either a (numeric, complex, character or logical) vector of more than one element from which to choose, or a positive integer.size	non-negative integer giving the number of items to choose. replace	Should sampling be with replacement? prob	A vector of probability weights for obtaining the elements of the vector being sampled
 train_data <- iris.big[train_indices, ]
 test_data <- iris.big[-train_indices, ]
 glm.out.train <- glm(Species ~ sl + sw + pl + pw, data=train_data, family = "binomial")
 test_pred <- predict(glm.out.train, test_data, type='response')
 calc_ROC <- function(probabilities, known_truth, model.name=NULL)
 {
  outcome <- as.numeric(factor(known_truth))-1
  pos <- sum(outcome) # total known positives
  neg <- sum(1-outcome) # total known negatives
  pos_probs <- outcome*probabilities # probabilities for known positives
  neg_probs <- (1-outcome)*probabilities # probabilities for known negatives
  true_pos <- sapply(probabilities,
                     function(x) sum(pos_probs>=x)/pos) # true pos. rate
  false_pos <- sapply(probabilities,
                      function(x) sum(neg_probs>=x)/neg)
  if (is.null(model.name))
    result <- data.frame(true_pos, false_pos)
  else
    result <- data.frame(true_pos, false_pos, model.name)
  result %>% dplyr::arrange(false_pos, true_pos)
 }
 ROC.train <- calc_ROC(probabilities=test_pred, known_truth=train_data$Species, model.name="train")
 ROC.test <- calc_ROC(probabilities=test_pred, known_truth=test_data$Species, model.name="test")
 ROCs <- rbind(ROC.train, ROC.test)
 ggplot(ROCs, aes(x=false_pos, y=true_pos, color=model.name)) + geom_line() + xlim(0, 0.25)
 ROCs %>% dplyr::group_by(model.name) %>% dplyr::mutate(delta=false_pos-lag(false_pos)) %>% dplyr::summarize(AUC=sum(delta*true_pos, na.rm=T)) %>% dplyr::arrange(desc(AUC))
	library(dplyr)
	library(forcats)
	library(ggplot2)

	iris.small <- datasets::iris %>%
	dplyr::filter(Species != "setosa") %>%
	dplyr::mutate(Species = fct_drop(Species)) %>%
	dplyr::group_by(Species) %>%
	dplyr::summarize(avg_sl = mean(Sepal.Length),
	avg_sw = mean(Sepal.Width),
	avg_pl = mean(Petal.Length),
	avg_pw = mean(Petal.Width))

	set.seed(2016-11-14)

	iris.big <- data_frame(Species = as.factor(c(rep("versicolor", 500), rep("virginica", 500))),
	sl = c(rnorm(500, iris.small$avg_sl[1]), rnorm(500, iris.small$avg_sl[2])),
	sw = c(rnorm(500, iris.small$avg_sw[1]), rnorm(500, iris.small$avg_sw[2])),
	pl = c(rnorm(500, iris.small$avg_pl[1]), rnorm(500, iris.small$avg_pl[2])),
	pw = c(rnorm(500, iris.small$avg_pw[1]), rnorm(500, iris.small$avg_pw[2])))

	train_fraction <- 0.5 #fraction of data for training purposes
	n_obs <- nrow(iris.big)
	train_size <- floor(train_fraction * nrow(iris.big))
	train_indices <- sample(n_obs, size=train_size, replace=TRUE) #sample(x, size, replace = FALSE, prob = NULL)x Either a (numeric, complex, character or logical) vector of more than one element from which to choose, or a positive integer.size non-negative integer giving the number of items to choose. replace Should sampling be with replacement? prob A vector of probability weights for obtaining the elements of the vector being sampled
	train_data <- iris.big[train_indices, ]
	test_data <- iris.big[-train_indices, ]
	glm.out.train <- glm(Species ~ sl + sw + pl + pw, data=train_data, family = "binomial")
	test_pred <- predict(glm.out.train, test_data, type='response')
	calc_ROC <- function(probabilities, known_truth, model.name=NULL)
	{
	outcome <- as.numeric(factor(known_truth))-1
	pos <- sum(outcome) # total known positives
	neg <- sum(1-outcome) # total known negatives
	pos_probs <- outcome*probabilities # probabilities for known positives
	neg_probs <- (1-outcome)*probabilities # probabilities for known negatives
	true_pos <- sapply(probabilities,
	function(x) sum(pos_probs>=x)/pos) # true pos. rate
	false_pos <- sapply(probabilities,
	function(x) sum(neg_probs>=x)/neg)
	if (is.null(model.name))
	result <- data.frame(true_pos, false_pos)
	else
	result <- data.frame(true_pos, false_pos, model.name)
	result %>% dplyr::arrange(false_pos, true_pos)
	}
	ROC.train <- calc_ROC(probabilities=test_pred, known_truth=train_data$Species, model.name="train")
	ROC.test <- calc_ROC(probabilities=test_pred, known_truth=test_data$Species, model.name="test")
	ROCs <- rbind(ROC.train, ROC.test)
	ggplot(ROCs, aes(x=false_pos, y=true_pos, color=model.name)) + geom_line() + xlim(0, 0.25)
	ROCs %>% dplyr::group_by(model.name) %>% dplyr::mutate(delta=false_pos-lag(false_pos)) %>% dplyr::summarize(AUC=sum(delta*true_pos, na.rm=T)) %>% dplyr::arrange(desc(AUC))
No results found