```{r setup, include=FALSE}
rm(list = ls())
knitr::opts_chunk$set(echo = TRUE)
library(caret)
library(ggplot2)
```
# Cross-validation example: the wrong way and the right way
```{r make dataset} | |
set.seed(6)
n.features = 1e5        # total number of features in the data (e.g. genes)
n.features.select = 1e2 # number of features to keep after screening
n.examples = 50         # number of examples (e.g. patients)
# create completely random labels for the occurrence of heart disease in patients
labels = round(runif(n.examples, min = 1, max = 2))
data = data.frame(round(matrix(runif(n.examples*n.features, min = 1, max = 2),
                               n.examples, n.features)))
data$y <- as.factor(labels)
```
```{r subset features} | |
# function to select the features most correlated with the disease label
best.subset <- function(data, n.features.select = 50){
  data$y <- as.numeric(data$y)
  correlations <- apply(data[, -which(names(data) == "y")], 2, cor, y = data$y)
  selected.features <- order(correlations, decreasing = TRUE)[1:n.features.select]
  selected.features <- names(correlations[selected.features])
  selected.data <- data[, c(selected.features, 'y')]
  selected.data$y <- as.factor(selected.data$y)
  return(selected.data)
}
selected.data <- best.subset(data, n.features.select)
```
```{r fit model to selected features. WRONG way of doing it} | |
# define the cross-validation scheme
folds <- 5
train_control <- trainControl(method = "cv", number = folds)
# train the model on the already-screened features
model <- train(y ~ ., data = selected.data,
               trControl = train_control, method = 'naive_bayes')
# print(model)
sprintf('Classification accuracy when CV is performed after subset selection = %0.0f%%',
        100*model$results$Accuracy[1])
```
[1] "Classification accuracy when CV is performed after subset selection= 99%" | |
This is called cherry-picking the data, and it gives a completely misleading estimate
of model error: the labels are random, so honest accuracy should be close to 50%, yet
the reported accuracy is near 99% because the features were screened using every example,
including the ones later used for validation. For a true estimate of model accuracy, the
model must never "peek" into the validation fold, which means feature selection has to be
repeated inside the cross-validation loop, using only the training folds, as sketched below.
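Here is a minimal sketch of the right way, assuming `best.subset()`, `data`, `folds`, and `n.features.select` from the chunks above are in scope. It uses caret's `createFolds()` to build the outer folds and pins the `naive_bayes` tuning parameters (`laplace`, `usekernel`, `adjust`) through `tuneGrid` so no inner resampling is needed; the chunk name and variable names are illustrative and not part of the original gist.

```{r fit model with feature selection inside each fold. RIGHT way of doing it}
set.seed(6)
# outer CV folds: each element holds the row indices of one held-out fold
outer.folds <- createFolds(data$y, k = folds)
fold.accuracy <- sapply(outer.folds, function(test.idx){
  train.data <- data[-test.idx, ]
  test.data  <- data[test.idx, ]
  # screen features using ONLY the training folds
  selected.train <- best.subset(train.data, n.features.select)
  # fit naive Bayes on the selected training features (no inner resampling)
  fold.model <- train(y ~ ., data = selected.train,
                      trControl = trainControl(method = "none"),
                      tuneGrid = data.frame(laplace = 0, usekernel = FALSE, adjust = 1),
                      method = 'naive_bayes')
  # evaluate on the untouched held-out fold
  mean(predict(fold.model, newdata = test.data) == test.data$y)
})
sprintf('Classification accuracy when feature selection is repeated inside each fold = %0.0f%%',
        100*mean(fold.accuracy))
```

Because the labels were generated at random, the accuracy from this nested procedure should hover around chance (roughly 50%), which is the honest number the wrong procedure hides.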