Calculating F-measure with Random Forest
#################################
##Programmer: James E. Yonamine
##Helpful edits: Peter Li
##Date: 9/2/2012
##Random Forest Alpha Cutoff Function
#################################
##load packages and data
library(foreign)
library(MASS)
library(randomForest)
rm(list=ls())
set.seed(2012)
data<-read.dta("master.dta") #make sure to set your working directory
data<-na.omit(data)
data.x<-data[,3:153] #extract covariates
data.x<-as.matrix(data.x)
for (i in 1:ncol(data.x)) {
  data.x[,i] <- (data.x[,i] - mean(data.x[,i])) / sd(data.x[,i])
}
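#NOTE (an equivalent one-liner, not in the original script): base R's scale()
#centers each column and divides by its standard deviation, so the loop above
#could be replaced by:
#data.x <- scale(data.x)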
data.y<-data$onset
#data.y.0 <- data.y - mean(data.y)
#y.matrix<-as.matrix(data.y.0)
data.y<-as.matrix(data.y)
x.train<-data.x[1:1960,]
x.test<-data.x[1961:3920,]
y.train<-data.y[1:1960,]
y.test<-data.y[1961:3920,]
train<-data.frame(y.train, x.train) #data frames (not matrices) so the formula interface below works
test<-data.frame(y.test, x.test)
#####################################################
##############Random Forest F-measure################
#####################################################
#parameters the user needs to specify
#y.train = vector of binary dependent variables for the training set
#train = data frame of x covariates that correspond to y.train
#mtry = number of covariates to sample for each tree
#sim = number of trees to grow
sim<-100
mtry<-3
####loop the random forest###
#normally, you would change the ntree value within the randomForest package. By running ntree=1 in a loop,
#we save each tree's prediction for each observation in the history matrix. This lets us see exactly how
#many of the n=sim trees predict a "0" or a "1" for each observation.
history <- matrix(data=0, nrow=nrow(test), ncol=sim)
for (k in 1:sim){
  rf.fit <- randomForest(as.factor(y.train)~., data=train, ntree=1, mtry=mtry)
  rf.predict <- predict(rf.fit, newdata=test)
  history[,k] <- as.numeric(as.character(rf.predict)) #convert factor labels "0"/"1" to numeric 0/1
}
vote.count<-as.matrix(rowSums(history)) #number of trees voting "1" for each observation
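#ASIDE (an alternative sketch, not part of the original script): the randomForest
#package can return per-class vote counts directly from a single forest of
#ntree=sim trees, which gives counts comparable to vote.count (up to the
#randomness of each tree's bootstrap sample):
rf.all <- randomForest(as.factor(y.train)~., data=train, ntree=sim, mtry=mtry)
votes <- predict(rf.all, newdata=test, type="vote", norm.votes=FALSE)
vote.count.alt <- as.matrix(votes[,"1"]) #trees voting class "1" per test observation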
####calculate accuracy###
#Now that we have all of the tree predictions, we want to calculate the model's predictive accuracy at every
#possible cutpoint (i.e., if sim=100, we test accuracy at every integer cutpoint from 1-100). Performance is
#a matrix with each row corresponding to an observation in the test set. The "rforest" column reflects how
#many of the sim trees predict that observation to be of class "1". "y.hat" is the prediction based on the
#cutpoint. y = the true class.
performance<-matrix(data=0, nrow=nrow(vote.count), ncol=7)
colnames(performance)<-c("rforest", "y.hat", "y", "true.pos", "true.neg", "false.pos", "false.neg")
performance[,1]<-vote.count
performance[,3]<-y.test
#the compare matrix stores the accuracy scores for each cutpoint, one row per cutpoint.
compare<-matrix(data=0, nrow=sim, ncol=8)
colnames(compare)<-c("cutoff", "precision", "recall", "sensitivity", "specificity", "PPV", "NPV", "break")
for (j in 1:sim){
  performance[,2] <- performance[,1] > j #sets the cutoff iteratively from 1-sim
  performance[,4] <- performance[,2]==1 & performance[,3]==1
  performance[,5] <- performance[,2]==0 & performance[,3]==0
  performance[,6] <- performance[,2]==1 & performance[,3]==0
  performance[,7] <- performance[,2]==0 & performance[,3]==1
  true.ones<-sum(performance[,3])
  true.zeros<-nrow(performance)-true.ones
  precision<-sum(performance[,4])/sum(performance[,2])
  recall<-sum(performance[,4])/sum(performance[,3])
  sensitivity<-sum(performance[,4])/true.ones
  specificity<-sum(performance[,5])/true.zeros
  pos.pred.value<-sum(performance[,4])/(sum(performance[,4])+sum(performance[,6]))
  neg.pred.value<-sum(performance[,5])/(sum(performance[,5])+sum(performance[,7]))
  compare[j,1]<-j
  compare[j,2]<-precision
  compare[j,3]<-recall
  compare[j,4]<-sensitivity
  compare[j,5]<-specificity
  compare[j,6]<-pos.pred.value
  compare[j,7]<-neg.pred.value
  compare[j,8]<-abs(precision-recall)
}
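#Sanity check (an addition, not in the original script): the per-cutpoint cell
#counts computed above can be cross-checked for any single cutpoint with a
#confusion matrix, e.g. a majority-vote cutpoint when sim=100:
y.hat <- as.numeric(vote.count > 50)
table(predicted=y.hat, actual=y.test)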
compare[is.na(compare)]<-0
sub<-subset(compare, compare[,2]>0) #this eliminates cutpoints that predict all "1's" or all "0's".
#If all "1's" or all "0's" gives the best prediction, then why did you run a model in the first place???
cutpoint<-sub[which.min(sub[,8]), 1] #precision-recall will rarely equal 0 exactly, so take the cutpoint that gets closest to 0.
###Print the results###
cutpoint #this is the cutpoint
compare[cutpoint, 4] #sensitivity
compare[cutpoint, 5] #specificity
compare[cutpoint, 6] #PPV
compare[cutpoint, 7] #NPV
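#EXTENSION (a sketch, not part of the original script): the cutpoint above is
#the precision-recall break-even point. To instead pick the cutpoint that
#maximizes the F-measure itself, F1 = 2*precision*recall/(precision+recall):
f1 <- 2*compare[,2]*compare[,3]/(compare[,2]+compare[,3])
f1[is.na(f1)] <- 0
which.max(f1) #cutpoint with the highest F1 score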