Skip to content

Instantly share code, notes, and snippets.

@fernandojunior
Last active August 23, 2016 05:24
Show Gist options
  • Save fernandojunior/252389050e17a45a95d911d75c2b0f33 to your computer and use it in GitHub Desktop.
Save fernandojunior/252389050e17a45a95d911d75c2b0f33 to your computer and use it in GitHub Desktop.
information gain entropy
# modified from https://github.com/philjette/InformationGain/blob/master/InformationGain.R
# https://philjette.wordpress.com/2015/05/31/feature-selection-using-information-gain-in-r/
# plyr provides ddply(), summarise() and here().
library(plyr)
# Load the built-in iris data set into the workspace. Nothing in the visible
# part of this file references it; presumably it is here so the functions can
# be tried out interactively -- confirm before removing.
data(iris)
# Compute the Shannon entropy (in bits) of a vector of class labels.
#
# Arguments:
#   target - vector (factor, character, numeric, ...) of observed labels.
# Returns: a single non-negative numeric value; 0 when there are no
#   non-missing observations.
entropy <- function(target) {
  # table() drops missing values by default, so normalise by the number of
  # values actually tabulated rather than by length(target). Dividing by
  # length(target) would make the probabilities sum to less than 1 whenever
  # the target contains NA.
  counts <- as.vector(table(target))
  # Drop empty classes (e.g. unused factor levels) to avoid 0 * log2(0) = NaN.
  counts <- counts[counts > 0]
  probs <- counts / sum(counts)
  -sum(probs * log2(probs))
}
# Information gain of a numeric feature with respect to a target column.
#
# The feature is discretised with cut() into `bins` intervals, and the gain is
# the entropy of the target minus the size-weighted entropy of the target
# inside each interval.
#
# Arguments:
#   data    - data frame holding both columns.
#   feature - name of the numeric column to evaluate.
#   target  - name of the column whose entropy is being reduced.
#   bins    - number of intervals, passed to cut() as `breaks` (default 4).
# Returns: a single numeric value; 0 when the feature has no non-missing
#   values (matching what IG_cat yields for the same situation).
IG_numeric <- function(data, feature, target, bins = 4) {
  # Extract both columns as plain vectors up front. Looking them up with
  # get() inside summarise() fails when `data` itself has a column called
  # "target" or "feature", because that column shadows the argument. Working
  # on vectors also avoids adding a helper column that could overwrite an
  # existing `cat` column.
  x <- data[[feature]]
  y <- data[[target]]
  if (is.null(x) || is.null(y)) {
    stop("`feature` and `target` must name columns of `data`", call. = FALSE)
  }

  # Strip out rows where the feature is NA.
  keep <- !is.na(x)
  x <- x[keep]
  y <- y[keep]
  if (length(x) == 0) {
    return(0)
  }

  # Entropy of the parent (all retained rows).
  e0 <- entropy(y)

  # labels = FALSE makes cut() return integer bin codes; only the grouping
  # matters here, not the labels.
  bin_id <- cut(x, breaks = bins, labels = FALSE)
  # drop = TRUE keeps only bins that actually contain observations.
  by_bin <- split(y, bin_id, drop = TRUE)

  # Entropy of the target inside each bin and the share of rows in each bin.
  child_e <- vapply(by_bin, entropy, numeric(1))
  p <- lengths(by_bin) / length(y)

  e0 - sum(p * child_e)
}
# Information gain of a categorical feature with respect to a target column.
#
# The gain is the entropy of the target minus the size-weighted entropy of the
# target within each distinct value of the feature.
#
# Arguments:
#   data    - data frame holding both columns.
#   feature - name of the categorical column to evaluate.
#   target  - name of the column whose entropy is being reduced.
# Returns: a single numeric value.
IG_cat <- function(data, feature, target) {
  # Extract both columns as plain vectors up front. Looking them up with
  # get() inside summarise() fails when `data` itself has a column called
  # "target" or "feature", because that column shadows the argument.
  x <- data[[feature]]
  y <- data[[target]]
  if (is.null(x) || is.null(y)) {
    stop("`feature` and `target` must name columns of `data`", call. = FALSE)
  }

  # Strip out rows where the feature is NA.
  keep <- !is.na(x)
  x <- x[keep]
  y <- y[keep]

  # Entropy of the parent (all retained rows).
  e0 <- entropy(y)

  # One group per observed feature value; drop = TRUE discards unused levels.
  by_value <- split(y, x, drop = TRUE)

  # Entropy of the target inside each group and the share of rows per group.
  child_e <- vapply(by_value, entropy, numeric(1))
  p <- lengths(by_value) / length(y)

  e0 - sum(p * child_e)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment