Austin R Users Group Meetup - FasteR Naive Bayes
library(tm)         #Text Mining
library(klaR)       #Naive Bayes
library(parallel)   #R core parallel functions
library(data.table) #Improved version of data.frame with optimized helper methods like fread
library(caret)      #Helps us analyze the results
library(ggplot2)    #Grammar of Graphics, very popular package
library(profr)      #Profiling visualization (used in section 8)
#------------------------------------------------------
# Data: Download the 20 Newsgroups dataset here:
#   http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html
#------------------------------------------------------
#------------------------------------------------------
# 1: Piece everything together
#------------------------------------------------------
run.nBayes <- function(dir.path) {
  set.seed(23452)     #Repeatable randomness
  options(mc.cores=4) #Sets value globally, important for 3rd party packages

  #Create initial dataset
  all.data <- create.dataset.parallel(dir.path)

  #Split into training vs test
  training.indexes <- sample(nrow(all.data), nrow(all.data)*0.9) #Randomly select docs for our training set
  training.data <- all.data[training.indexes,] #Subset all columns for matching rows
  test.data <- all.data[-training.indexes,]    #'-' gives non-matches

  #Create TDM from just training data
  training.tdm <- get.tdm(training.data$.text)

  #Based on TDM, create doc 'term' vars
  training.data <- create.doc.vars.parallel(training.data, training.tdm)
  test.data <- create.doc.vars.parallel(test.data, training.tdm)

  #Train our model
  trained.model <- train.klaR.nBayes(training.data)

  #Test & analyze
  predictions <- classify.klaR.nBayes.parallel(test.data, trained.model)
  predicted <- predictions$class
  actual <- test.data$.class

  return(list(predicted=predicted, actual=actual))
}
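#Example usage (a sketch; the path is hypothetical and assumes the 20 Newsgroups
#archive has been extracted to a local directory):
# results <- run.nBayes("~/data/20_newsgroups")
# analyze.results(results$predicted, results$actual)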
#------------------------------------------------------
# 2: Process a given document to extract information
#------------------------------------------------------
get.post.words <- function(file.path) {
  #Returns all the words in the 'subject' and 'body' of a newsgroup post located at file.path and extracts the 'class'
  #Extract the newsgroup 'class' by taking the next-to-last piece of the path when split by the directory delimiter
  #The subdir specifies the 'class'; each file represents a single post
  parts.list <- strsplit(file.path, "/") #Assumes "/" is the delimiter in the path parameter!!!
  newsgroup <- parts.list[[1]][length(parts.list[[1]])-1] #This shows how to index a list object
  filename <- parts.list[[1]][length(parts.list[[1]])]

  #Now get the contents of the file
  con <- file(file.path, open="rt", encoding="latin1") #Read in text mode
  text.list <- readLines(con) #Now we have all the text, one line per element
  close(con)

  #Get the subject without the 'Subject:' prefix. This line always starts with 'Subject:'
  subject <- sub("Subject:", "", grep("Subject:", text.list, value=T))

  #Body is everything after the first blank line
  blank.index <- min(which(nchar(text.list)==0)) #'which' returns the indices where the condition is TRUE
  body.list <- text.list[(blank.index+1):length(text.list)] #Note the parentheses: without them ':' binds tighter than '+' and produces NA entries
  cleansed.body.list <- body.list[!is.na(body.list)] #Defensive: drop NAs. readLines already strips newlines, so no gsub is needed

  #Get all the text as a single variable
  all.text <- paste(c(subject, cleansed.body.list), collapse=" ") #Collapse subject + body to a single 'string'. paste = concatenate

  #Save the filename, class and text. filename is useful for debugging; consider dropping it in large runs
  result <- list(.file=filename, .class=newsgroup, .text=all.text) #A named 'list' is more efficient than a data.frame in many cases. Use '.' prefixes so these don't collide with the 'term' columns added later
  return(result) #The return statement is not required, but makes code more readable
}
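#Sketch of the expected result for one hypothetical file from the extracted archive:
# get.post.words("~/data/20_newsgroups/sci.space/60915")
# -> list(.file="60915", .class="sci.space", .text="<subject + body as one string>")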
#------------------------------------------------------
# 3: Process directories
#------------------------------------------------------
create.dataset <- function(dir.path) {
  all.files <- list.files(dir.path, recursive=T, full.names=T)
  data.list <- lapply(all.files, function(i) get.post.words(i)) #Iterate over the file paths in all.files and process each file
  result <- do.call(rbind.data.frame, data.list) #Turn the list of lists into a data.frame
  result$.text <- as.character(result$.text) #data.frame by default turns all char columns into factors
  return(result)
}

create.dataset.parallel <- function(dir.path) {
  #A parallelized implementation of the above
  all.files <- list.files(dir.path, recursive=T, full.names=T)
  data.list <- mclapply(all.files, function(i) get.post.words(i)) #mclapply from package 'parallel' assigns files to each available core
  result <- rbindlist(data.list) #data.table's optimized lists -> data.table conversion. data.table inherits from data.frame and adds optimizations
  result$.class <- as.factor(result$.class) #rbindlist leaves char as char, but we want .class as a factor
  return(result)
}
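#Quick serial-vs-parallel timing comparison (a sketch; path hypothetical):
# system.time(create.dataset("~/data/20_newsgroups"))          #lapply, one core
# system.time(create.dataset.parallel("~/data/20_newsgroups")) #mclapply across mc.cores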
#------------------------------------------------------
# 4: Build TermDocumentMatrix for a given Corpus of text
#------------------------------------------------------
get.tdm <- function(doc.vec, minDocFreq=min(10, length(doc.vec)/10)) {
  doc.corpus <- Corpus(VectorSource(doc.vec))
  #The options below are applied in the order listed, so reordering them changes the results
  control <- list(tolower=TRUE,
                  removePunctuation=TRUE,
                  removeNumbers=TRUE,
                  stopwords=TRUE,
                  bounds=list(global=c(minDocFreq, Inf)), #Sets the min/max number of docs in which a given term must occur
                  weighting=weightTf #Other options are available, such as TF-IDF (weightTfIdf)
                 )
  #TermDocumentMatrix uses mclapply under the hood, yay!!!
  doc.tdm <- TermDocumentMatrix(doc.corpus, control)
  return(doc.tdm)
}
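#Inspecting a small TDM (a sketch):
# tdm <- get.tdm(c("The boy ran fast.", "The girl ran slow."), minDocFreq=1)
# inspect(tdm)          #tm's view of the sparse term-document matrix
# findFreqTerms(tdm, 2) #terms with total frequency >= 2; here just "ran"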
#------------------------------------------------------
# 5: Translate the TDMs for a given doc into column variables
#------------------------------------------------------
create.doc.vars.parallel <- function(data, training.tdm) {
  #Returns the data with one T/F word-occurrence variable per term in the training corpus
  #Create a separate tdm for each of the docs
  doc.tdm.list <- mclapply(data$.text, function(i) get.tdm(i, minDocFreq=1))

  #Get the indexes of the matching terms, one result per doc
  training.terms <- training.tdm$dimnames$Terms
  #Matches are the indexes in the doc.tdm.list terms that match the training.terms corpus
  matches.list <- mclapply(doc.tdm.list, function(i) match(training.terms, i$dimnames$Terms))

  #Create variables based on those matches
  vars.list <- mclapply(1:length(matches.list), function(i) {
    r <- sapply(1:length(matches.list[[i]]), function(j) { #sapply is lapply that is 'simplified' to a vector
      val <- doc.tdm.list[[i]]$v[matches.list[[i]][j]]
      ifelse(is.na(val), 'F', 'T')
    })
    return(r)
  })

  #An equivalent mcmapply formulation, iterating over the two lists in lock-step:
  # vars.list <- mcmapply(FUN=function(i, j) ifelse(is.na(i$v[j]), 'F', 'T'),
  #                       doc.tdm.list, matches.list, SIMPLIFY=FALSE)

  #Only use lapply here; the task is small enough that the overhead of mclapply is not warranted
  named.vars <- lapply(vars.list, function(i) setNames(i, training.tdm$dimnames$Terms))
  named.vars.df <- data.frame(do.call(rbind, named.vars)) #Build a data.frame we can combine with the original. Note: can't use rbindlist, as it expects a list of lists/data.frames/data.tables and we have character vectors
  result <- data.frame(c(data, named.vars.df))
  return(result)
}
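#A quick sanity check on the expanded data (a sketch):
# dim(training.data) #rows = docs; cols = .file, .class, .text plus one T/F column per training term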
#------------------------------------------------------
# 6: Train and utilize our Naive Bayes model using klaR directly
#------------------------------------------------------
train.klaR.nBayes <- function(training.data) {
  training.data.sub <- subset(training.data, select=-c(.file,.text)) #Remove the unused columns
  trained.model <- NaiveBayes(formula=.class ~ .,
                              data=training.data.sub,
                              fL=0,        #No Laplace smoothing; zero probabilities are floored by 'threshold' at predict time instead
                              usekernel=F) #Gaussian densities for numeric vars (we have none; all our vars are T/F factors)
  return(trained.model)
}
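#Peeking inside the fitted model (a sketch; 'hockey' stands in for any real term column):
# trained.model$apriori            #class prior probabilities
# trained.model$tables[['hockey']] #P(term present | class) table for that variable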
classify.klaR.nBayes.parallel <- function(test.data, trained.model) {
  test.data.sub <- subset(test.data, select=-c(.file,.class,.text)) #Remove unused columns
  result.list <- mclapply(1:nrow(test.data.sub), function(i) fastpredict.NaiveBayes(trained.model, test.data.sub[i,]))
  #Reformat the results to match the basic predict.NaiveBayes return value: a named list(class, posterior) covering all nrow(test.data) docs
  classes <- sapply(result.list, function(i) i$class)
  posteriors <- t(sapply(result.list, function(i) i$posterior))
  result.formatted <- list(class=classes, posterior=posteriors)
  rownames(result.formatted$posterior) <- 1:nrow(test.data)
  colnames(result.formatted$posterior) <- colnames(result.list[[1]]$posterior)
  return(result.formatted)
}
#------------------------------------------------------
# 7: Use caret to analyze results
#------------------------------------------------------
analyze.results <- function(predicted, actual) {
  cm <- confusionMatrix(data=predicted, reference=actual) #Using package 'caret'
  values <- cm$byClass[,1] #First column (Sensitivity) from all rows
  classes <- sub('Class: ', '', rownames(cm$byClass))

  par(mar=c(12,4,2,2)) #Extra-large bottom margin
  #This would go to Rplots.pdf in batch mode (e.g. with Rscript)
  #Use getOption("device") to see the default device for the given environment (normally windows() or X11())
  barplot(height=values,
          names.arg=classes,
          las=2,
          ylab='Sensitivity',
          main='Sensitivity by Newsgroup',
          col=rainbow(length(classes))
         )

  #A ggplot example
  #Very popular package, more control over how graphics are built up, good 'gallery' and documentation
  plot.data <- data.frame(value=values, class=classes)
  plot.ggplot <- ggplot(data=plot.data, aes(x=class, y=value, fill=class, label=value)) +
    geom_bar(stat='identity') +
    geom_text(size=8) +
    labs(title='Sensitivity by Newsgroup', y='Sensitivity', x='Newsgroup')

  #Send output to png
  png(file="myplot.png") #Opens a png 'device'
  print(plot.ggplot)     #Need an explicit print() for ggplot objects in batch mode; interactive mode prints implicitly
  dev.off()
}
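#Other handy pieces of caret's confusionMatrix object (a sketch):
# cm$overall['Accuracy'] #single overall accuracy figure
# cm$table               #the raw confusion matrix, prediction vs reference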
#------------------------------------------------------
# 8: Extra Credit - Code profiling
# line.profiling is only available in R 3.0.0+
#------------------------------------------------------
run.Rprof <- function(dir.path) {
  #Profile the serial implementation
  Rprof(filename="rprof.out", memory.profiling=F, line.profiling=T) #Be sure to source() the code for proper line counting
  data <- create.dataset(dir.path) #dir.path is a parameter here; it must not be assumed to exist in scope
  Rprof(NULL) #Stop profiling
  summaryRprof(filename="rprof.out", lines="show")
  plot(parse_rprof("rprof.out")) #parse_rprof is from package 'profr'
}
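#Usage sketch ('fasteR_naive_bayes.R' is a hypothetical filename for this script):
# source("fasteR_naive_bayes.R") #source() so line.profiling can attribute samples to lines
# run.Rprof("~/data/20_newsgroups")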
#------------------------------------------------------
# 9: Speedup klaR:::predict.NaiveBayes
#------------------------------------------------------
fastpredict.NaiveBayes <- function(object, newdata, threshold = 0.001, ...) {
  if (missing(newdata))
    newdata <- object$x
  if ((!any(is.null(colnames(newdata)), is.null(object$varnames))) &&
      all(is.element(object$varnames, colnames(newdata))))
    newdata <- data.frame(newdata[, object$varnames])
  nattribs <- ncol(newdata)
  isnumeric <- sapply(newdata, is.numeric)
  newdata <- as.matrix(newdata) #klaR uses data.matrix, which converts factors to integer codes and is slower; as.matrix keeps the level labels as characters, which the table lookup below needs
  Lfoo <- function(i) {
    tempfoo <- function(v) {
      nd <- ndata[v]
      if (is.na(nd))
        return(rep(1, length(object$apriori)))
      prob <- if (isnumeric[v]) {
        msd <- object$tables[[v]]
        if (object$usekernel)
          sapply(msd, FUN = function(y) dkernel(x = nd, kernel = y, ...))
        else dnorm(nd, msd[, 1], msd[, 2])
      }
      else {
        object$tables[[v]][, nd]
      }
      prob[prob == 0] <- threshold
      return(prob)
    }
    ndata <- newdata[i, ]
    tempres <- log(sapply(1:nattribs, tempfoo))
    L <- log(object$apriori) + rowSums(tempres)
    if (isTRUE(all.equal(sum(exp(L)), 0)))
      warning("Numerical 0 probability for all classes with observation ", i)
    L
  }
  L <- sapply(1:nrow(newdata), Lfoo)
  classdach <- factor(object$levels[apply(L, 2, which.max)], levels = object$levels)
  posterior <- t(apply(exp(L), 2, function(x) x/sum(x)))
  colnames(posterior) <- object$levels
  rownames(posterior) <- names(classdach) <- rownames(newdata)
  return(list(class = classdach, posterior = posterior))
}
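#Micro-benchmark sketch, comparing against stock klaR (assumes trained.model and
#test.data.sub from sections 6-7 exist in the session):
# system.time(for (i in 1:50) predict(trained.model, test.data.sub[1,])) #dispatches to klaR's predict.NaiveBayes
# system.time(for (i in 1:50) fastpredict.NaiveBayes(trained.model, test.data.sub[1,]))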
#------------------------------------------------------
# 10: A simple data set
#------------------------------------------------------
create.sample.data <- function() {
  sample.data <- rbind(list(.file=1, .class='Run', .text='The boy ran fast.',  boy=T, girl=F, fast=T, slow=F, ran=T, dog=F, want=F, love=F),
                       list(.file=2, .class='Run', .text='The girl ran slow.', boy=F, girl=T, fast=F, slow=T, ran=T, dog=F, want=F, love=F),
                       list(.file=3, .class='Run', .text='He is fast.',        boy=F, girl=F, fast=T, slow=F, ran=F, dog=F, want=F, love=F),
                       list(.file=4, .class='Pet', .text='I love my dog.',     boy=F, girl=F, fast=F, slow=F, ran=F, dog=T, want=F, love=T),
                       list(.file=5, .class='Pet', .text='I want a dog.',      boy=F, girl=F, fast=F, slow=F, ran=F, dog=T, want=T, love=F),
                       list(.file=6, .class='Pet', .text='My dog is slow.',    boy=F, girl=F, fast=F, slow=T, ran=F, dog=T, want=F, love=F)
                      )
  sample.data #rbind on lists returns a list-matrix; see the flattening sketch below
}
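#One way to flatten the list-matrix into a modeling-ready data.frame (a sketch;
#unlisting each column coerces everything to character, so term columns become factors):
# toy <- data.frame(apply(create.sample.data(), 2, unlist), stringsAsFactors=TRUE)
# toy$.text <- as.character(toy$.text)
# train.klaR.nBayes(toy)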
#------------------------------------------------------
# 11: A handy function for showing environment object sizes
#------------------------------------------------------
my.ls <- function(pos=1, sorted=F) {
  .result <- sapply(ls(pos=pos, all.names=TRUE),
                    function(..x) object.size(eval(as.symbol(..x))) / 1048576.0) #Bytes -> MB
  if (sorted) {
    .result <- rev(sort(.result))
  }
  .ls <- as.data.frame(rbind(as.matrix(.result), "**Total"=sum(.result)))
  names(.ls) <- "Size (MB)"
  .ls$"Size (MB)" <- formatC(.ls$"Size (MB)", big.mark=',', digits=0, format='f') #Quote the name: a bare .ls$Size would create a second column
  .ls$Mode <- c(unlist(lapply(rownames(.ls)[-nrow(.ls)],
                              function(x) mode(eval(as.symbol(x))))), '-------')
  .ls[.ls$Mode != 'function',] #Exclude functions
}