# Austin R Users Group Meetup - FasteR Naive Bayes
library(tm) #Text Mining
library(klaR) #Naive Bayes
library(parallel) #R core parallel (multicore) functions
library(data.table) #Improved version of data.frame with optimized helper methods like fread
library(caret) #Help us analyze the results
library(ggplot2) #Grammar of Graphics, very popular package
library(profr) #Code profiling helpers (parse_rprof, used in section 8)
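#If any of these are missing: install.packages(c("tm", "klaR", "data.table", "caret", "ggplot2", "profr")). 'parallel' ships with base R (>= 2.14.0)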
#------------------------------------------------------
# Data: Download the 20 Newsgroups dataset here: http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html
#------------------------------------------------------
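#Usage sketch (the path is an assumption; point it at the unpacked '20_newsgroups' directory):
# results <- run.nBayes("20_newsgroups")
# analyze.results(results$predicted, results$actual)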
#------------------------------------------------------
# 1: Piece everything together
#------------------------------------------------------
run.nBayes <- function(dir.path) {
set.seed(23452) #Repeatable randomness
options(mc.cores=4) #Sets value globally, important for 3rd party packages
#Create initial dataset
all.data <- create.dataset.parallel(dir.path)
#Split into training vs test
training.indexes <- sample(nrow(all.data), nrow(all.data)*0.9) #Randomly select docs for our training set
training.data <- all.data[training.indexes,] #Subset all columns for matching rows
test.data <- all.data[-training.indexes,] # '-' gives non-matches
#Create TDM from just training data
training.tdm <- get.tdm(training.data$.text)
#Based on TDM, create doc 'term' vars
training.data <- create.doc.vars.parallel(training.data, training.tdm)
test.data <- create.doc.vars.parallel(test.data, training.tdm)
#Train our model
trained.model <- train.klaR.nBayes(training.data)
#Test & Analyze
predictions <- classify.klaR.nBayes.parallel(test.data, trained.model)
predicted <- predictions$class
actual <- test.data$.class
return(list(predicted=predicted, actual=actual))
}
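#Note: options(mc.cores=4) above assumes a 4-core machine; parallel::detectCores() reports what is actually available, e.g.
# options(mc.cores=parallel::detectCores())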
#------------------------------------------------------
# 2: Process a given document to extract information
#------------------------------------------------------
get.post.words <- function(file.path) {
#Returns all the words in the 'subject' and 'body' of a newsgroup post located at file.path and extracts the 'class'
#Extract newsgroup 'class' by taking next to last of the path when split by directory delimiter
#The subdir specifies the 'class', the files represent a single post
parts.list <- strsplit(file.path,"/") #Assumes using "/" delimiter in the path parameter!!!
newsgroup <- parts.list[[1]][length(parts.list[[1]])-1] #This shows how to index a list object
filename <- parts.list[[1]][length(parts.list[[1]])]
#Now get contents from the file
con <- file(file.path, open="rt", encoding="latin1") #Read in text mode
text.list <- readLines(con) #Now we have all the text, one line per row in the text list
close(con)
#Get the subject w/o the 'Subject:' prefix. The header line always starts with 'Subject:'
subject <- sub("^Subject: ?", "", grep("^Subject:", text.list, value=T)[1]) #Anchor to the line start so quoted 'Subject:' lines in the body are ignored
#Body is everything after the first blank line
blank.index <- min(which(nchar(text.list)==0)) #'which' returns the indices where the statement is TRUE
body.list <- text.list[(blank.index+1):length(text.list)] #Example of subsetting; note the parentheses, since ':' binds tighter than '+'
cleansed.body.list <- gsub("\n", "", body.list) #Remove newlines
cleansed.body.list <- cleansed.body.list[!is.na(cleansed.body.list)] #Defensive: drop NAs (e.g. if the blank line is the last line of the post)
#Get all the text as a single variable
all.text <- paste(c(subject, cleansed.body.list), collapse=" ") #Collapse subject + body into a single 'string'. paste = concatenate
#Save the filename, class and text. filename is useful for debugging, consider dropping in large runs
result <- list(.file=filename, .class=newsgroup, .text=all.text) #A named 'list' is more efficient than a data.frame in many cases. Use '.' so we don't have columns with same name as we add 'term' columns later
return(result) #The return statement is not required, but makes code more readable
}
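#Example on a single post (the file path is hypothetical; pick any file from the dataset):
# post <- get.post.words("20_newsgroups/sci.space/60941")
# post$.class #Should be "sci.space"
# substr(post$.text, 1, 60) #First 60 characters of subject + body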
#------------------------------------------------------
# 3: Process directories
#------------------------------------------------------
create.dataset <- function(dir.path) {
all.files <- list.files(dir.path, recursive=T, full.names=T)
data.list <- lapply(all.files, function(i) get.post.words(i)) #Iterate over file paths in all.files, and process each file
result <- do.call(rbind.data.frame, data.list) #Turn the list of lists into a data.frame
result$.text <- as.character(result$.text) #data.frame by default turns all char to factor
return(result)
}
create.dataset.parallel <- function(dir.path) {
#A parallelized implementation of the above
all.files <- list.files(dir.path, recursive=T, full.names=T)
data.list <- mclapply(all.files, function(i) get.post.words(i)) #mclapply from package 'parallel', assigns files to each available core
result <- rbindlist(data.list) #data.table package optimized implementation of lists -> data.table. data.table inherits from data.frame and provides optimizations
result$.class <- as.factor(result$.class) #rbindlist leaves char as char, but we want .class as.factor
return(result)
}
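#Quick sketch comparing the serial and parallel versions (assumes the dataset path used above):
# system.time(all.data.serial <- create.dataset("20_newsgroups"))
# system.time(all.data.parallel <- create.dataset.parallel("20_newsgroups"))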
#------------------------------------------------------
# 4: Build TermDocumentMatrix for a given Corpus of text
#------------------------------------------------------
get.tdm <- function(doc.vec, minDocFreq=min(10, length(doc.vec)/10)) {
doc.corpus <- Corpus(VectorSource(doc.vec))
#Each option is applied in the order given, so reordering this list can change the results
control <- list(tolower=TRUE,
removePunctuation=TRUE,
removeNumbers=TRUE,
stopwords=TRUE,
bounds=list(global=c(minDocFreq,Inf)), #Sets the min/max number of docs in which a given term occurs
weighting=weightTf #Other weightings are available, such as TF-IDF (weightTfIdf)
)
#TermDocumentMatrix uses mclapply under the hood, yay!!!
doc.tdm <- TermDocumentMatrix(doc.corpus, control)
return(doc.tdm)
}
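#Tiny self-contained sketch (no dataset needed):
# tiny.tdm <- get.tdm(c("the boy ran fast", "the girl ran slow"), minDocFreq=1)
# Terms(tiny.tdm) #Terms left after lowercasing, punctuation/number/stopword removal
# inspect(tiny.tdm) #Print the term-by-document counts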
#------------------------------------------------------
# 5: Translate the TDMs for a given doc into column variables
#------------------------------------------------------
create.doc.vars.parallel <- function(data, training.tdm) {
#Returns the data with one column per term in the training corpus, marked 'T'/'F' for whether the term occurs in each doc
#Create a separate tdm for each of the docs
doc.tdm.list <- mclapply(data$.text, function(i) get.tdm(i, minDocFreq=1))
#Get the indexes of the matching terms, one result per doc
training.terms <- training.tdm$dimnames$Terms
#Matches are the indexes in doc.tdm.list terms that match the training.terms corpus
matches.list <- mclapply(doc.tdm.list, function(i) match(training.terms, i$dimnames$Terms))
#Create variables based on those matches
vars.list <- mclapply(1:length(matches.list), function(i) {
r <- sapply(1:length(matches.list[[i]]), function(j) { #sapply is lapply that is 'simplified' to a vector
val <- doc.tdm.list[[i]]$v[matches.list[[i]][j]]
ifelse(is.na(val), 'F', 'T')
})
return(r)
})
#Alternative: the same loop written with mcmapply over the tdm/match pairs
# test.list <- mcmapply(FUN=function(i, j) {
#   val <- i$v[j]
#   ifelse(is.na(val), 'F', 'T')
# }, doc.tdm.list, matches.list)
#Plain lapply is enough here; the task is small and mclapply's overhead isn't warranted
named.vars <- lapply(vars.list, function(i) setNames(i, training.tdm$dimnames$Terms))
named.vars.df <- data.frame(do.call(rbind, named.vars)) #Create a data.frame we can combine with the original. Note: can't use rbindlist since each element is a character vector, not a list/data.frame/data.table
result <- data.frame(c(data, named.vars.df))
return(result)
}
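#Sketch on a hand-made two-doc dataset (object names are illustrative):
# tiny <- data.table(.file=c("a","b"), .class=factor(c("Run","Pet")), .text=c("the boy ran fast", "i love my dog"))
# tiny.tdm <- get.tdm(tiny$.text, minDocFreq=1)
# create.doc.vars.parallel(tiny, tiny.tdm) #Adds one 'T'/'F' column per term in tiny.tdm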
#------------------------------------------------------
# 6: Train and utilize our Naive Bayes model using klaR directly
#------------------------------------------------------
train.klaR.nBayes <- function(training.data) {
training.data.sub <- subset(training.data, select=-c(.file,.text)) #Remove the unused columns
trained.model <- NaiveBayes(formula=.class ~ .,
data=training.data.sub,
fL=0, #No Laplace correction on the factor (term) probabilities
usekernel=F) #Gaussian density for numeric predictors (irrelevant here, all predictors are factors)
return(trained.model)
}
classify.klaR.nBayes.parallel <- function(test.data, trained.model) {
test.data.sub <- subset(test.data, select=-c(.file,.class,.text)) #Remove unused values
result.list <- mclapply(1:nrow(test.data.sub), function(i) fastpredict.NaiveBayes(trained.model, test.data.sub[i,]))
#Reformat the results to match basic predict.NaiveBayes return which is a named list(class, posterior) with X=nrow(test.data) list items
classes <- sapply(result.list, function(i) i$class)
posteriors <- t(sapply(result.list, function(i) i$posterior))
result.formatted <- list(class=classes, posterior=posteriors)
rownames(result.formatted$posterior) <- 1:nrow(test.data)
colnames(result.formatted$posterior) <- colnames(result.list[[1]]$posterior)
return(result.formatted)
}
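#Sketch: score the held-out docs and eyeball a confusion table (objects as created in run.nBayes):
# model <- train.klaR.nBayes(training.data)
# preds <- classify.klaR.nBayes.parallel(test.data, model)
# table(predicted=preds$class, actual=test.data$.class)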
#------------------------------------------------------
# 7: Use caret to analyze results
#------------------------------------------------------
analyze.results <- function(predicted, actual) {
cm <- confusionMatrix(data=predicted, reference=actual) #Using package 'caret'
values <- cm$byClass[,1] #Sensitivity is the first column of byClass
classes <- sub('Class: ', '', rownames(cm$byClass))
par(mar=c(12,4,2,2)) #Extra large bottom margin
#This would go to Rplots.pdf in batch mode (ex. with Rscript)
#Use getOption("device") to see default device for the given environment (normally windows() or X11())
barplot(height=values,
names.arg=classes,
las=2,
ylab='Sensitivity',
main='Sensitivity by Newsgroup',
col=rainbow(length(classes))
)
#A ggplot example
#Very popular package, more control over how graphics are built up, good 'gallery' and documentation
plot.data <- data.frame(value=values, class=classes)
plot.ggplot <- ggplot(data=plot.data, aes(x=class, y=value, fill=class, label=value)) +
geom_bar(stat='identity') +
geom_text(size=8) +
labs(title='Sensitivity by Newsgroup', y='Sensitivity', x='Newsgroup')
#Send output to png
png(file="myplot.png") #Opens a png 'device'
print(plot.ggplot) #Need explicit print() in batch mode for ggplot graph objects, interactive mode uses implicit print()
dev.off()
}
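#confusionMatrix also reports overall metrics, e.g. (with 'predicted'/'actual' as returned by run.nBayes):
# confusionMatrix(data=predicted, reference=actual)$overall[c("Accuracy", "Kappa")]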
#------------------------------------------------------
# 8: Extra Credit - Code profiling
# line.profiling only available in R 3.0.0+
#------------------------------------------------------
run.Rprof <- function(dir.path) {
# Run RProf
Rprof(filename="rprof.out", memory.profiling=F, line.profiling=T) #Be sure to 'source' code for proper line counting
data <- create.dataset(dir.path)
Rprof(NULL)
summaryRprof(filename="rprof.out", lines="show")
plot(parse_rprof("rprof.out"))
}
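#Usage sketch: source this file first so Rprof can attach line numbers, then
# run.Rprof("20_newsgroups")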
#------------------------------------------------------
# 9: Speedup klaR:::predict.NaiveBayes
#------------------------------------------------------
fastpredict.NaiveBayes <- function (object, newdata, threshold = 0.001, ...)
{
if (missing(newdata))
newdata <- object$x
if ((!any(is.null(colnames(newdata)), is.null(object$varnames))) &&
all(is.element(object$varnames, colnames(newdata))))
newdata <- data.frame(newdata[, object$varnames])
nattribs <- ncol(newdata)
isnumeric <- sapply(newdata, is.numeric)
newdata <- as.matrix(newdata) #The original used data.matrix here, which converts factors to their integer codes and was slower; as.matrix keeps the factor level labels
Lfoo <- function(i) {
tempfoo <- function(v) {
nd <- ndata[v]
if (is.na(nd))
return(rep(1, length(object$apriori)))
prob <- if (isnumeric[v]) {
msd <- object$tables[[v]]
if (object$usekernel)
sapply(msd, FUN = function(y) dkernel(x = nd,
kernel = y, ...))
else dnorm(nd, msd[, 1], msd[, 2])
}
else {
object$tables[[v]][, nd]
}
prob[prob == 0] <- threshold
return(prob)
}
ndata <- newdata[i, ]
tempres <- log(sapply(1:nattribs, tempfoo))
L <- log(object$apriori) + rowSums(tempres)
if (isTRUE(all.equal(sum(exp(L)), 0)))
warning("Numerical 0 probability for all classes with observation ",
i)
L
}
L <- sapply(1:nrow(newdata), Lfoo)
classdach <- factor(object$levels[apply(L, 2, which.max)],
levels = object$levels)
posterior <- t(apply(exp(L), 2, function(x) x/sum(x)))
colnames(posterior) <- object$levels
rownames(posterior) <- names(classdach) <- rownames(newdata)
return(list(class = classdach, posterior = posterior))
}
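#Sketch comparing against the stock klaR predictor on a single row (objects as in classify.klaR.nBayes.parallel):
# system.time(predict(trained.model, test.data.sub[1,])) #Dispatches to klaR:::predict.NaiveBayes
# system.time(fastpredict.NaiveBayes(trained.model, test.data.sub[1,]))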
#------------------------------------------------------
# 10: A simple data set
#------------------------------------------------------
create.sample.data <- function() {
sample.data <- rbind(list(.file=1, .class='Run', .text='The boy ran fast.', boy=T, girl=F, fast=T, slow=F, ran=T, dog=F, want=F, love=F),
list(.file=2, .class='Run', .text='The girl ran slow.', boy=F, girl=T, fast=F, slow=T, ran=T, dog=F, want=F, love=F),
list(.file=3, .class='Run', .text='He is fast.', boy=F, girl=F, fast=T, slow=F, ran=F, dog=F, want=F, love=F),
list(.file=4, .class='Pet', .text='I love my dog.', boy=F, girl=F, fast=F, slow=F, ran=F, dog=T, want=F, love=T),
list(.file=5, .class='Pet', .text='I want a dog.', boy=F, girl=F, fast=F, slow=F, ran=F, dog=T, want=T, love=F),
list(.file=6, .class='Pet', .text='My dog is slow.', boy=F, girl=F, fast=F, slow=T, ran=F, dog=T, want=F, love=F)
)
sample.data
}
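#rbind() of lists yields a list-matrix; a sketch for flattening it into a data.table if needed:
# m <- create.sample.data()
# sample.dt <- rbindlist(lapply(1:nrow(m), function(i) as.list(m[i,])))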
#------------------------------------------------------
# 11: A handy function for showing environment object sizes
#------------------------------------------------------
my.ls <- function(pos=1, sorted=F){
.result <- sapply(ls(pos=pos, all.names=TRUE),
function(..x)object.size(eval(as.symbol(..x))) / 1048576.0)
if (sorted){
.result <- rev(sort(.result))
}
.ls <-
as.data.frame(rbind(as.matrix(.result),"**Total"=sum(.result)))
names(.ls) <- "Size (MB)"
.ls[,"Size (MB)"] <- formatC(.ls[,"Size (MB)"], big.mark=',', digits=0, format='f') #Format in place (avoids adding a second 'Size' column via partial matching)
.ls$Mode <- c(unlist(lapply(rownames(.ls)[-nrow(.ls)],
function(x)mode(eval(as.symbol(x))))), '-------')
.ls[.ls$Mode != 'function',] #Exclude functions
}
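#Example: show the biggest objects in the global environment, largest first
# my.ls(sorted=T)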