Skip to content

Instantly share code, notes, and snippets.

@abarmat
Last active May 30, 2017 18:27
Show Gist options
  • Save abarmat/f9fef56bec36bca5dcc95f604f787475 to your computer and use it in GitHub Desktop.
Save abarmat/f9fef56bec36bca5dcc95f604f787475 to your computer and use it in GitHub Desktop.
DMUBA TP2 AA - Naive Bayes
library('e1071');
library('SparseM');
library('tm');
# Train and evaluate a Naive Bayes text classifier.
#
# Reads a semicolon-separated CSV, keeps the free-text column `des` and the
# label column `Clase`, splits the rows 80/20 into train/test, turns the text
# into document-term count matrices and reports the accuracy on the test rows.

FILENAME <- "tp2-work.csv"

# Randomizer: fix the seed so the train/test split is reproducible.
set.seed(100)

# Read file and keep only the text and label columns.
data <- read.csv(FILENAME, header = TRUE, sep = ";", stringsAsFactors = FALSE)
attr_list <- c("des", "Clase")
stopifnot(all(attr_list %in% names(data)))
df_all <- data[attr_list]

# Split train/test: a random 80% of the rows train the model, the rest test it.
n_train <- floor(0.8 * nrow(df_all))
# Both partitions must be non-empty; also guards against `-integer(0)`
# indexing, which would select zero rows instead of all of them.
stopifnot(n_train >= 1, n_train < nrow(df_all))
sample_ix <- sample(seq_len(nrow(df_all)), size = n_train)
df_train <- df_all[sample_ix, ]
df_test <- df_all[-sample_ix, ]

# Build a cleaned tm corpus from a character vector (one document per element).
# The same pipeline is applied to the train and the test texts.
build_corpus <- function(text) {
  corpus <- VCorpus(VectorSource(as.vector(text)))
  # Base functions such as `tolower` must be wrapped in content_transformer()
  # so that tm_map() keeps returning proper text documents.
  corpus <- tm_map(corpus, content_transformer(tolower))
  # Lower-casing happens first because the stop-word list is lower case.
  corpus <- tm_map(corpus, removeWords, stopwords("spanish"))
  corpus <- tm_map(corpus, removePunctuation)
  # Run last so the gaps left by removed words/punctuation are collapsed.
  tm_map(corpus, stripWhitespace)
}

corpus_train <- build_corpus(df_train$des)
corpus_test <- build_corpus(df_test$des)

# Documents in rows, terms in columns.
matrix_train <- DocumentTermMatrix(corpus_train)
# Restrict the test matrix to the training vocabulary so both matrices share
# one feature space: terms unseen in training are dropped, and training terms
# absent from a test document are counted as 0 instead of being missing.
matrix_test <- DocumentTermMatrix(
  corpus_test,
  control = list(dictionary = Terms(matrix_train))
)

model <- naiveBayes(as.matrix(matrix_train), as.factor(df_train$Clase))
results <- predict(model, as.matrix(matrix_test))

# Accuracy: fraction of test rows whose predicted class equals the true class.
accuracy <- mean(as.character(results) == as.character(df_test$Clase))
print(accuracy)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment