abarmat · May 30, 2017 18:27
diff --git a/tp2_aa_bayes.R b/tp2_aa_bayes.R
 library('e1071');
 library('SparseM');
 library('tm');

 # Naive Bayes text classifier: learns the label in column 'Clase' from the
 # free-text column 'des' and prints the accuracy on a held-out 20% split.

 # Input file, read below as a ";"-separated CSV with a header row
 FILENAME <- 'tp2-work.csv'

 # Fixed seed so the train/test split is reproducible
 set.seed(100)

 # Read file
 data <- read.csv(FILENAME, header = TRUE, sep = ";", stringsAsFactors = FALSE)

 # Keep only the text column and the class label
 attr_list <- c('des', 'Clase')
 df_all <- data[attr_list]

 # Split train/test: 80% of the rows (rounded down) for training, rest for test
 n_train <- floor(0.8 * nrow(df_all))
 sample_ix <- sample(seq_len(nrow(df_all)), size = n_train)

 df_train <- df_all[sample_ix, ]
 df_test <- df_all[-sample_ix, ]

 # Apply the same text normalisation to any corpus so that the train and test
 # pipelines cannot drift apart.
 clean_corpus <- function(corpus) {
   corpus <- tm_map(corpus, stripWhitespace)
   # tolower is a base function, not a tm transformation: wrap it with
   # content_transformer() so the documents stay tm text documents. This
   # replaces the former tm_map(..., PlainTextDocument) repair step.
   corpus <- tm_map(corpus, content_transformer(tolower))
   corpus <- tm_map(corpus, removeWords, stopwords("spanish"))
   tm_map(corpus, removePunctuation)
 }

 vec_train <- as.vector(df_train$des)
 vec_test <- as.vector(df_test$des)

 src_train <- VectorSource(vec_train)
 src_test <- VectorSource(vec_test)
 corpus_train <- clean_corpus(Corpus(src_train))
 corpus_test <- clean_corpus(Corpus(src_test))

 # Documents in rows, terms in columns
 matrix_train <- DocumentTermMatrix(corpus_train)
 # Tabulate the test documents against the training vocabulary only.
 # Otherwise the test matrix gets its own set of columns, and a training term
 # that happens to be missing from the whole test set is ignored by predict()
 # instead of being counted as zero occurrences.
 matrix_test <- DocumentTermMatrix(
   corpus_test,
   control = list(dictionary = Terms(matrix_train))
 )

 model <- naiveBayes(as.matrix(matrix_train), as.factor(df_train$Clase))
 results <- predict(model, as.matrix(matrix_test))

 # Accuracy: fraction of test rows whose predicted class equals the true one
 sum(results == df_test$Clase) / nrow(df_test)
	library('e1071');
	library('SparseM');
	library('tm');

	# Naive Bayes text classifier: learns the label in column 'Clase' from the
	# free-text column 'des' and prints the accuracy on a held-out 20% split.

	# Input file, read below as a ";"-separated CSV with a header row
	FILENAME <- 'tp2-work.csv'

	# Fixed seed so the train/test split is reproducible
	set.seed(100)

	# Read file
	data <- read.csv(FILENAME, header = TRUE, sep = ";", stringsAsFactors = FALSE)

	# Keep only the text column and the class label
	attr_list <- c('des', 'Clase')
	df_all <- data[attr_list]

	# Split train/test: 80% of the rows (rounded down) for training, rest for test
	n_train <- floor(0.8 * nrow(df_all))
	sample_ix <- sample(seq_len(nrow(df_all)), size = n_train)

	df_train <- df_all[sample_ix, ]
	df_test <- df_all[-sample_ix, ]

	# Apply the same text normalisation to any corpus so that the train and test
	# pipelines cannot drift apart.
	clean_corpus <- function(corpus) {
	  corpus <- tm_map(corpus, stripWhitespace)
	  # tolower is a base function, not a tm transformation: wrap it with
	  # content_transformer() so the documents stay tm text documents. This
	  # replaces the former tm_map(..., PlainTextDocument) repair step.
	  corpus <- tm_map(corpus, content_transformer(tolower))
	  corpus <- tm_map(corpus, removeWords, stopwords("spanish"))
	  tm_map(corpus, removePunctuation)
	}

	vec_train <- as.vector(df_train$des)
	vec_test <- as.vector(df_test$des)

	src_train <- VectorSource(vec_train)
	src_test <- VectorSource(vec_test)
	corpus_train <- clean_corpus(Corpus(src_train))
	corpus_test <- clean_corpus(Corpus(src_test))

	# Documents in rows, terms in columns
	matrix_train <- DocumentTermMatrix(corpus_train)
	# Tabulate the test documents against the training vocabulary only.
	# Otherwise the test matrix gets its own set of columns, and a training term
	# that happens to be missing from the whole test set is ignored by predict()
	# instead of being counted as zero occurrences.
	matrix_test <- DocumentTermMatrix(
	  corpus_test,
	  control = list(dictionary = Terms(matrix_train))
	)

	model <- naiveBayes(as.matrix(matrix_train), as.factor(df_train$Clase))
	results <- predict(model, as.matrix(matrix_test))

	# Accuracy: fraction of test rows whose predicted class equals the true one
	sum(results == df_test$Clase) / nrow(df_test)