Skip to content

Instantly share code, notes, and snippets.

@andrefs
Created December 11, 2017 01:16
Show Gist options
  • Save andrefs/7fc2c1665edf4ac0ac8fcb605d87fadb to your computer and use it in GitHub Desktop.
Save andrefs/7fc2c1665edf4ac0ac8fcb605d87fadb to your computer and use it in GitHub Desktop.
Pre-processing the documents
library(tm)
####################################
# load the corpora and pre-process #
####################################
# One VCorpus per category ("economia", "desporto") and per split (train, eval).
# Every directory is read with the plain-text reader and language set to "pt".
economia.train <- VCorpus(
  DirSource("../news/train/economia"),
  readerControl = list(reader = readPlain, language = "pt")
)
desporto.train <- VCorpus(
  DirSource("../news/train/desporto"),
  readerControl = list(reader = readPlain, language = "pt")
)
economia.test <- VCorpus(
  DirSource("../news/eval/economia"),
  readerControl = list(reader = readPlain, language = "pt")
)
desporto.test <- VCorpus(
  DirSource("../news/eval/desporto"),
  readerControl = list(reader = readPlain, language = "pt")
)
# Clean a tm corpus of Portuguese documents.
#
# Steps, applied in this order: lower-case, remove Portuguese stop words,
# transliterate to ASCII, strip punctuation (keeping intra-word dashes),
# remove numbers, stem with the Portuguese stemmer, collapse whitespace.
#
# d: a tm corpus.
# Returns the transformed corpus.
preprocess.simple <- function(d) {
  to_ascii <- function(txt) iconv(txt, to = "ASCII//TRANSLIT")
  stem_pt <- function(txt) stemDocument(txt, language = "pt")

  lowered <- tm_map(d, content_transformer(tolower))
  # Stop words are removed before transliteration -- presumably so that the
  # accented entries of stopwords("pt") still match the text; confirm.
  no_stops <- tm_map(lowered, removeWords, stopwords(kind = "pt"))
  ascii <- tm_map(no_stops, content_transformer(to_ascii))
  no_punct <- tm_map(ascii, removePunctuation, preserve_intra_word_dashes = TRUE)
  no_digits <- tm_map(no_punct, removeNumbers)
  stemmed <- tm_map(no_digits, content_transformer(stem_pt))
  tm_map(stemmed, stripWhitespace)
}
# Training split: clean each category, then concatenate with "economia" first.
economia.train.p <- preprocess.simple(economia.train)
desporto.train.p <- preprocess.simple(desporto.train)
docs.train <- c(economia.train.p, desporto.train.p)

# Evaluation split: same cleaning, same category order.
economia.test.p <- preprocess.simple(economia.test)
desporto.test.p <- preprocess.simple(desporto.test)
docs.test <- c(economia.test.p, desporto.test.p)
####################
# create data sets #
####################
# Training features: a TF-IDF weighted document-term matrix over docs.train,
# built with wordLengths = c(4, 15), global bounds c(10, Inf) and local bounds
# c(2, Inf); removeSparseTerms() is then applied with a 0.95 threshold.
dtm <- DocumentTermMatrix(
  docs.train,
  control = list(
    wordLengths = c(4, 15),
    bounds = list(global = c(10, Inf), local = c(2, Inf)),
    weighting = weightTfIdf
  )
)
train.d <- as.data.frame(as.matrix(removeSparseTerms(dtm, 0.95)))

# Class labels: all "economia" documents first, then all "desporto" ones.
train.c.vector <- rep(
  c("economia", "desporto"),
  times = c(length(economia.train), length(desporto.train))
)
train.dc <- cbind(train.d, class = train.c.vector)

# The lexicon is taken before the information-gain filter below, so it holds
# every term that survived the sparsity cut.
lexicon <- names(train.d)

# Keep the terms whose information gain is positive, plus the class column
# (the last one). drop = FALSE keeps train.dc a data frame even when no term
# passes the filter and only the class column is selected.
# NOTE(review): information.gain() is not provided by tm; presumably the
# FSelector package is attached elsewhere -- confirm.
train.dc <- train.dc[
  , c(which(information.gain(class ~ ., train.dc)$attr_importance > 0),
      ncol(train.dc)),
  drop = FALSE
]

# Evaluation features, restricted to the training lexicon.
# NOTE(review): no weighting is passed here, so this matrix uses tm's default
# weighting while the training matrix uses weightTfIdf -- confirm this
# difference is intended.
test.d <- as.data.frame(as.matrix(
  DocumentTermMatrix(docs.test, control = list(dictionary = lexicon))
))
test.c <- rep(
  c("economia", "desporto"),
  times = c(length(economia.test), length(desporto.test))
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment