Skip to content

Instantly share code, notes, and snippets.

@not-for-me
Last active August 29, 2015 14:02
Show Gist options
  • Save not-for-me/133a67b178c8840f5873 to your computer and use it in GitHub Desktop.
Document Classification with selected Term
# Load required packages ------------------------------------------------------
for (pkg in c("tm", "party", "rpart", "nnet", "randomForest")) {
  library(pkg, character.only = TRUE)
}

# Record the session details (R version, attached package versions)
# for reproducibility
sessionInfo()
# Paths to the directories holding the training and test documents
trainFilePath <- "~/Documents/mining/train_txt"
testFilePath <- "~/Documents/mining/test_txt"

# Import the plain-text files into tm corpora (one document per file)
trainTextCorpus <- Corpus(DirSource(trainFilePath), readerControl = list(reader = readPlain, language = "en"))
trainTextCorpus
testTextCorpus <- Corpus(DirSource(testFilePath), readerControl = list(reader = readPlain, language = "en"))
testTextCorpus

# Convert text encoding to UTF-8.
# FIX: the transformation must be wrapped in content_transformer();
# passing a bare function to tm_map() (as the original did) returns plain
# character vectors instead of a corpus in tm >= 0.6, which breaks every
# subsequent tm operation (tm_map, DocumentTermMatrix, ...).
toUTF8 <- content_transformer(function(x) iconv(x, to = 'UTF-8-MAC', sub = 'byte'))
trainTextCorpus <- tm_map(trainTextCorpus, toUTF8)
testTextCorpus <- tm_map(testTextCorpus, toUTF8)
# Text preprocessing: lower-case, drop stopwords/numbers/punctuation,
# collapse whitespace, then stem.
# Factored into one helper so the train and test corpora are guaranteed
# to receive the identical pipeline (the original repeated each step
# twice by hand).
# FIX: tolower is a base function, not a tm transformation, so it must go
# through content_transformer() to keep the corpus structure intact
# (tm >= 0.6); the original tm_map(corpus, tolower) degrades the corpus
# to character vectors.
preprocessCorpus <- function(corpus) {
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  tm_map(corpus, stemDocument)  # requires the SnowballC package
}
trainTextCorpus <- preprocessCorpus(trainTextCorpus)
testTextCorpus <- preprocessCorpus(testTextCorpus)
# Build document-term matrices, keeping only terms 3-12 characters long
train_dtm <- DocumentTermMatrix(trainTextCorpus, control = list(wordLengths = c(3, 12)))
dim(train_dtm)
test_dtm <- DocumentTermMatrix(testTextCorpus, control = list(wordLengths = c(3, 12)))
dim(test_dtm)

# Drop sparse terms: keep only terms present in at least 20% of documents
train_stm <- removeSparseTerms(train_dtm, 0.8)
dim(train_stm)
test_stm <- removeSparseTerms(test_dtm, 0.8)
dim(test_stm)
# Convert the sparse term matrices to data frames.
# FIX: use as.matrix() rather than inspect() -- inspect() is a printing
# helper (in current tm it prints and returns invisibly), so
# as.data.frame(inspect(...)) both spams the console and is unreliable.
train_df <- as.data.frame(as.matrix(train_stm))
test_df <- as.data.frame(as.matrix(test_stm))

# Attach the known document category labels.
# NOTE(review): the rep() counts assume DirSource lists the files so that
# documents of each class are contiguous and in this order -- confirm
# against the corpus directory layout.
ncol(train_df)
train_category <- c(rep("BigData", 101), rep("Cloud", 97), rep("DB", 102),
                    rep("Multi", 100), rep("SDN", 99), rep("WSN", 99))
train_df <- cbind(train_df, train_category)
ncol(train_df)
ncol(test_df)
test_category <- c(rep("BigData", 10), rep("DB", 9), rep("Multi", 10), rep("WSN", 9))
test_df <- cbind(test_df, test_category)
ncol(test_df)
#----------------------------------------------------------------
# Algorithm: conditional inference tree (party::ctree)
# Classify documents from the frequencies of four stemmed terms
myFormula <- train_category ~ cloud + databas + network + sensor
party_tree <- ctree(myFormula, data = train_df)
party_tree
plot(party_tree)
plot(party_tree, type = "simple")

train_result_table <- table(predict(party_tree), train_df$train_category)
train_result_table
# Train data accuracy: correct predictions lie on the diagonal
sum(diag(train_result_table)) / sum(train_result_table)

# FIX: reuse the stored prediction -- the original called
# predict(party_tree, newdata = test_df) a second time inside table()
# and discarded test_result.
test_result <- predict(party_tree, newdata = test_df)
test_result_table <- table(test_result, test_df$test_category)
test_result_table
# Test data accuracy.
# FIX: the test table is not square (6 predicted levels x 4 true classes),
# so match predicted-class rows to true-class columns by NAME instead of
# the original hard-coded indices ([1,1],[3,2],[4,3],[6,4]), which silently
# break if the level order ever changes.
sum(test_result_table[outer(rownames(test_result_table),
                            colnames(test_result_table), "==")]) /
  sum(test_result_table)
#----------------------------------------------------------------
# Algorithm: CART decision tree (rpart)
dt <- rpart(myFormula, data = train_df, control = rpart.control(minsplit = 10))
attributes(dt)
print(dt)
plot(dt)
text(dt, use.n = TRUE)

# CONSISTENCY FIX: reference the label column through the data frame
# (train_df$train_category / test_df$test_category) as the party section
# does, instead of relying on the free-standing global vectors.
train_result_table <- table(predict(dt, train_df, type = "class"),
                            train_df$train_category)
train_result_table
# Train data accuracy
sum(diag(train_result_table)) / sum(train_result_table)

test_result_table <- table(predict(dt, test_df, type = "class"),
                           test_df$test_category)
test_result_table
# Test data accuracy: name-based row/column matching instead of the
# fragile hard-coded indices ([1,1],[3,2],[4,3],[6,4])
sum(test_result_table[outer(rownames(test_result_table),
                            colnames(test_result_table), "==")]) /
  sum(test_result_table)
#----------------------------------------------------------------
# Algorithm: random forest
rf <- randomForest(myFormula, data = train_df, ntree = 100, proximity = TRUE)
print(rf)
# predict(rf) with no newdata returns the out-of-bag predictions
train_result_table <- table(predict(rf), train_df$train_category)
train_result_table
# Train (OOB) data accuracy
sum(diag(train_result_table)) / sum(train_result_table)
plot(rf)
importance(rf)
varImpPlot(rf)

test_result_table <- table(predict(rf, test_df), test_df$test_category)
test_result_table
# Test data accuracy: name-based row/column matching instead of the
# fragile hard-coded indices ([1,1],[3,2],[4,3],[6,4])
sum(test_result_table[outer(rownames(test_result_table),
                            colnames(test_result_table), "==")]) /
  sum(test_result_table)
# BUG FIX: the original passed test_df$category, a column that does not
# exist (the label column is test_category), so margin() received NULL.
plot(margin(rf, test_df$test_category))
#----------------------------------------------------------------
# Algorithm: single-hidden-layer neural network (nnet)
nnet.classifier <- nnet(myFormula, data = train_df, size = 12, rang = 0.1,
                        decay = 5e-5, maxit = 170)
train_result_table <- table(predict(nnet.classifier, train_df, type = "class"),
                            train_df$train_category)
train_result_table
# Train data accuracy
sum(diag(train_result_table)) / sum(train_result_table)
test_result_table <- table(predict(nnet.classifier, test_df, type = "class"),
                           test_df$test_category)
test_result_table
# Test data accuracy.
# FIX: table() of character predictions only has rows for classes that
# were actually predicted, so the original hard-coded row indices
# ([1,1],[3,2],[4,3],[6,4]) assume all 6 training classes appear as rows
# -- not guaranteed here. Match rows to columns by name instead.
sum(test_result_table[outer(rownames(test_result_table),
                            colnames(test_result_table), "==")]) /
  sum(test_result_table)
# Miscellaneous notes
# To use stemDocument (stemming), install.packages("SnowballC") is required
# Sys.setenv(LANG="EN") shows error messages in English
# http://web.letras.up.pt/bhsmaia/EDV/apresentacoes/Bradzil_Classif_withTM.pdf
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment