# Document Classification with Selected Terms
# Load libraries
library(tm)
library(party)
library(rpart)
library(nnet)
library(randomForest)
# Current R session info
sessionInfo()
# Set file paths
trainFilePath <- "~/Documents/mining/train_txt"
testFilePath <- "~/Documents/mining/test_txt"
# Import the plain-text files into corpora
trainTextCorpus <- Corpus(DirSource(trainFilePath), readerControl = list(reader = readPlain, language = "en"))
trainTextCorpus
testTextCorpus <- Corpus(DirSource(testFilePath), readerControl = list(reader = readPlain, language = "en"))
testTextCorpus
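# Optional check (not in the original gist): confirm the expected number of
# documents was loaded into each corpus.
length(trainTextCorpus)
length(testTextCorpus)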
# Convert text encoding to UTF-8 ('UTF-8-MAC' targets OS X; on other platforms
# plain 'UTF-8' would be the target). content_transformer() keeps the result a
# valid tm corpus (tm >= 0.6 requires it when mapping plain functions).
trainTextCorpus <- tm_map(trainTextCorpus, content_transformer(function(x) iconv(x, to = "UTF-8-MAC", sub = "byte")))
testTextCorpus <- tm_map(testTextCorpus, content_transformer(function(x) iconv(x, to = "UTF-8-MAC", sub = "byte")))
# Text preprocessing: lowercase, drop stopwords, numbers, and punctuation,
# collapse whitespace, then stem (stemDocument needs the SnowballC package)
trainTextCorpus <- tm_map(trainTextCorpus, content_transformer(tolower))
testTextCorpus <- tm_map(testTextCorpus, content_transformer(tolower))
trainTextCorpus <- tm_map(trainTextCorpus, removeWords, stopwords("english"))
testTextCorpus <- tm_map(testTextCorpus, removeWords, stopwords("english"))
trainTextCorpus <- tm_map(trainTextCorpus, removeNumbers)
testTextCorpus <- tm_map(testTextCorpus, removeNumbers)
trainTextCorpus <- tm_map(trainTextCorpus, removePunctuation)
testTextCorpus <- tm_map(testTextCorpus, removePunctuation)
trainTextCorpus <- tm_map(trainTextCorpus, stripWhitespace)
testTextCorpus <- tm_map(testTextCorpus, stripWhitespace)
trainTextCorpus <- tm_map(trainTextCorpus, stemDocument)
testTextCorpus <- tm_map(testTextCorpus, stemDocument)
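# Optional sanity check (not in the original gist): print one preprocessed
# document to confirm stopword removal and stemming behaved as expected.
writeLines(as.character(trainTextCorpus[[1]]))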
# Build document-term matrices keeping terms 3-12 characters long
train_dtm <- DocumentTermMatrix(trainTextCorpus, control = list(wordLengths = c(3, 12)))
dim(train_dtm)
test_dtm <- DocumentTermMatrix(testTextCorpus, control = list(wordLengths = c(3, 12)))
dim(test_dtm)
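# Optional (not in the original gist): list terms appearing in at least 50
# documents to eyeball candidate features before pruning sparse terms.
findFreqTerms(train_dtm, lowfreq = 50)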
# Remove sparse terms: 0.8 keeps only terms present in at least ~20% of documents
train_stm <- removeSparseTerms(train_dtm, 0.8)
dim(train_stm)
test_stm <- removeSparseTerms(test_dtm, 0.8)
dim(test_stm)
# Convert the pruned DTMs to data frames (via as.matrix, not inspect(), which
# is meant for printing)
train_df <- as.data.frame(as.matrix(train_stm))
test_df <- as.data.frame(as.matrix(test_stm))
# Add document category labels (counts must match the documents per class)
ncol(train_df)
train_category <- c(rep("BigData",101), rep("Cloud",97), rep("DB",102), rep("Multi", 100), rep("SDN", 99), rep("WSN",99))
train_df <- cbind(train_df, train_category)
ncol(train_df)
ncol(test_df)
test_category <- c(rep("BigData",10), rep("DB", 9), rep("Multi", 10), rep("WSN", 9))
test_df <- cbind(test_df, test_category)
ncol(test_df)
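# Optional check (not in the original gist): the label vectors must line up
# with the document counts in each data frame.
nrow(train_df) == length(train_category)
nrow(test_df) == length(test_category)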
#----------------------------------------------------------------
# Algorithm: Decision tree with the party package
# Classify using four selected (stemmed) terms as predictors
myFormula <- train_category ~ cloud + databas + network + sensor
party_tree <- ctree(myFormula, data = train_df)
party_tree
plot(party_tree)
plot(party_tree, type = "simple")
train_result_table <- table(predict(party_tree), train_df$train_category)
train_result_table
# Training accuracy
sum(diag(train_result_table)) / sum(train_result_table)
test_result <- predict(party_tree, newdata = test_df)
test_result_table <- table(test_result, test_df$test_category)
test_result_table
# Test accuracy: the test set has only 4 of the 6 training classes, so the
# matching cells (rows 1, 3, 4, 6 = BigData, DB, Multi, WSN) are summed by
# hand; a reusable helper is sketched below
sum(test_result_table[1,1], test_result_table[3,2], test_result_table[4,3], test_result_table[6,4]) / sum(test_result_table)
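# Optional helper (not in the original gist): align predicted and actual
# factor levels so diag() works even when the test set lacks some training
# classes; avoids hard-coding cell positions.
class_accuracy <- function(pred, actual) {
  lv <- union(levels(factor(pred)), levels(factor(actual)))
  tab <- table(factor(pred, levels = lv), factor(actual, levels = lv))
  sum(diag(tab)) / sum(tab)
}
class_accuracy(test_result, test_df$test_category)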
#----------------------------------------------------------------
# Algorithm: Decision tree with rpart
dt <- rpart(myFormula, data = train_df, control = rpart.control(minsplit = 10))
attributes(dt)
print(dt)
plot(dt)
text(dt, use.n = TRUE)
train_result_table <- table(predict(dt, train_df, type = "class"), train_category)
train_result_table
# Training accuracy
sum(diag(train_result_table)) / sum(train_result_table)
test_result_table <- table(predict(dt, test_df, type = "class"), test_category)
test_result_table
# Test accuracy (same hand-picked cells as above)
sum(test_result_table[1,1], test_result_table[3,2], test_result_table[4,3], test_result_table[6,4]) / sum(test_result_table)
#----------------------------------------------------------------
# Algorithm: Random Forest
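# randomForest is stochastic; fixing a seed (not in the original gist, any
# value works) makes the forest reproducible across runs.
set.seed(100)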
rf <- randomForest(myFormula, data = train_df, ntree = 100, proximity = TRUE)
print(rf)
# predict(rf) with no newdata returns the out-of-bag predictions
train_result_table <- table(predict(rf), train_category)
train_result_table
# Training (out-of-bag) accuracy
sum(diag(train_result_table)) / sum(train_result_table)
plot(rf)
importance(rf)
varImpPlot(rf)
test_result_table <- table(predict(rf, test_df), test_category)
test_result_table
# Test accuracy (same hand-picked cells as above)
sum(test_result_table[1,1], test_result_table[3,2], test_result_table[4,3], test_result_table[6,4]) / sum(test_result_table)
# Margin plot: margin() is computed from the forest's out-of-bag votes, so the
# observed labels are the training labels (test_df$category did not exist)
plot(margin(rf, train_df$train_category))
#----------------------------------------------------------------
# Algorithm: Neural network (nnet)
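# nnet starts from random initial weights; a seed (not in the original gist)
# makes the fit reproducible.
set.seed(100)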
nnet.classifier <- nnet(myFormula, data = train_df, size = 12, rang = 0.1, decay = 5e-5, maxit = 170)
train_result_table <- table(predict(nnet.classifier, train_df, type = "class"), train_category)
train_result_table
# Training accuracy
sum(diag(train_result_table)) / sum(train_result_table)
test_result_table <- table(predict(nnet.classifier, test_df, type = "class"), test_category)
test_result_table
# Test accuracy (same hand-picked cells as above)
sum(test_result_table[1,1], test_result_table[3,2], test_result_table[4,3], test_result_table[6,4]) / sum(test_result_table)
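# The helper defined earlier (not in the original gist) gives the same number
# without hard-coded cell positions:
class_accuracy(predict(nnet.classifier, test_df, type = "class"), test_category)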
# Miscellaneous notes
# stemDocument requires install.packages("SnowballC")
# Sys.setenv(LANG = "EN") shows error messages in English
# Reference: http://web.letras.up.pt/bhsmaia/EDV/apresentacoes/Bradzil_Classif_withTM.pdf