# Gist wush978/b868b07fdeba8bf38bf8 by @wush978 (last active May 7, 2017 04:59).
# One-time setup: install the packages this script depends on.
# A package is installed only when it cannot be found in the library, so
# re-running the script does not reinstall everything on every run.
install_if_missing <- function(pkg, ...) {
  # system.file() returns "" when the package is not installed; unlike
  # requireNamespace() it does not load the package (rJava needs a working
  # Java setup just to load).
  if (!nzchar(system.file(package = pkg))) {
    install.packages(pkg, ...)
  }
}
install_if_missing("rJava")
install_if_missing("Rwordseg", repos = "http://R-Forge.R-project.org")
install_if_missing("tm")
install_if_missing(
  "tmcn",
  repos = "http://R-Forge.R-project.org",
  type = "source"
)
install_if_missing("wordcloud")
install_if_missing("XML")
install_if_missing("RCurl")
library(tm)
# Rwordseg provides insertWords() and segmentCN(), both used further down.
library(Rwordseg)
# Folder holding the raw text files. It is referenced by its full path so the
# script does not change the session's working directory.
corpus_dir <- "C:/Users/yylee/Desktop/b"
# Build the corpus from the files in that folder; no language is assigned
# (language = NA). Any previous `d.corpus` is simply overwritten, so no
# explicit rm() is needed (rm() warns when the object does not exist yet).
d.corpus <- Corpus(DirSource(corpus_dir), list(language = NA))
# Clean-up pass: run three transformations over the corpus, in order.
#   1. removePunctuation - strip punctuation
#   2. removeNumbers     - strip numbers
#   3. a gsub() call     - strip ASCII letters (upper and lower case) and digits
d.corpus <- Reduce(
  function(corpus, cleaner) tm_map(corpus, cleaner),
  list(
    removePunctuation,
    removeNumbers,
    function(word) gsub("[A-Za-z0-9]", "", word)
  ),
  d.corpus
)
# Add custom words to the segmenter's dictionary to help word segmentation.
# Every word is registered with type "n" and a frequency of 1000.
strwords <- c("部落格", "獨立評論", "讀者投書")
insertWords(
  strwords,
  strtype = rep("n", times = length(strwords)),
  numfreq = rep(1000, times = length(strwords))
)
# Word segmentation: rJava is used to reach the Java segmenter (ansj).
# With nature = TRUE each segmented word carries a part-of-speech tag (verb,
# noun, adjective, preposition, ...) as its name; only nouns (tag "n") are
# kept for the analysis.
#
# At most `max_docs` documents are processed. The range is clamped to the
# corpus size so that a smaller corpus is not indexed past its end.
max_docs <- 1002L
d.corpus <- tm_map(
  d.corpus[seq_len(min(length(d.corpus), max_docs))],
  segmentCN,
  nature = TRUE
)
d.corpus <- tm_map(d.corpus, function(sentence) {
  # Per the Rwordseg documentation, segmentCN returns a single vector for a
  # single input string and a list of vectors for a longer input. Wrap the
  # single-vector case: lapply() over a bare named vector would hand each word
  # to the filter without its name, and every noun would be dropped.
  if (!is.list(sentence)) {
    sentence <- list(sentence)
  }
  noun <- lapply(sentence, function(w) {
    w[names(w) == "n"]
  })
  unlist(noun)
})
# Rebuild a corpus from the per-document noun vectors.
d.corpus <- Corpus(VectorSource(d.corpus))
# Build the term-document matrix.
d.corpus <- tm_map(d.corpus, PlainTextDocument)
# wordLengths = c(2, Inf): only terms of at least two characters are kept.
tdm <- TermDocumentMatrix(d.corpus, control = list(wordLengths = c(2, Inf)))
# Preview the matrix: up to the first 10 terms across up to the first 1002
# documents. Both ranges are clamped to the matrix size so that a small
# matrix is not indexed out of bounds.
inspect(tdm[seq_len(min(10, nrow(tdm))), seq_len(min(1002, ncol(tdm)))])
# Word cloud (the original author notes this needs R 3.0.2 or later).
library(RColorBrewer)
library(wordcloud)
# Sum each term's counts over all documents, most frequent first.
m1 <- as.matrix(tdm)
v <- sort(rowSums(m1), decreasing = TRUE)
# stringsAsFactors = FALSE keeps `word` a character vector on every R version.
d <- data.frame(word = names(v), freq = v, stringsAsFactors = FALSE)
# Only terms occurring at least 3 times are drawn. TRUE/FALSE are spelled out
# because T/F are ordinary variables that can be reassigned.
wordcloud(
  d$word, d$freq,
  min.freq = 3, random.order = FALSE, ordered.colors = FALSE,
  colors = rainbow(nrow(m1))
)
# Part two
# Add extra stop words: the list returned by stopwords() with its default
# arguments, followed by terms specific to this corpus.
myStopWords <- append(
  stopwords(),
  c(
    "訂閱", "電子", "會員", "雜誌", "報", "天下", "通知",
    "人", "月", "訂戶", "信", "名單", "人員", "人"
  )
)
d.corpus <- tm_map(d.corpus, FUN = removeWords, myStopWords)
# Check the stop words: show the first 20 entries of the combined list.
head(myStopWords, n = 20)
# NOTE(review): this calls head() on the removeWords function itself, not on
# a word list - looks like a leftover; confirm whether it is still wanted.
head(removeWords)
# Rebuild the term-document matrix from the current corpus.
d.corpus <- tm_map(d.corpus, PlainTextDocument)
# wordLengths = c(2, Inf): only terms of at least two characters are kept.
tdm <- TermDocumentMatrix(d.corpus, control = list(wordLengths = c(2, Inf)))
# Preview the matrix: up to the first 10 terms across up to the first 1002
# documents. Both ranges are clamped to the matrix size so that a small
# matrix is not indexed out of bounds.
inspect(tdm[seq_len(min(10, nrow(tdm))), seq_len(min(1002, ncol(tdm)))])
# Word cloud (the original author notes this needs R 3.0.2 or later).
library(RColorBrewer)
library(wordcloud)
# Sum each term's counts over all documents, most frequent first.
m1 <- as.matrix(tdm)
v <- sort(rowSums(m1), decreasing = TRUE)
# stringsAsFactors = FALSE keeps `word` a character vector on every R version.
d <- data.frame(word = names(v), freq = v, stringsAsFactors = FALSE)
# Only terms occurring at least 10 times are drawn. TRUE/FALSE are spelled out
# because T/F are ordinary variables that can be reassigned.
wordcloud(
  d$word, d$freq,
  min.freq = 10, random.order = FALSE, ordered.colors = FALSE,
  colors = rainbow(nrow(m1))
)

# (GitHub page footer: sign-up / sign-in prompt for commenting on the gist.)