test
Last active
May 7, 2017 04:59
-
-
Save wush978/b868b07fdeba8bf38bf8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Install the packages this script uses, skipping any that are already
# installed so that re-running the script does not reinstall everything.
# Extra arguments (e.g. repos, type) are forwarded to install.packages().
install_if_missing <- function(pkg, ...) {
  # system.file() returns "" when the package cannot be found in the library.
  if (!nzchar(system.file(package = pkg))) {
    install.packages(pkg, ...)
  }
}
install_if_missing("rJava")
# Rwordseg and tmcn are fetched from R-Forge instead of the default repository.
install_if_missing("Rwordseg", repos = "http://R-Forge.R-project.org")
install_if_missing("tm")
install_if_missing("tmcn", repos = "http://R-Forge.R-project.org",
                   type = "source")
install_if_missing("wordcloud")
install_if_missing("XML")
install_if_missing("RCurl")
library(tm)
# NOTE(review): hard-coded, machine-specific working directory. The relative
# corpus directory "b" used below is resolved against it.
setwd("C:/Users/yylee/Desktop/")
# Drop a corpus left over from a previous run. Guarded because rm() on a
# missing object emits a warning when the script runs in a fresh session.
if (exists("d.corpus", inherits = FALSE)) {
  rm(d.corpus)
}
# Read the files found in directory "b" into a corpus; language is set to NA.
d.corpus <- Corpus(DirSource("b"), list(language = NA))
# Clean the corpus by applying, in order:
#   1. removePunctuation - strip punctuation
#   2. removeNumbers     - strip numbers
#   3. a gsub() pass     - strip upper/lower-case ASCII letters and digits
d.corpus <- Reduce(
  function(corpus, cleaner) tm_map(corpus, cleaner),
  list(
    removePunctuation,
    removeNumbers,
    function(text) gsub("[A-Za-z0-9]", "", text)
  ),
  d.corpus
)
# insertWords() and segmentCN() are provided by Rwordseg, so the package has
# to be attached before they are called.
library(Rwordseg)
# Add custom terms to the segmenter dictionary to help word segmentation.
# Each term is tagged as a noun ("n") with frequency 1000.
strwords <- c("部落格", "獨立評論", "讀者投書")
insertWords(strwords, strtype = rep("n", length(strwords)),
            numfreq = rep(1000, length(strwords)))
# Segment the text using the Java segmenter ansj, reached through rJava.
# nature = TRUE tags each token with its part of speech (verb, noun,
# adjective, preposition, ...); only nouns are kept for the analysis.
# At most the first 1002 documents are processed; the bound is clamped to the
# corpus size so a smaller corpus does not index past its end.
n_docs <- min(length(d.corpus), 1002L)
d.corpus <- tm_map(d.corpus[seq_len(n_docs)], segmentCN, nature = TRUE)
d.corpus <- tm_map(d.corpus, function(sentence) {
  # Keep only the tokens whose part-of-speech name is "n" (noun).
  noun <- lapply(sentence, function(w) {
    w[names(w) == "n"]
  })
  unlist(noun)
})
# Rebuild a corpus from the noun-only results.
d.corpus <- Corpus(VectorSource(d.corpus))
# Build the term-document matrix, keeping only terms of length >= 2.
d.corpus <- tm_map(d.corpus, PlainTextDocument)
tdm <- TermDocumentMatrix(d.corpus, control = list(wordLengths = c(2, Inf)))
# Preview up to the first 10 terms across up to the first 1002 documents.
# Both bounds are clamped to the matrix dimensions so that a smaller
# vocabulary or corpus does not raise a "subscript out of bounds" error.
preview_terms <- seq_len(min(10L, nrow(tdm)))
preview_docs <- seq_len(min(1002L, ncol(tdm)))
inspect(tdm[preview_terms, preview_docs])
# Word cloud (author's note: requires R 3.0.2 or later).
library(RColorBrewer)
library(wordcloud)
# Total frequency of each term over all documents, most frequent first.
m1 <- as.matrix(tdm)
v <- sort(rowSums(m1), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
# Draw terms that occur at least 3 times, most frequent terms placed first.
# FALSE is spelled out because the shorthand F is an ordinary, reassignable
# variable.
wordcloud(d$word, d$freq, min.freq = 3, random.order = FALSE,
          ordered.colors = FALSE, colors = rainbow(nrow(m1)))
# Part two
# Extend tm's default stop-word list with corpus-specific terms.
# unique() guards against repeated entries in the combined list.
myStopWords <- unique(c(
  stopwords(),
  "訂閱", "電子", "會員", "雜誌", "報", "天下", "通知", "人", "月",
  "訂戶", "信", "名單", "人員"
))
d.corpus <- tm_map(d.corpus, removeWords, myStopWords)
# Check the stop-word list.
head(myStopWords, 20)
# Rebuild the term-document matrix from the current corpus, keeping only
# terms of length >= 2.
d.corpus <- tm_map(d.corpus, PlainTextDocument)
tdm <- TermDocumentMatrix(d.corpus, control = list(wordLengths = c(2, Inf)))
# Preview up to the first 10 terms across up to the first 1002 documents.
# Both bounds are clamped to the matrix dimensions so that a smaller
# vocabulary or corpus does not raise a "subscript out of bounds" error.
preview_terms <- seq_len(min(10L, nrow(tdm)))
preview_docs <- seq_len(min(1002L, ncol(tdm)))
inspect(tdm[preview_terms, preview_docs])
# Word cloud (author's note: requires R 3.0.2 or later).
library(RColorBrewer)
library(wordcloud)
# Total frequency of each term over all documents, most frequent first.
m1 <- as.matrix(tdm)
v <- sort(rowSums(m1), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
# Draw terms that occur at least 10 times, most frequent terms placed first.
# FALSE is spelled out because the shorthand F is an ordinary, reassignable
# variable.
wordcloud(d$word, d$freq, min.freq = 10, random.order = FALSE,
          ordered.colors = FALSE, colors = rainbow(nrow(m1)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment