Last active
December 25, 2015 16:39
-
-
Save xccds/7007686 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 原始文件读入 | |
txt <- readLines('txtdm.txt') | |
ignore = ",|:|!|'" | |
stopwords = c('and','edition','for','in','little','of','the','to') | |
txt <- tolower(txt) | |
# 文档分词 | |
doc <- strsplit(txt,' ') | |
# 去除常用词和标点 | |
doc <- lapply(doc,function(x)gsub(ignore,'',x)) | |
doc <- lapply(doc,function(x){ | |
x[!(x %in% stopwords)] | |
}) | |
# 取词项集合 | |
words <- unique(unlist(doc)) | |
# 计算词项文档矩阵 | |
DTM <- function(x,y){ | |
n <- length(x) | |
m <- length(y) | |
t <- matrix(nrow=n,ncol=m) | |
for (i in 1:n){ | |
for (j in 1:m){ | |
t[i,j] <- sum(doc[[j]]==words[i]) | |
} | |
} | |
return(t) | |
} | |
t <- DTM(words,doc) | |
# 只取同时出现在两个以上文档中的词项 | |
DocsPerWord <- rowSums(t>0) | |
words <- words[DocsPerWord>1] | |
t <- DTM(words,doc) | |
# 将频数转为tfidf值 | |
TFIDF <- function(t){ | |
WordsPerDoc <- colSums(t) | |
DocsPerWord <- rowSums(t>0) | |
for (i in 1:nrow(t)){ | |
for (j in 1:ncol(t)){ | |
t[i,j] <- (t[i,j]/WordsPerDoc[j])*log(ncol(t)/DocsPerWord[i]) | |
} | |
} | |
return(t) | |
} | |
tfidf <- TFIDF(t) | |
# SVD分解 | |
res <- svd(tfidf) | |
# 词项语义相关矩阵 | |
datau <- data.frame(res$u[,2:3]) | |
# 文档语义相关矩阵 | |
datav <- data.frame(res$v[,2:3]) | |
library(ggplot2) | |
p <- ggplot()+ | |
geom_point(data=datau,aes(X1,X2))+ | |
geom_point(data=datav,aes(X1,X2), | |
size=3,color='red4')+ | |
geom_text(data=datau,aes(X1,X2), | |
label=words,vjust=2)+ | |
geom_text(data=datav,aes(X1,X2), | |
label=1:9,vjust=2)+ | |
theme_bw() | |
print(p) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment