Skip to content

Instantly share code, notes, and snippets.

@xccds
Last active December 25, 2015 16:39
Show Gist options
  • Save xccds/7007686 to your computer and use it in GitHub Desktop.
Save xccds/7007686 to your computer and use it in GitHub Desktop.
# 原始文件读入
txt <- readLines('txtdm.txt')
ignore = ",|:|!|'"
stopwords = c('and','edition','for','in','little','of','the','to')
txt <- tolower(txt)
# 文档分词
doc <- strsplit(txt,' ')
# 去除常用词和标点
doc <- lapply(doc,function(x)gsub(ignore,'',x))
doc <- lapply(doc,function(x){
x[!(x %in% stopwords)]
})
# 取词项集合
words <- unique(unlist(doc))
# 计算词项文档矩阵
DTM <- function(x,y){
n <- length(x)
m <- length(y)
t <- matrix(nrow=n,ncol=m)
for (i in 1:n){
for (j in 1:m){
t[i,j] <- sum(doc[[j]]==words[i])
}
}
return(t)
}
t <- DTM(words,doc)
# 只取同时出现在两个以上文档中的词项
DocsPerWord <- rowSums(t>0)
words <- words[DocsPerWord>1]
t <- DTM(words,doc)
# 将频数转为tfidf值
TFIDF <- function(t){
WordsPerDoc <- colSums(t)
DocsPerWord <- rowSums(t>0)
for (i in 1:nrow(t)){
for (j in 1:ncol(t)){
t[i,j] <- (t[i,j]/WordsPerDoc[j])*log(ncol(t)/DocsPerWord[i])
}
}
return(t)
}
tfidf <- TFIDF(t)
# SVD分解
res <- svd(tfidf)
# 词项语义相关矩阵
datau <- data.frame(res$u[,2:3])
# 文档语义相关矩阵
datav <- data.frame(res$v[,2:3])
library(ggplot2)
p <- ggplot()+
geom_point(data=datau,aes(X1,X2))+
geom_point(data=datav,aes(X1,X2),
size=3,color='red4')+
geom_text(data=datau,aes(X1,X2),
label=words,vjust=2)+
geom_text(data=datav,aes(X1,X2),
label=1:9,vjust=2)+
theme_bw()
print(p)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment