# yunho0130 · February 10, 2017 13:48
# diff --git a/Topicmodeling.R b/Topicmodeling.R
 # Install tm only when it is missing, so re-running the script does not
 # reinstall the package every time.
 if (!requireNamespace("tm", quietly = TRUE)) {
   install.packages("tm")
 }
 library(tm)

 #mobile
 # Load the news data; its `x` column supplies the document text.
 news <- read.csv("mobile2014.csv", stringsAsFactors = FALSE)
 news.corpus <- Corpus(VectorSource(news$x))

 # Stem every document, then build a TF-IDF weighted term-document matrix
 # with numbers, punctuation and SMART stop words removed.
 news.corpus <- tm_map(news.corpus, stemDocument, language = "english")
 tdm <- TermDocumentMatrix(news.corpus,
                           control = list(removeNumbers = TRUE,
                                          removePunctuation = TRUE,
                                          stopwords = stopwords("SMART"),
                                          weighting = weightTfIdf))
 dim(tdm)
 #install.packages("slam")
 library(slam)
 # Sum each term's weight over all documents (margin 2 of the matrix) and
 # rank the terms from heaviest to lightest.
 word.count <- as.array(rollup(tdm, 2))
 word.order <- order(word.count, decreasing = TRUE)
 # head() rather than word.order[1:n]: when the vocabulary holds fewer than
 # n terms, 1:n would pad the index vector with NA.
 freq.word <- head(word.order, 30)
 row.names(tdm[freq.word, ])
 # Row indices of the (up to) 1000 heaviest terms.
 freq.word <- head(word.order, 1000)

 # latent semantic analysis
 # Install lsa only when it is missing.
 if (!requireNamespace("lsa", quietly = TRUE)) {
   install.packages("lsa")
 }
 library(lsa)
 n.dims <- 30 # number of LSA dimensions to keep
 # The decomposition is run on the rows selected by freq.word only. The
 # script used to call lsa(tdm, 30) on the full matrix first, but that
 # result was overwritten immediately, so the expensive call was wasted
 # (and could exhaust memory). Use lsa(tdm, n.dims) by hand if the full
 # matrix is wanted.
 news.lsa <- lsa(tdm[freq.word, ], n.dims)
 news.lsa$tk[, 1] # first dimension
 # Show the ten terms with the largest absolute loading on each dimension.
 for (i in seq_len(ncol(news.lsa$tk))) {
   print(i)
   importance <- order(abs(news.lsa$tk[, i]), decreasing = TRUE)
   print(news.lsa$tk[head(importance, 10), i])
 }

 # Install GPArotation only when it is missing.
 if (!requireNamespace("GPArotation", quietly = TRUE)) {
   install.packages("GPArotation")
 }
 library(GPArotation) # varimax rotation
 # Rotate the LSA term matrix and keep the rotated loadings.
 tk <- Varimax(news.lsa$tk)$loadings
 # Show the ten terms with the largest absolute loading on each rotated
 # dimension; the loop follows the actual number of columns in tk.
 for (i in seq_len(ncol(tk))) {
   print(i)
   importance <- order(abs(tk[, i]), decreasing = TRUE)
   print(tk[head(importance, 10), i])
 }


 #mobile
 # Rebuild the stemmed news corpus for the LDA run.
 news <- read.csv("mobile2014.csv", stringsAsFactors = FALSE)
 news.corpus <- Corpus(VectorSource(news$x))
 news.corpus <- tm_map(news.corpus, stemDocument, language = "english")
 # Preprocessing options shared by both matrices built below.
 news.control <- list(removeNumbers = TRUE,
                      removePunctuation = TRUE,
                      wordLengths = c(3, Inf),
                      stopwords = stopwords("SMART"))
 # TF-IDF weighted term-document matrix, kept for inspection.
 tdm <- TermDocumentMatrix(news.corpus,
                           control = c(news.control,
                                       list(weighting = weightTfIdf)))
 dim(tdm)
 # dtm2ldaformat() treats rows as documents and columns as vocabulary, and
 # coerces the cell values to integers. Feeding it the TF-IDF
 # term-document matrix therefore swapped terms and documents and
 # truncated the fractional weights, so LDA gets its own document-term
 # matrix of raw term counts.
 dtm.counts <- DocumentTermMatrix(news.corpus,
                                  control = c(news.control,
                                              list(weighting = weightTf)))
 #LDA
 # install.packages("topicmodels")
 # install.packages("lda")
 # install.packages("SnowballC")
 # The topicmodels and lda functions are called with `::` below, so no
 # library() call is needed for them.

 n.topics <- 15 # number of topics
 ldaform <- topicmodels::dtm2ldaformat(dtm.counts, omit_empty = TRUE)
 result.lda <- lda::lda.collapsed.gibbs.sampler(
   documents = ldaform$documents,
   K = n.topics,
   vocab = ldaform$vocab,  # used words
   num.iterations = 5000,  # number of Gibbs sweeps
   burnin = 1000,          # sweeps treated as burn-in
   alpha = 0.01,           # Dirichlet prior on per-document topic proportions;
                           # values below 1 favour few topics per document
   eta = 0.01)             # Dirichlet prior on per-topic word distributions

 result.lda$topics
 # One column per topic; each column lists that topic's top words.
 lw <- as.matrix(lda::top.topic.words(result.lda$topics))
 result.lda$topic_sums
 result.lda$document_sums
 tdm
 # Print the five top words of every topic, one topic per line. lw already
 # holds the words themselves, so no lookup into tdm is needed.
 for (j in seq_len(ncol(lw))) {
   cat(j, ": ", paste(head(lw[, j], 5), collapse = ", "), "\n", sep = "")
 }

 #daum review
 # Read the review data and wrap its `x` column in a corpus.
 new.reviews <- read.csv(file = "reviews.csv", stringsAsFactors = FALSE)
 review.corpus <- Corpus(x = VectorSource(x = new.reviews$x))
 # KoNLP is attached here for the Korean tokenizer defined next.
 library(package = KoNLP)

 # Tokenizer for the review corpus: coerces a document to character and
 # returns whatever extractNoun() yields for it.
 ko.words.noun <- function(doc) {
   # Return the result directly. The previous trailing assignment to an
   # unused local made the function return its value invisibly.
   extractNoun(as.character(doc))
 }

 # Sets the mc.cores option to 1 -- presumably to keep the tokenizing step
 # single-process; confirm whether KoNLP still needs this.
 options(mc.cores = 1)
 # Term-document matrix of the reviews, tokenized with ko.words.noun.
 tdm2 <- TermDocumentMatrix(review.corpus,
                            control = list(tokenize = ko.words.noun,
                                           wordLengths = c(1, Inf),
                                           removePunctuation = TRUE,
                                           removeNumbers = TRUE))
 # dtm2ldaformat() treats rows as documents and columns as vocabulary, so
 # the term-document matrix has to be transposed before conversion.
 # Passing tdm2 directly swapped terms and documents.
 dtm2 <- t(tdm2)

 ldaform2 <- topicmodels::dtm2ldaformat(dtm2, omit_empty = TRUE)
 result.lda2 <- lda::lda.collapsed.gibbs.sampler(
   documents = ldaform2$documents,
   K = 3,                  # number of topics
   vocab = ldaform2$vocab, # used words
   num.iterations = 5000,  # number of Gibbs sweeps
   burnin = 1000,          # sweeps treated as burn-in
   alpha = 0.01,           # Dirichlet prior on per-document topic proportions;
                           # values below 1 favour few topics per document
   eta = 0.01)             # Dirichlet prior on per-topic word distributions

 result.lda2$topics
 # One column per topic; each column lists that topic's top words.
 lw2 <- as.matrix(lda::top.topic.words(result.lda2$topics))
 result.lda2$topic_sums
 result.lda2$document_sums

 # Print the top words of the third topic (the old comment said topic 1,
 # but the code has always read column 3).
 shown.topic <- 3
 for (i in seq_len(nrow(lw2))) {
   print(lw2[i, shown.topic])
 }
	# Install tm only when it is missing, so re-running the script does not
	# reinstall the package every time.
	if (!requireNamespace("tm", quietly = TRUE)) {
		install.packages("tm")
	}
	library(tm)

	#mobile
	# Load the news data; its `x` column supplies the document text.
	news <- read.csv("mobile2014.csv", stringsAsFactors = FALSE)
	news.corpus <- Corpus(VectorSource(news$x))

	# Stem every document, then build a TF-IDF weighted term-document matrix
	# with numbers, punctuation and SMART stop words removed.
	news.corpus <- tm_map(news.corpus, stemDocument, language = "english")
	tdm <- TermDocumentMatrix(news.corpus,
		control = list(removeNumbers = TRUE,
			removePunctuation = TRUE,
			stopwords = stopwords("SMART"),
			weighting = weightTfIdf))
	dim(tdm)
	#install.packages("slam")
	library(slam)
	# Sum each term's weight over all documents (margin 2 of the matrix) and
	# rank the terms from heaviest to lightest.
	word.count <- as.array(rollup(tdm, 2))
	word.order <- order(word.count, decreasing = TRUE)
	# head() rather than word.order[1:n]: when the vocabulary holds fewer than
	# n terms, 1:n would pad the index vector with NA.
	freq.word <- head(word.order, 30)
	row.names(tdm[freq.word, ])
	# Row indices of the (up to) 1000 heaviest terms.
	freq.word <- head(word.order, 1000)

	# latent semantic analysis
	# Install lsa only when it is missing.
	if (!requireNamespace("lsa", quietly = TRUE)) {
		install.packages("lsa")
	}
	library(lsa)
	n.dims <- 30 # number of LSA dimensions to keep
	# The decomposition is run on the rows selected by freq.word only. The
	# script used to call lsa(tdm, 30) on the full matrix first, but that
	# result was overwritten immediately, so the expensive call was wasted
	# (and could exhaust memory). Use lsa(tdm, n.dims) by hand if the full
	# matrix is wanted.
	news.lsa <- lsa(tdm[freq.word, ], n.dims)
	news.lsa$tk[, 1] # first dimension
	# Show the ten terms with the largest absolute loading on each dimension.
	for (i in seq_len(ncol(news.lsa$tk))) {
		print(i)
		importance <- order(abs(news.lsa$tk[, i]), decreasing = TRUE)
		print(news.lsa$tk[head(importance, 10), i])
	}

	# Install GPArotation only when it is missing.
	if (!requireNamespace("GPArotation", quietly = TRUE)) {
		install.packages("GPArotation")
	}
	library(GPArotation) # varimax rotation
	# Rotate the LSA term matrix and keep the rotated loadings.
	tk <- Varimax(news.lsa$tk)$loadings
	# Show the ten terms with the largest absolute loading on each rotated
	# dimension; the loop follows the actual number of columns in tk.
	for (i in seq_len(ncol(tk))) {
		print(i)
		importance <- order(abs(tk[, i]), decreasing = TRUE)
		print(tk[head(importance, 10), i])
	}


	#mobile
	# Rebuild the stemmed news corpus for the LDA run.
	news <- read.csv("mobile2014.csv", stringsAsFactors = FALSE)
	news.corpus <- Corpus(VectorSource(news$x))
	news.corpus <- tm_map(news.corpus, stemDocument, language = "english")
	# Preprocessing options shared by both matrices built below.
	news.control <- list(removeNumbers = TRUE,
		removePunctuation = TRUE,
		wordLengths = c(3, Inf),
		stopwords = stopwords("SMART"))
	# TF-IDF weighted term-document matrix, kept for inspection.
	tdm <- TermDocumentMatrix(news.corpus,
		control = c(news.control, list(weighting = weightTfIdf)))
	dim(tdm)
	# dtm2ldaformat() treats rows as documents and columns as vocabulary, and
	# coerces the cell values to integers. Feeding it the TF-IDF
	# term-document matrix therefore swapped terms and documents and
	# truncated the fractional weights, so LDA gets its own document-term
	# matrix of raw term counts.
	dtm.counts <- DocumentTermMatrix(news.corpus,
		control = c(news.control, list(weighting = weightTf)))
	#LDA
	# install.packages("topicmodels")
	# install.packages("lda")
	# install.packages("SnowballC")
	# The topicmodels and lda functions are called with `::` below, so no
	# library() call is needed for them.

	n.topics <- 15 # number of topics
	ldaform <- topicmodels::dtm2ldaformat(dtm.counts, omit_empty = TRUE)
	result.lda <- lda::lda.collapsed.gibbs.sampler(
		documents = ldaform$documents,
		K = n.topics,
		vocab = ldaform$vocab, # used words
		num.iterations = 5000, # number of Gibbs sweeps
		burnin = 1000, # sweeps treated as burn-in
		# Dirichlet prior on per-document topic proportions; values below 1
		# favour few topics per document.
		alpha = 0.01,
		# Dirichlet prior on per-topic word distributions.
		eta = 0.01)

	result.lda$topics
	# One column per topic; each column lists that topic's top words.
	lw <- as.matrix(lda::top.topic.words(result.lda$topics))
	result.lda$topic_sums
	result.lda$document_sums
	tdm
	# Print the five top words of every topic, one topic per line. lw already
	# holds the words themselves, so no lookup into tdm is needed.
	for (j in seq_len(ncol(lw))) {
		cat(j, ": ", paste(head(lw[, j], 5), collapse = ", "), "\n", sep = "")
	}

	#daum review
	# Read the review data and wrap its `x` column in a corpus.
	new.reviews <- read.csv(file = "reviews.csv", stringsAsFactors = FALSE)
	review.corpus <- Corpus(x = VectorSource(x = new.reviews$x))
	# KoNLP is attached here for the Korean tokenizer defined next.
	library(package = KoNLP)

	# Tokenizer for the review corpus: coerces a document to character and
	# returns whatever extractNoun() yields for it.
	ko.words.noun <- function(doc) {
		# Return the result directly. The previous trailing assignment to an
		# unused local made the function return its value invisibly.
		extractNoun(as.character(doc))
	}

	# Sets the mc.cores option to 1 -- presumably to keep the tokenizing step
	# single-process; confirm whether KoNLP still needs this.
	options(mc.cores = 1)
	# Term-document matrix of the reviews, tokenized with ko.words.noun.
	tdm2 <- TermDocumentMatrix(review.corpus,
		control = list(tokenize = ko.words.noun,
			wordLengths = c(1, Inf),
			removePunctuation = TRUE,
			removeNumbers = TRUE))
	# dtm2ldaformat() treats rows as documents and columns as vocabulary, so
	# the term-document matrix has to be transposed before conversion.
	# Passing tdm2 directly swapped terms and documents.
	dtm2 <- t(tdm2)

	ldaform2 <- topicmodels::dtm2ldaformat(dtm2, omit_empty = TRUE)
	result.lda2 <- lda::lda.collapsed.gibbs.sampler(
		documents = ldaform2$documents,
		K = 3, # number of topics
		vocab = ldaform2$vocab, # used words
		num.iterations = 5000, # number of Gibbs sweeps
		burnin = 1000, # sweeps treated as burn-in
		# Dirichlet prior on per-document topic proportions; values below 1
		# favour few topics per document.
		alpha = 0.01,
		# Dirichlet prior on per-topic word distributions.
		eta = 0.01)

	result.lda2$topics
	# One column per topic; each column lists that topic's top words.
	lw2 <- as.matrix(lda::top.topic.words(result.lda2$topics))
	result.lda2$topic_sums
	result.lda2$document_sums

	# Print the top words of the third topic (the old comment said topic 1,
	# but the code has always read column 3).
	shown.topic <- 3
	for (i in seq_len(nrow(lw2))) {
		print(lw2[i, shown.topic])
	}