paulusm · December 6, 2017 10:11
diff --git a/text-mining.R b/text-mining.R
 # Based on "Introduction to Text Mining with R for Information Professionals"
 # http://journal.code4lib.org/articles/11626

 # Topic Modelling using Tidy Text from
 # http://tidytextmining.com/topicmodeling.html

 # Interview data from
 # https://figshare.com/authors/Kevin_Sanders/601331 

 # Uncomment to install these if needed
 #install.packages("SnowballC")
 #install.packages("wordcloud")
 #install.packages("tm")
 #install.packages("RColorBrewer")
 #install.packages("topicmodels")
 #install.packages("tidytext")
 #install.packages("dplyr")
 #install.packages("ggplot2")

 library(tm)
 library(SnowballC)
 library(wordcloud)

 library(topicmodels)
 library(tidytext)
 library(dplyr)
 library(ggplot2)
 #library(RColorBrewer)


 # Load corpus of texts
 int_corpus <- Corpus(DirSource('./corpus'))

 # Clean and prepare texts
 int_corpus <- tm_map(int_corpus, stripWhitespace)
 int_corpus <- tm_map(int_corpus, content_transformer(tolower))
 int_corpus <- tm_map(int_corpus, removeWords, stopwords("english"))
 int_corpus <- tm_map(int_corpus, removeNumbers)
 int_corpus <- tm_map(int_corpus, removePunctuation)
 int_corpus <- tm_map(int_corpus, stemDocument)


 # Create matrix, trim & preview
 dtm <- DocumentTermMatrix(int_corpus)
 dtm <- removeSparseTerms(dtm, sparse=0.75)
 dtm.matrix<-as.matrix(dtm)
 head(dtm.matrix)

 #weight by tf/idf
 dtm_weighted <- weightTfIdf(dtm, normalize = TRUE)
 head(as.matrix(dtm_weighted))

 # Frequent terms
 freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
 head(freq)


 # Wordcloud
 wordcloud::wordcloud(names(freq), freq, min.freq=50, max.words=Inf, random.order=FALSE, 
          colors=brewer.pal(8, "Accent"), scale=c(7,.4), rot.per=0)

 # Find correlated terms
 findAssocs(dtm,term="impact",0.9)

 # Document correlations
 d <- dist(as.matrix(dtm))
 hc <- hclust(d)
 plot(hc)

 # topic model
 # set a seed so that the output of the model is predictable
 int_lda <- LDA(dtm, k = 3, control = list(seed = 1234))
 int_lda

 int_topics <- tidy(int_lda, matrix = "beta")
 head(int_topics)

 int_top_terms <- int_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

 int_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
	# Based on "Introduction to Text Mining with R for Information Professionals"
	# http://journal.code4lib.org/articles/11626

	# Topic Modelling using Tidy Text from
	# http://tidytextmining.com/topicmodeling.html

	# Interview data from
	# https://figshare.com/authors/Kevin_Sanders/601331

	# Uncomment to install these if needed
	#install.packages("SnowballC")
	#install.packages("wordcloud")
	#install.packages("tm")
	#install.packages("RColorBrewer")
	#install.packages("topicmodels")
	#install.packages("tidytext")
	#install.packages("dplyr")
	#install.packages("ggplot2")

	library(tm)
	library(SnowballC)
	library(wordcloud)

	library(topicmodels)
	library(tidytext)
	library(dplyr)
	library(ggplot2)
	#library(RColorBrewer)


	# Load corpus of texts
	int_corpus <- Corpus(DirSource('./corpus'))

	# Clean and prepare texts
	int_corpus <- tm_map(int_corpus, stripWhitespace)
	int_corpus <- tm_map(int_corpus, content_transformer(tolower))
	int_corpus <- tm_map(int_corpus, removeWords, stopwords("english"))
	int_corpus <- tm_map(int_corpus, removeNumbers)
	int_corpus <- tm_map(int_corpus, removePunctuation)
	int_corpus <- tm_map(int_corpus, stemDocument)


	# Create matrix, trim & preview
	dtm <- DocumentTermMatrix(int_corpus)
	dtm <- removeSparseTerms(dtm, sparse=0.75)
	dtm.matrix<-as.matrix(dtm)
	head(dtm.matrix)

	#weight by tf/idf
	dtm_weighted <- weightTfIdf(dtm, normalize = TRUE)
	head(as.matrix(dtm_weighted))

	# Frequent terms
	freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
	head(freq)


	# Wordcloud
	wordcloud::wordcloud(names(freq), freq, min.freq=50, max.words=Inf, random.order=FALSE,
	colors=brewer.pal(8, "Accent"), scale=c(7,.4), rot.per=0)

	# Find correlated terms
	findAssocs(dtm,term="impact",0.9)

	# Document correlations
	d <- dist(as.matrix(dtm))
	hc <- hclust(d)
	plot(hc)

	# topic model
	# set a seed so that the output of the model is predictable
	int_lda <- LDA(dtm, k = 3, control = list(seed = 1234))
	int_lda

	int_topics <- tidy(int_lda, matrix = "beta")
	head(int_topics)

	int_top_terms <- int_topics %>%
	group_by(topic) %>%
	top_n(10, beta) %>%
	ungroup() %>%
	arrange(topic, -beta)

	int_top_terms %>%
	mutate(term = reorder(term, beta)) %>%
	ggplot(aes(term, beta, fill = factor(topic))) +
	geom_col(show.legend = FALSE) +
	facet_wrap(~ topic, scales = "free") +
	coord_flip()