Skip to content

Instantly share code, notes, and snippets.

@paulusm
Last active December 6, 2017 10:11
Show Gist options
  • Save paulusm/fef3dc5e3407cd0f3052cfe62d78bd33 to your computer and use it in GitHub Desktop.
Save paulusm/fef3dc5e3407cd0f3052cfe62d78bd33 to your computer and use it in GitHub Desktop.
# Based on "Introduction to Text Mining with R for Information Professionals"
# http://journal.code4lib.org/articles/11626
# Topic Modelling using Tidy Text from
# http://tidytextmining.com/topicmodeling.html
# Interview data from
# https://figshare.com/authors/Kevin_Sanders/601331
# Uncomment to install these if needed
#install.packages("SnowballC")
#install.packages("wordcloud")
#install.packages("tm")
#install.packages("RColorBrewer")
#install.packages("topicmodels")
#install.packages("tidytext")
#install.packages("dplyr")
#install.packages("ggplot2")
library(tm)
library(SnowballC)
library(wordcloud)
library(topicmodels)
library(tidytext)
library(dplyr)
library(ggplot2)
#library(RColorBrewer)
# Load corpus of texts
int_corpus <- Corpus(DirSource('./corpus'))
# Clean and prepare texts
int_corpus <- tm_map(int_corpus, stripWhitespace)
int_corpus <- tm_map(int_corpus, content_transformer(tolower))
int_corpus <- tm_map(int_corpus, removeWords, stopwords("english"))
int_corpus <- tm_map(int_corpus, removeNumbers)
int_corpus <- tm_map(int_corpus, removePunctuation)
int_corpus <- tm_map(int_corpus, stemDocument)
# Create matrix, trim & preview
dtm <- DocumentTermMatrix(int_corpus)
dtm <- removeSparseTerms(dtm, sparse=0.75)
dtm.matrix<-as.matrix(dtm)
head(dtm.matrix)
#weight by tf/idf
dtm_weighted <- weightTfIdf(dtm, normalize = TRUE)
head(as.matrix(dtm_weighted))
# Frequent terms
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
head(freq)
# Wordcloud
wordcloud::wordcloud(names(freq), freq, min.freq=50, max.words=Inf, random.order=FALSE,
colors=brewer.pal(8, "Accent"), scale=c(7,.4), rot.per=0)
# Find correlated terms
findAssocs(dtm,term="impact",0.9)
# Document correlations
d <- dist(as.matrix(dtm))
hc <- hclust(d)
plot(hc)
# topic model
# set a seed so that the output of the model is predictable
int_lda <- LDA(dtm, k = 3, control = list(seed = 1234))
int_lda
int_topics <- tidy(int_lda, matrix = "beta")
head(int_topics)
int_top_terms <- int_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
int_top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment