Last active
December 6, 2017 10:11
-
-
Save paulusm/fef3dc5e3407cd0f3052cfe62d78bd33 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Based on "Introduction to Text Mining with R for Information Professionals" | |
# http://journal.code4lib.org/articles/11626 | |
# Topic Modelling using Tidy Text from | |
# http://tidytextmining.com/topicmodeling.html | |
# Interview data from | |
# https://figshare.com/authors/Kevin_Sanders/601331 | |
# Uncomment to install these if needed | |
#install.packages("SnowballC") | |
#install.packages("wordcloud") | |
#install.packages("tm") | |
#install.packages("RColorBrewer") | |
#install.packages("topicmodels") | |
#install.packages("tidytext") | |
#install.packages("dplyr") | |
#install.packages("ggplot2") | |
library(tm) | |
library(SnowballC) | |
library(wordcloud) | |
library(topicmodels) | |
library(tidytext) | |
library(dplyr) | |
library(ggplot2) | |
#library(RColorBrewer) | |
# Load corpus of texts | |
int_corpus <- Corpus(DirSource('./corpus')) | |
# Clean and prepare texts | |
int_corpus <- tm_map(int_corpus, stripWhitespace) | |
int_corpus <- tm_map(int_corpus, content_transformer(tolower)) | |
int_corpus <- tm_map(int_corpus, removeWords, stopwords("english")) | |
int_corpus <- tm_map(int_corpus, removeNumbers) | |
int_corpus <- tm_map(int_corpus, removePunctuation) | |
int_corpus <- tm_map(int_corpus, stemDocument) | |
# Create matrix, trim & preview | |
dtm <- DocumentTermMatrix(int_corpus) | |
dtm <- removeSparseTerms(dtm, sparse=0.75) | |
dtm.matrix<-as.matrix(dtm) | |
head(dtm.matrix) | |
#weight by tf/idf | |
dtm_weighted <- weightTfIdf(dtm, normalize = TRUE) | |
head(as.matrix(dtm_weighted)) | |
# Frequent terms | |
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE) | |
head(freq) | |
# Wordcloud | |
wordcloud::wordcloud(names(freq), freq, min.freq=50, max.words=Inf, random.order=FALSE, | |
colors=brewer.pal(8, "Accent"), scale=c(7,.4), rot.per=0) | |
# Find correlated terms | |
findAssocs(dtm,term="impact",0.9) | |
# Document correlations | |
d <- dist(as.matrix(dtm)) | |
hc <- hclust(d) | |
plot(hc) | |
# topic model | |
# set a seed so that the output of the model is predictable | |
int_lda <- LDA(dtm, k = 3, control = list(seed = 1234)) | |
int_lda | |
int_topics <- tidy(int_lda, matrix = "beta") | |
head(int_topics) | |
int_top_terms <- int_topics %>% | |
group_by(topic) %>% | |
top_n(10, beta) %>% | |
ungroup() %>% | |
arrange(topic, -beta) | |
int_top_terms %>% | |
mutate(term = reorder(term, beta)) %>% | |
ggplot(aes(term, beta, fill = factor(topic))) + | |
geom_col(show.legend = FALSE) + | |
facet_wrap(~ topic, scales = "free") + | |
coord_flip() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment