library(tidyverse)
library(tidytext)
library(stm)
#> stm v1.3.6 successfully loaded. See ?stm for help.
#> Papers, resources, and other materials at structuraltopicmodel.com
## build a sparse book-by-word count matrix from the janeaustenr corpus:
## one row per book, one column per word, cell values are word counts
austen_sparse <- janeaustenr::austen_books() %>%
  unnest_tokens(word, text) %>%
  # name the join key explicitly: this avoids the "Joining, by = ..." message
  # and keeps the join from silently picking up other shared column names
  anti_join(stop_words, by = "word") %>%
  count(book, word) %>%
  cast_sparse(book, word, n)
## fit a 12-topic model on the sparse book-by-word counts,
## starting from stm's "Spectral" initialization
topic_model <- stm(
  documents = austen_sparse,
  K = 12,
  init.type = "Spectral"
)
#> Beginning Spectral Initialization
#> Calculating the gram matrix...
#> Using only 10000 most frequent terms during initialization...
#> Finding anchor words...
#> ............
#> Recovering initialization...
#> ....................................................................................................
#> Initialization complete.
#> ......
#> Completed E-Step (0 seconds).
#> Completed M-Step.
#> Completing Iteration 1 (approx. per word bound = -7.798)
#> ......
#> Completed E-Step (0 seconds).
#> Completed M-Step.
#> Completing Iteration 2 (approx. per word bound = -7.693, relative change = 1.348e-02)
#> ......
#> Completed E-Step (0 seconds).
#> Completed M-Step.
#> Model Converged
## compute FREX words
logbeta <- topic_model$beta$logbeta[[1]]
word_counts <- topic_model$settings$dim$wcounts$x
vocab <- topic_model$vocab
# frex holds one column per topic; its values are used below as indices into
# `vocab`, with the first rows being the top-ranked FREX words of each topic
frex <- calcfrex(logbeta, w = 0.5, wordcounts = word_counts)
# name the columns by topic number so the reshape below works for any K and
# does not depend on tibble's automatic V1, V2, ... name repair
colnames(frex) <- as.character(seq_len(ncol(frex)))

as_tibble(frex) %>%
  ## top 10 FREX words for each topic
  slice_head(n = 10) %>%
  ## reshape to tidy format: one row per (topic, rank) pair
  mutate(rank = row_number()) %>%
  pivot_longer(-rank, names_to = "topic_id", values_to = "vocab_index") %>%
  # sort on the numeric topic id; sorting the "topic N" labels as text would
  # put "topic 10" ahead of "topic 2"
  mutate(topic_id = as.integer(topic_id)) %>%
  arrange(topic_id, rank) %>%
  transmute(
    topic = str_c("topic ", topic_id),
    rank,
    word = vocab[vocab_index]
  )
#> # A tibble: 120 × 3
#> topic rank word
#> <chr> <int> <chr>
#> 1 topic 1 1 elliot
#> 2 topic 1 2 wentworth
#> 3 topic 1 3 walter
#> 4 topic 1 4 anne
#> 5 topic 1 5 russell
#> 6 topic 1 6 musgrove
#> 7 topic 1 7 louisa
#> 8 topic 1 8 charles
#> 9 topic 1 9 uppercross
#> 10 topic 1 10 kellynch
#> # … with 110 more rows
#> # ℹ Use `print(n = ...)` to see more rows
# Created on 2022-08-09 by the reprex package (v2.0.1)