Skip to content

Instantly share code, notes, and snippets.

@abikoushi
Created October 31, 2025 10:00
Show Gist options
  • Select an option

  • Save abikoushi/9a2bd864898344d38a4da65ae76f1de6 to your computer and use it in GitHub Desktop.

Select an option

Save abikoushi/9a2bd864898344d38a4da65ae76f1de6 to your computer and use it in GitHub Desktop.
An example of TF-IDF
library(janeaustenr)
library(tidytext)
library(dplyr)
library(ggplot2)
# Per-book word frequencies for Jane Austen's novels, scored with tf-idf.
# Columns built here: book, word, n (occurrences of the word in the book) and
# total (sum of n over the book); bind_tf_idf() then appends its scores,
# including the tf_idf column used for the final descending sort.
book_words <- austen_books() %>%
  unnest_tokens(output = word, input = text) %>%  # one row per word token
  count(book, word, sort = TRUE) %>%
  add_count(book, wt = n, name = "total") %>%     # words per book, row order kept
  bind_tf_idf(term = word, document = book, n = n) %>%
  arrange(desc(tf_idf))
# Recompute tf-idf by hand so it can be compared with bind_tf_idf()'s output:
#   tf2     = n / total
#   idf2    = log(number of books) - log(number of books containing the word)
#   tf_idf2 = idf2 * tf2
book_tf_idf <- book_words %>%
  mutate(tf2 = n / total) %>%
  group_by(word) %>%
  # Grouped by word, n_distinct(book) counts the books that contain the word.
  mutate(idf2 = -log(n_distinct(book))) %>%
  ungroup() %>%
  # Ungrouped, n_distinct(book) counts all books. Expressions inside a single
  # mutate() run in order, so tf_idf2 sees the idf2 updated on the line above.
  mutate(
    idf2 = idf2 + log(n_distinct(book)),
    tf_idf2 = idf2 * tf2
  )
## check
# Scatter the hand-computed score (tf_idf2) against the one from bind_tf_idf()
# (tf_idf). The dashed line is y = x, so agreement between the two shows up as
# every point lying on it.
# Both columns live in book_tf_idf; no object named book_tf_idf2 is ever
# created in this script, so that is the data frame to plot.
ggplot(book_tf_idf, aes(x = tf_idf2, y = tf_idf)) +
  geom_abline(slope = 1, intercept = 0, linetype = 2) +
  geom_point(alpha = 0.2) +
  theme_classic(14)
# Keep the ten highest tf-idf words of every book, then drop the grouping so
# later steps operate on a plain (ungrouped) data frame.
# NOTE: with slice_max()'s default tie handling a book can contribute more
# than ten rows when several words tie at the cut-off.
book_top <- book_tf_idf %>%
  group_by(book) %>%
  slice_max(order_by = tf_idf, n = 10) %>%
  ungroup()
# Lollipop chart of the top tf-idf words, one facet per book.
# Words are ordered with tidytext's reorder_within() rather than
# forcats::fct_reorder(): forcats is never attached by this script, and a
# single global factor ordering would misplace a word that appears in the top
# list of more than one book. reorder_within() ranks each word inside its own
# book; scale_y_reordered() removes the suffix it appends to the axis labels.
book_top %>%
  mutate(word_in_book = reorder_within(word, tf_idf, book)) %>%
  ggplot(aes(x = tf_idf, y = word_in_book)) +
  geom_point() +
  # Stem of each lollipop: from the point back to x = 0 on the same row.
  geom_segment(aes(xend = 0, yend = word_in_book)) +
  scale_y_reordered() +
  facet_wrap(~book, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = "word") +
  theme_classic(14)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment