library(tidyverse)
library(scales)
library(tidytext)
# Read the CSV, then append `season_cat`: the `season` column converted to a
# factor. The original `season` column is left in place.
theoffice <- mutate(
  read_csv("data/theoffice.csv"),
  season_cat = factor(season)
)
# Tokenize the `text` column into a `word` column, then drop every row whose
# word is listed in `stop_words`.
all_words <- theoffice |>
  unnest_tokens(word, text) |>
  anti_join(stop_words, by = "word")
# Word counts for the four listed characters, with tf-idf columns added.
# Each character is treated as one "document" in the tf-idf calculation.
character_tf_idf <- all_words |>
  filter(character %in% c("Michael", "Jim", "Pam", "Dwight")) |>
  # `n` is the count column created here and consumed by bind_tf_idf() below
  count(character, word, sort = TRUE) |>
  bind_tf_idf(term = word, document = character, n = n)
# Keep, for each character, the words with the highest tf-idf scores.
#
# `with_ties = FALSE` caps every character at 10 rows. With the default
# (`with_ties = TRUE`), slice_max() also returns every row tied with the 10th
# score, so a panel could show more bars than the "Top 10" in the plot title.
# When scores tie at the cutoff, the rows that come first in the data are kept.
#
# reorder_within() then turns `word` into a factor ordered by `tf_idf`
# separately inside each character, so every facet can be sorted on its own.
most_unique_by_character <- character_tf_idf |>
  group_by(character) |>
  slice_max(tf_idf, n = 10, with_ties = FALSE) |>
  ungroup() |>
  mutate(word = reorder_within(word, tf_idf, character))
# Faceted horizontal bar chart: one panel per character, one bar per selected
# word, bar length = tf-idf score.
most_unique_by_character |>
  ggplot(aes(x = tf_idf, y = word, fill = character)) +
  geom_col() +
  # Free scales let each panel have its own set of words and its own x range
  facet_wrap(vars(character), scales = "free") +
  # Companion to the reorder_within() call that built `word`
  scale_y_reordered() +
  # Fill only repeats the facet label, so the legend is dropped
  guides(fill = "none") +
  labs(
    title = "Top 10 most unique words spoken by main characters",
    # The literal's continuation lines stay flush-left so the string itself is
    # unchanged; str_wrap() re-flows the text to 70 characters per line.
    subtitle = str_wrap(
      glue::glue(
        "Values are term frequency-inverse document frequency (tf-idf) scores,
where each character's words constitute a document. On their own,
raw tf-idf scores are impossible to interpret; higher values indicate
words that appear frequently for a character but rarely for others."
      ),
      width = 70
    ),
    x = NULL,
    y = NULL
  ) +
  theme_bw() +
  # Must follow theme_bw(), which would otherwise reset these overrides.
  # The x-axis text and ticks are hidden; the subtitle explains why raw values
  # are not meant to be read off the axis.
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    plot.title = element_text(face = "bold")
  )
Created
April 28, 2026 04:25
-
-
Save andrewheiss/a60e4f693718570c732672181083fbc7 to your computer and use it in GitHub Desktop.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
