office-tf-idf.md

library(tidyverse)
library(scales)
library(tidytext)

# Load the Office data and add `season_cat`, a factor copy of `season`, so the
# season can be treated as a discrete category instead of a number
theoffice <- mutate(
  read_csv("data/theoffice.csv"),
  season_cat = factor(season)
)

# Split the `text` column into one lowercase token per row (`word`), then drop
# every row whose word appears in the `stop_words` table
all_words <- anti_join(
  unnest_tokens(theoffice, output = word, input = text),
  stop_words,
  by = "word"
)

# Word counts for four named characters, scored with tf-idf. Each character's
# combined words act as one "document", so a high score marks a word that is
# frequent for that character and uncommon for the other three.
character_tf_idf <- all_words |>
  filter(
    character %in% c("Michael", "Jim", "Pam", "Dwight")
  ) |>
  count(character, word, sort = TRUE) |>
  bind_tf_idf(term = word, document = character, n = n)

# Keep each character's ten highest tf-idf words.
#
# `with_ties = FALSE` caps every group at exactly ten rows. By default
# slice_max() returns *all* rows tied at the cutoff, and tf-idf ties are common
# (words with equal counts and equal idf get identical scores), which would
# put more than ten bars in a facet of a chart titled "Top 10".
most_unique_by_character <- character_tf_idf |>
  group_by(character) |>
  slice_max(tf_idf, n = 10, with_ties = FALSE) |>
  ungroup() |>
  # Order words by tf-idf separately within each character so that every facet
  # of the plot can sort its own bars; pairs with scale_y_reordered()
  mutate(word = reorder_within(word, tf_idf, character))

# Horizontal bar chart of the top tf-idf words, one facet per character
ggplot(
  data = most_unique_by_character,
  mapping = aes(x = tf_idf, y = word, fill = character)
) +
  # The facet strips already name each character, so no fill legend is drawn
  geom_col(show.legend = FALSE) +
  # Free scales let every facet show its own set of words
  facet_wrap(~character, scales = "free") +
  # Companion to reorder_within(): cleans up the within-facet ordered labels
  scale_y_reordered() +
  labs(
    title = "Top 10 most unique words spoken by main characters",
    subtitle = glue::glue(
        "Values are term frequency-inverse document frequency (tf-idf) scores, 
         where each character's words constitute a document. On their own, 
         raw tf-idf scores are impossible to interpret; higher values indicate 
         words that appear frequently for a character but rarely for others."
      ) |>
      str_wrap(width = 70),
    x = NULL,
    y = NULL
  ) +
  theme_bw() +
  # Raw tf-idf magnitudes are not meaningful on their own, so hide the x axis
  # labels and ticks and leave only the relative bar lengths
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank(),
    plot.title = element_text(face = "bold")
  )
andrewheiss/office-tf-idf.md

Select an option

No results found

Select an option

No results found