library(tidyverse)
library(tidytext)
library(tidylo)
library(silgelib)
# Download the Beyoncé lyrics table from the TidyTuesday repository
# (2020-09-29 data set). The recorded column specification below shows the
# lyric text in `line` plus song/artist identifiers and names.
beyonce_lyrics <- read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-29/beyonce_lyrics.csv"
)
#> Parsed with column specification:
#> cols(
#> line = col_character(),
#> song_id = col_double(),
#> song_name = col_character(),
#> artist_id = col_double(),
#> artist_name = col_character(),
#> song_line = col_double()
#> )
# Download the Taylor Swift lyrics table from the same TidyTuesday data set.
# Column names here are capitalised (`Artist`, `Album`, `Title`, `Lyrics`),
# unlike the Beyoncé table; presumably each row holds one song's full lyrics
# in `Lyrics` -- confirm against the data dictionary.
taylor_swift_lyrics <- read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-29/taylor_swift_lyrics.csv"
)
#> Parsed with column specification:
#> cols(
#> Artist = col_character(),
#> Album = col_character(),
#> Title = col_character(),
#> Lyrics = col_character()
#> )
# Tokenise the Beyoncé lyric lines into a `word` column, then keep a single
# row per (word, song_id) pair so that a word is counted at most once per
# song. Only the word and the artist name are carried forward.
tidy_beyonce <- beyonce_lyrics %>%
  unnest_tokens(output = word, input = line) %>%
  distinct(word, song_id, .keep_all = TRUE) %>%
  select(word, artist_name)
# Tokenise the Taylor Swift lyrics into a `word` column, then keep a single
# row per (word, Title) pair so that a word is counted at most once per song
# title. `Artist` is renamed to `artist_name` so the result lines up with the
# Beyoncé table when the two are stacked.
tidy_swift <- taylor_swift_lyrics %>%
  unnest_tokens(output = word, input = Lyrics) %>%
  distinct(word, Title, .keep_all = TRUE) %>%
  select(word, artist_name = Artist)
# Stack both artists' de-duplicated word lists, remove stop words, and count
# how often each word occurs per artist (most frequent first). Because the
# inputs hold each word at most once per song, `n` is effectively the number
# of songs in which the word appears.
#
# The join key is stated explicitly: the anti-join should match on `word`
# only, regardless of which other column names the two tables happen to
# share, and naming the key also silences the "Joining, by" message.
word_counts <- bind_rows(tidy_beyonce, tidy_swift) %>%
  anti_join(get_stopwords(), by = "word") %>%
  count(artist_name, word, sort = TRUE) %>%
  # Drop rows whose token is missing.
  filter(!is.na(word))
word_counts
#> # A tibble: 9,168 x 3
#> artist_name word n
#> <chr> <chr> <int>
#> 1 Beyoncé know 251
#> 2 Beyoncé like 238
#> 3 Beyoncé baby 228
#> 4 Beyoncé love 226
#> 5 Beyoncé see 223
#> 6 Beyoncé just 213
#> 7 Beyoncé can 206
#> 8 Beyoncé cause 196
#> 9 Beyoncé got 196
#> 10 Beyoncé get 181
#> # … with 9,158 more rows
# Add a `log_odds_weighted` column: the weighted log odds of each word
# (feature) for each artist (set), computed from the per-artist counts `n`.
word_log_odds <- bind_log_odds(
  word_counts,
  set = artist_name,
  feature = word,
  n = n
)
# Show the words with the largest weighted log odds first.
word_log_odds %>%
  arrange(desc(log_odds_weighted))
#> # A tibble: 9,168 x 4
#> artist_name word n log_odds_weighted
#> <chr> <chr> <int> <dbl>
#> 1 Beyoncé y'all 55 3.39
#> 2 Beyoncé b 44 3.03
#> 3 Beyoncé yo 43 3.00
#> 4 Taylor Swift like 101 2.93
#> 5 Taylor Swift said 46 2.92
#> 6 Taylor Swift never 72 2.91
#> 7 Taylor Swift just 92 2.83
#> 8 Taylor Swift back 70 2.69
#> 9 Taylor Swift burned 7 2.66
#> 10 Taylor Swift stupid 7 2.66
#> # … with 9,158 more rows
# Lollipop chart of the most distinctive words per artist.
#
# Data prep: take the rows with the 10 largest weighted log odds within each
# artist (ties are kept by slice_max's default, so a panel can show more than
# 10 words), then reorder the words within each artist so every facet is
# sorted by its own values.
word_log_odds %>%
  group_by(artist_name) %>%
  slice_max(order_by = log_odds_weighted, n = 10) %>%
  ungroup() %>%
  mutate(
    word = reorder_within(
      x = word,
      by = log_odds_weighted,
      within = artist_name
    )
  ) %>%
  ggplot(aes(x = log_odds_weighted, y = word, color = artist_name)) +
  # Stem of the lollipop: a horizontal segment from 0 to the word's value.
  # NOTE(review): newer ggplot2 releases prefer `linewidth` over `size` for
  # line geoms -- confirm against the installed version before changing.
  geom_segment(
    aes(x = 0, xend = log_odds_weighted, y = word, yend = word),
    size = 1.8,
    alpha = 0.7
  ) +
  # Head of the lollipop.
  geom_point(size = 4, show.legend = FALSE) +
  # One panel per artist, each with its own word axis.
  facet_wrap(vars(artist_name), scales = "free_y") +
  # Strip the suffix that reorder_within() appended to the word labels.
  scale_y_reordered() +
  scale_color_manual(values = c("#D55F8F", "#13A699")) +
  scale_x_continuous(expand = c(0, 0.1)) +
  theme_plex() +
  theme(
    legend.position = "none",
    panel.grid.major = element_blank()
  ) +
  labs(
    title = "What words are most distinctive for Beyoncé and Taylor Swift?",
    subtitle = "For words counted once per song",
    x = "Weighted log odds (empirical Bayes)",
    y = NULL
  )
# Created on 2020-10-01 by the reprex package (v0.3.0.9001)