Skip to content

Instantly share code, notes, and snippets.

@juliasilge
Last active December 1, 2020 06:00
Show Gist options
  • Save juliasilge/76086c707db0b471bf92d05c6d8447a5 to your computer and use it in GitHub Desktop.
Save juliasilge/76086c707db0b471bf92d05c6d8447a5 to your computer and use it in GitHub Desktop.
Beyoncé and Taylor Swift Lyrics
library(tidyverse)
library(tidytext)
library(tidylo)
library(silgelib)

# Read the Beyoncé lyrics CSV from the TidyTuesday 2020-09-29 data folder.
# readr guesses the column types and reports them in the message below.
beyonce_lyrics <- read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-29/beyonce_lyrics.csv"
)
#> Parsed with column specification:
#> cols(
#>   line = col_character(),
#>   song_id = col_double(),
#>   song_name = col_character(),
#>   artist_id = col_double(),
#>   artist_name = col_character(),
#>   song_line = col_double()
#> )
# Read the Taylor Swift lyrics CSV from the same TidyTuesday folder.
# Column names here are capitalised (Artist, Album, Title, Lyrics), unlike
# the Beyoncé file.
taylor_swift_lyrics <- read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-29/taylor_swift_lyrics.csv"
)
#> Parsed with column specification:
#> cols(
#>   Artist = col_character(),
#>   Album = col_character(),
#>   Title = col_character(),
#>   Lyrics = col_character()
#> )

# Tokenise the Beyoncé lyric text into words, then keep each word only once
# per song (keyed by song_id) so a word repeated within a song counts once.
tidy_beyonce <- beyonce_lyrics %>%
  unnest_tokens(output = word, input = line) %>%
  distinct(song_id, word, .keep_all = TRUE) %>%
  select(word, artist_name)

# Tokenise the Taylor Swift lyrics into words, then keep each word only once
# per song (keyed by Title). Artist is renamed to artist_name so the result
# has the same columns as tidy_beyonce.
tidy_swift <- taylor_swift_lyrics %>%
  unnest_tokens(output = word, input = Lyrics) %>%
  distinct(Title, word, .keep_all = TRUE) %>%
  select(word, artist_name = Artist)

# Stack both artists' once-per-song word lists, remove stop words, and count
# how many songs each artist uses each word in (largest counts first).
word_counts <- bind_rows(tidy_beyonce, tidy_swift) %>%
  # Name the join key explicitly: an implicit natural join would silently
  # match on any other column the two tables happened to share, and it also
  # prints a "Joining, by" message on every run.
  anti_join(get_stopwords(), by = "word") %>%
  count(artist_name, word, sort = TRUE) %>%
  # Drop any group whose token is missing.
  filter(!is.na(word))

word_counts
#> # A tibble: 9,168 x 3
#>    artist_name word      n
#>    <chr>       <chr> <int>
#>  1 Beyoncé     know    251
#>  2 Beyoncé     like    238
#>  3 Beyoncé     baby    228
#>  4 Beyoncé     love    226
#>  5 Beyoncé     see     223
#>  6 Beyoncé     just    213
#>  7 Beyoncé     can     206
#>  8 Beyoncé     cause   196
#>  9 Beyoncé     got     196
#> 10 Beyoncé     get     181
#> # … with 9,158 more rows

# Add a log_odds_weighted column to the per-artist word counts: the artist
# is the "set", the word is the "feature", and n is the count.
word_log_odds <- bind_log_odds(
  word_counts,
  set = artist_name,
  feature = word,
  n = n
)

# Inspect the rows with the largest weighted log odds first.
word_log_odds %>%
  arrange(desc(log_odds_weighted))
#> # A tibble: 9,168 x 4
#>    artist_name  word       n log_odds_weighted
#>    <chr>        <chr>  <int>             <dbl>
#>  1 Beyoncé      y'all     55              3.39
#>  2 Beyoncé      b         44              3.03
#>  3 Beyoncé      yo        43              3.00
#>  4 Taylor Swift like     101              2.93
#>  5 Taylor Swift said      46              2.92
#>  6 Taylor Swift never     72              2.91
#>  7 Taylor Swift just      92              2.83
#>  8 Taylor Swift back      70              2.69
#>  9 Taylor Swift burned     7              2.66
#> 10 Taylor Swift stupid     7              2.66
#> # … with 9,158 more rows

# Lollipop chart of each artist's top 10 words by weighted log odds, drawn
# as one facet per artist.
word_log_odds %>%
  group_by(artist_name) %>%
  slice_max(order_by = log_odds_weighted, n = 10) %>%
  ungroup() %>%
  # reorder_within() together with scale_y_reordered() orders the words
  # separately inside each facet.
  mutate(word = reorder_within(word, log_odds_weighted, artist_name)) %>%
  ggplot(
    mapping = aes(x = log_odds_weighted, y = word, colour = artist_name)
  ) +
  # Stem of the lollipop: from zero out to the word's score.
  geom_segment(
    mapping = aes(x = 0, xend = log_odds_weighted, yend = word),
    size = 1.8,
    alpha = 0.7
  ) +
  # Head of the lollipop.
  geom_point(size = 4, show.legend = FALSE) +
  facet_wrap(vars(artist_name), scales = "free_y") +
  scale_x_continuous(expand = c(0, 0.1)) +
  scale_y_reordered() +
  scale_colour_manual(values = c("#D55F8F", "#13A699")) +
  labs(
    title = "What words are most distinctive for Beyoncé and Taylor Swift?",
    subtitle = "For words counted once per song",
    x = "Weighted log odds (empirical Bayes)",
    y = NULL
  ) +
  theme_plex() +
  # Must come after theme_plex() so these overrides are not reset.
  theme(
    legend.position = "none",
    panel.grid.major = element_blank()
  )

Created on 2020-10-01 by the reprex package (v0.3.0.9001)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment