Skip to content

Instantly share code, notes, and snippets.

@tjmahr
Last active August 29, 2015 14:28
Show Gist options
  • Save tjmahr/6c558a95931170a833ba to your computer and use it in GitHub Desktop.
Save tjmahr/6c558a95931170a833ba to your computer and use it in GitHub Desktop.
Getting Pitchfork's Eighties Also-Rans
# Download all the 'See also: Artist: "Title"' entries from
# Pitchfork's Top 200 songs of the 80's
# http://pitchfork.com/features/staff-lists/9700-the-200-best-songs-of-the-1980s/
library("rvest")
library("stringr")
library("dplyr", warn.conflicts = FALSE)
library("curl")
# Given the url (or local file path) of one page of the list, return the
# "See also: ..." text found in that page's review paragraphs.
#
# url: location of the page; it is passed to read_html() unchanged.
# Returns a character vector with one element per paragraph that contains
# "See also:", holding the text from "See also:" to the end of that line.
scrape_see_also_nodes <- function(url) {
  # read_html() replaces rvest's html(), which has been deprecated since
  # rvest 0.3.0 and is no longer available in current releases.
  read_html(url) %>%
    # Paragraphs inside the review blurbs
    html_nodes(".review-content p") %>%
    html_text() %>%
    # Keep only the paragraphs that mention a see-also ...
    str_subset("See also:") %>%
    # ... and drop whatever text precedes it
    str_extract("See also:.*")
}
# Turn see-also strings into an Artist/Track table. A single string may list
# several "Artist: Title" pairs separated by slashes; each pair becomes a row.
#
# paragraphs: character vector of "See also: ..." strings.
# Returns a data_frame with the columns Artist and Track.
get_tracks_from_text <- function(paragraphs) {
  # Strip the "See also:" label and the double quotes around the titles
  without_label <- str_replace_all(paragraphs, "See also:", "")
  unquoted <- str_replace_all(without_label, "\\\"", "")

  # Split at every slash, flatten to one entry per track, tidy the whitespace
  entries <- str_trim(unlist(str_split(unquoted, "[/]")))

  # Cut each entry at its first ": " into two pieces (artist, title), then
  # convert the resulting matrix: matrix -> data.frame -> data_frame
  pieces <- str_split_fixed(entries, ": ", 2)
  track_table <- as_data_frame(as.data.frame(pieces))

  rename(track_table, Artist = V1, Track = V2)
}
# Might as well grab the actual rankings too...
#
# Given the url (or local file path) of one page of the list, return a
# data_frame with the columns Rank, Artist, Track, Year and Label, built from
# the page's entry headings and rank markers.
scrape_main_entries <- function(url) {
  # read_html() replaces rvest's html(), which has been deprecated since
  # rvest 0.3.0 and is no longer available in current releases.
  page <- read_html(url)

  # Extract artist/title from headings
  headings <- page %>% html_nodes(".year-end-review .title")
  artists <- headings %>%
    html_nodes("h1") %>%
    html_text()
  titles <- headings %>%
    html_nodes("h2") %>%
    html_text() %>%
    # Remove whitespace, then the first and last character (the smart quotes
    # wrapped around each title)
    str_trim() %>%
    str_replace_all("^.", "") %>%
    str_replace_all(".$", "")

  # H3 holds "Label; Year". After lapply() this is a plain list with the
  # whitespace-trimmed character vectors Label and Year.
  release <- headings %>%
    html_nodes("h3") %>%
    html_text() %>%
    str_split_fixed(";", 2) %>%
    as.data.frame() %>%
    rename(Label = V1, Year = V2) %>%
    lapply(str_trim)

  # Get ranking from the blurb
  ranks <- page %>%
    html_nodes(".review-content .rank") %>%
    html_text()

  data_frame(
    Rank = as.numeric(ranks),
    Artist = artists,
    Track = titles,
    Year = as.numeric(release$Year),
    Label = release$Label)
}
# Listicle is 10 pages long. The page numbers are defined once so the remote
# urls and the local file names cannot drift apart.
page_numbers <- 1:10
urls <- sprintf(
  "http://pitchfork.com/features/staff-lists/9700-the-200-best-songs-of-the-1980s/%s/",
  page_numbers)
# Download each page so we can work from local copy.
local_dir <- file.path(getwd(), "pitchfork_80s")
local_copies <- file.path(local_dir, paste0(page_numbers, ".html"))
# dir.create() warns when the directory already exists, so only create it
# when it is missing; this keeps re-runs of the script quiet.
if (!dir.exists(local_dir)) {
  dir.create(local_dir)
}
# Fetch every page into its local file (url i -> local copy i)
curl_results <- Map(curl_download, urls, local_copies)
# Extract items: collect the see-also strings from every saved page, then
# parse them into one Artist/Track table
see_alsos <- get_tracks_from_text(
  unlist(lapply(local_copies, scrape_see_also_nodes))
)
# Show the table ordered by artist, then track
arrange(see_alsos, Artist, Track)
#> Source: local data frame [328 x 2]
#>
#> Artist Track
#> 1 A Number of Names Shari Vari
#> 2 ABC Be Near Me
#> 3 Adolescents Amoeba
#> 4 Adonis No Way Back
#> 5 Alexander O'Neal What's Missing
#> 6 Alexander Robotnick Problèmes d'Amour
#> 7 Altered Images Happy Birthday
#> 8 Anita Baker Sweet Love
#> 9 Anne Clark Our Darkness
#> 10 Apollonia 6 Sex Shooter
#> .. ... ...
# Get the main list as well: scrape each saved page, stack the per-page
# tables into one and order it by rank
main_entries <- arrange(
  bind_rows(lapply(local_copies, scrape_main_entries)),
  Rank
)
main_entries
#> Source: local data frame [200 x 5]
#>
#> Rank Artist
#> 1 1 Prince and the Revolution
#> 2 2 Michael Jackson
#> 3 3 N.W.A
#> 4 4 New Order
#> 5 5 Public Enemy
#> 6 6 Kate Bush
#> 7 7 Joy Division
#> 8 8 Talking Heads
#> 9 9 Grandmaster Flash and the Furious Five
#> 10 10 The Smiths
#> .. ... ...
#> Variables not shown: Track (chr), Year (dbl), Label (chr)
# I guess "Come On Eileen" is not on Apple Music.
# Keep only the main-list rows whose track title contains "Eileen"
main_entries %>%
  filter(str_detect(Track, "Eileen"))
#> Source: local data frame [0 x 5]
#>
#> Variables not shown: Rank (dbl), Artist (chr), Track (chr), Year (dbl),
#> Label (chr)
# Just for fun
library("ggplot2")
# Number of main-list entries per release year (count() stores it in column n)
tracks_per_year <- main_entries %>% count(Year)
# Bar chart of those counts, one bar per year
ggplot(tracks_per_year, aes(x = factor(Year), y = n)) +
  geom_bar(stat = "identity") +
  labs(
    x = "Year",
    y = "Entries from year",
    title = "1988 was peak 80's"
  )
@tjmahr
Copy link
Author

tjmahr commented Aug 26, 2015

tracks_per_year

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment