Last active
August 29, 2015 14:28
-
-
Save tjmahr/6c558a95931170a833ba to your computer and use it in GitHub Desktop.
Getting Pitchfork's Eighties Also-Rans
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download all the 'See also: Artist: "Title"' entries from
# Pitchfork's Top 200 songs of the 80's
# http://pitchfork.com/features/staff-lists/9700-the-200-best-songs-of-the-1980s/
library("rvest") | |
library("stringr") | |
library("dplyr", warn.conflicts = FALSE) | |
library("curl") | |
# Scrape the "See also" recommendations from one page of the list.
#
# @param url URL, or path to a local HTML copy, of one page of the list.
# @return Character vector with one element per review paragraph that
#   contains "See also:", holding the text from "See also:" up to the first
#   line break after it.
scrape_see_also_nodes <- function(url) {
  # read_html() replaces rvest's html(), which was deprecated and has since
  # been removed from the package.
  paragraphs <- read_html(url) %>%
    html_nodes(".review-content p") %>%
    html_text()

  # Keep only the paragraphs that mention a see-also, then drop the review
  # text that precedes it.
  paragraphs %>%
    str_subset("See also:") %>%
    str_extract("See also:.*")
}
# Parse see-also strings into a table of artists and tracks.
#
# Each string is expected to look like
#   See also: Artist: "Title" / Artist: "Title"
# so one string may contribute several rows.
#
# @param paragraphs Character vector of see-also strings.
# @return A tibble with character columns Artist and Track, one row per
#   artist/title pair.
get_tracks_from_text <- function(paragraphs) {
  entries <- paragraphs %>%
    str_replace_all("See also:", "") %>%
    str_replace_all("\\\"", "") %>%
    # Split "x / y" into c("x", "y"). Only a slash with whitespace next to it
    # counts as a separator, so a name with a bare slash (e.g. "AC/DC") is
    # kept in one piece.
    str_split("\\s+/|/\\s+") %>%
    unlist() %>%
    # Clean whitespace
    str_trim()

  # Everything before the first ": " is the artist; the rest is the track.
  # Building the tibble straight from the matrix columns keeps both columns
  # character on every R version (as.data.frame() made factors before R 4.0).
  parts <- str_split_fixed(entries, ": ", 2)
  tibble(Artist = parts[, 1], Track = parts[, 2])
}
# Scrape the ranked entries themselves from one page of the list.
#
# @param url URL, or path to a local HTML copy, of one page of the list.
# @return A tibble with one row per entry and columns Rank (numeric),
#   Artist, Track (character), Year (numeric) and Label (character).
scrape_main_entries <- function(url) {
  # read_html() replaces rvest's html(), which was deprecated and has since
  # been removed from the package.
  page <- read_html(url)

  # Artist (h1), title (h2) and release info (h3) live in the entry headings
  headings <- page %>% html_nodes(".year-end-review .title")

  artists <- headings %>%
    html_nodes("h1") %>%
    html_text()

  titles <- headings %>%
    html_nodes("h2") %>%
    html_text() %>%
    str_trim() %>%
    # Titles are wrapped in smart quotes: drop the first and last character.
    # NOTE(review): this is unconditional, so a title without surrounding
    # quotes would lose real characters -- confirm against the page markup.
    str_replace_all("^.", "") %>%
    str_replace_all(".$", "")

  # H3 holds "Label; Year": column 1 is the label, column 2 the year
  release <- headings %>%
    html_nodes("h3") %>%
    html_text() %>%
    str_split_fixed(";", 2)

  # Get ranking from the blurb
  ranks <- page %>%
    html_nodes(".review-content .rank") %>%
    html_text()

  tibble(
    Rank = as.numeric(ranks),
    Artist = artists,
    Track = titles,
    Year = as.numeric(str_trim(release[, 2])),
    Label = str_trim(release[, 1])
  )
}
# Listicle is 10 pages long
page_numbers <- 1:10
urls <- sprintf(
  "http://pitchfork.com/features/staff-lists/9700-the-200-best-songs-of-the-1980s/%s/",
  page_numbers)

# Download each page so we can work from local copy.
local_dir <- file.path(getwd(), "pitchfork_80s")
local_copies <- file.path(local_dir, paste0(page_numbers, ".html"))

# The directory is left over from earlier runs; don't warn when it exists.
dir.create(local_dir, showWarnings = FALSE)

# One download per page: urls[i] is saved to local_copies[i]
curl_results <- Map(curl_download, urls, local_copies)
# Collect the see-also strings from every saved page, then parse them into
# one artist/track table.
see_also_text <- lapply(local_copies, scrape_see_also_nodes)
see_alsos <- get_tracks_from_text(unlist(see_also_text))

# Show the also-rans alphabetically
arrange(see_alsos, Artist, Track)
#> Source: local data frame [328 x 2]
#>
#> Artist Track
#> 1 A Number of Names Shari Vari
#> 2 ABC Be Near Me
#> 3 Adolescents Amoeba
#> 4 Adonis No Way Back
#> 5 Alexander O'Neal What's Missing
#> 6 Alexander Robotnick Problèmes d'Amour
#> 7 Altered Images Happy Birthday
#> 8 Anita Baker Sweet Love
#> 9 Anne Clark Our Darkness
#> 10 Apollonia 6 Sex Shooter
#> .. ... ...
# Scrape the ranked entries from each saved page, stack the per-page tables
# and order the result by rank.
entries_by_page <- lapply(local_copies, scrape_main_entries)
main_entries <- arrange(bind_rows(entries_by_page), Rank)
main_entries
#> Source: local data frame [200 x 5]
#>
#> Rank Artist
#> 1 1 Prince and the Revolution
#> 2 2 Michael Jackson
#> 3 3 N.W.A
#> 4 4 New Order
#> 5 5 Public Enemy
#> 6 6 Kate Bush
#> 7 7 Joy Division
#> 8 8 Talking Heads
#> 9 9 Grandmaster Flash and the Furious Five
#> 10 10 The Smiths
#> .. ... ...
#> Variables not shown: Track (chr), Year (dbl), Label (chr)
# I guess "Come On Eileen" is not on Apple Music.
# Look for any ranked entry whose track title contains "Eileen".
main_entries %>%
  filter(str_detect(Track, "Eileen"))
#> Source: local data frame [0 x 5]
#>
#> Variables not shown: Rank (dbl), Artist (chr), Track (chr), Year (dbl),
#> Label (chr)
# Just for fun | |
library("ggplot2") | |
# Number of ranked entries released in each year
tracks_per_year <- main_entries %>% count(Year)

# Bar chart of entries per year; Year is treated as a discrete axis
ggplot(tracks_per_year, aes(x = factor(Year), y = n)) +
  geom_bar(stat = "identity") +
  labs(
    x = "Year",
    y = "Entries from year",
    title = "1988 was peak 80's"
  )
Author
tjmahr
commented
Aug 26, 2015
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment