Created
July 26, 2019 16:15
-
-
Save MattSandy/77e9780b650ba8da145d07d6fa899874 to your computer and use it in GitHub Desktop.
Find the cumulative number of albums an artist has released and predict future counts.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://en.wikipedia.org/wiki/The_Mountain_Goats_discography | |
# Scraping | |
library("rvest") | |
library("ggplot2") | |
library("ggthemes") | |
library("stringr") | |
library("tidyverse") | |
# Download the discography page and collect every <table> node on it.
tbls <- "https://en.wikipedia.org/wiki/The_Mountain_Goats_discography" %>%
  read_html() %>%
  html_nodes("table")
# Shows all tables
tbls
# Locate the tables that mention the first studio album. The search can match
# more than one table, so it is computed once and the first hit is used below.
album_tbl_idx <- grep("Zopilote Machine", tbls, ignore.case = TRUE)
if (length(album_tbl_idx) == 0) {
  stop(
    "No table mentioning \"Zopilote Machine\" was found; ",
    "the page layout may have changed.",
    call. = FALSE
  )
}
tbls[album_tbl_idx]
# Get those albums
df <- html_table(tbls[album_tbl_idx[1]], fill = TRUE)[[1]]
# Take the first run of four digits in each details cell as the year.
# str_extract() yields exactly one value per row (NA when nothing matches),
# whereas str_extract_all() yields a list that as.numeric() cannot coerce when
# a cell holds zero or several matches.
df$Year <- as.numeric(str_extract(df$`Album details`, "[0-9]{4}"))
# Drop first row because it is a duplicate of the header (wiki table thing)
df <- df[-1, c("Title", "Album details", "Year")]
# Rows without a parseable year cannot be placed on the timeline; drop them
# loudly so min()/max() over Year stay well defined downstream.
if (anyNA(df$Year)) {
  warning(
    sum(is.na(df$Year)), " row(s) had no four-digit year and were dropped.",
    call. = FALSE
  )
  df <- df[!is.na(df$Year), ]
}
# Cumulative release count: one row per calendar year from the first to the
# last release, with Albums = number of albums released in or before that year.
df_plot <- data.frame(Year = min(df$Year):max(df$Year))
df_plot$Albums <- vapply(
  df_plot$Year,
  function(cutoff) sum(df$Year <= cutoff),
  integer(1)
)
# Bar chart of the cumulative album count with a linear trend line that is
# extrapolated out to `forecast_year`.
forecast_year <- 2040
ggplot(df_plot, aes(x = Year, y = Albums)) +
  geom_col() +
  theme_fivethirtyeight() +
  # Breaks and limits live in ONE x scale. Adding xlim() after
  # scale_x_continuous() replaces the earlier scale, which silently discarded
  # the 5-year breaks and printed a "Scale for x is already present" message.
  scale_x_continuous(
    breaks = seq(min(df_plot$Year), forecast_year, 5),
    limits = c(min(df_plot$Year) - 1, forecast_year)
  ) +
  # fullrange = TRUE extends the fitted line across the whole x scale, i.e.
  # past the last observed year.
  stat_smooth(method = "lm", fullrange = TRUE) +
  # Give back those axis that theme_fivethirtyeight removed
  theme(axis.title = element_text()) +
  labs(
    title = "Mountain Goats Albums by Year",
    subtitle = "Studio Album Releases from https://en.wikipedia.org/wiki/The_Mountain_Goats_discography",
    caption = "\n@appupio",
    x = "Year",
    y = "Number of Albums\n"
  )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment