Building a web scraper with rvest, purrr, dplyr, and ggplot2
```{r}
#packages
library(rvest)      #html scraping
library(tidyverse)  #dplyr, purrr, ggplot2, etc.
library(magrittr)   #the %<>% assignment pipe
library(scales)     #axis label formatting
library(knitr)
library(lubridate)  #date handling
```
```{r}
#the constant part of the url
url <- "https://spotifycharts.com/regional/ca/daily/"
#the variable part: the dates, since each daily chart lives at its own date-suffixed url
timevalues <- seq(as.Date("2018/02/01"), as.Date("2018/02/28"), by = "day")
```
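Because `seq()` returns `Date` objects, which print in `YYYY-MM-DD` form, they can be pasted straight onto the base url. A quick, illustrative spot-check (assuming the daily chart urls end in a `YYYY-MM-DD` suffix):

```{r}
#illustrative check: the first and last dates we will append to the url
head(timevalues, 1)  #"2018-02-01"
tail(timevalues, 1)  #"2018-02-28"
```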
```{r}
#a function to combine the two; paste0 defaults to sep = "" so the date is appended directly to the url
unitedata <- function(x){
  full_url <- paste0(url, x)
  full_url
}
finalurl <- unitedata(timevalues)
finalurl
```
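As a small sanity check before scraping (my addition, not part of the original), we expect one url per day of February, each ending in its date:

```{r}
#illustrative check: 28 urls, one per day
length(finalurl)  #28
finalurl[1]       #"https://spotifycharts.com/regional/ca/daily/2018-02-01"
```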
```{r}
#design the scraper function to scour each page for the html nodes holding the data we want
#(CSS selectors found with SelectorGadget)
SpotifyScrape <- function(x){
  page <- read_html(x)   #read the page once, then pull each column out of it
  rank <- page %>% html_nodes('.chart-table-position') %>% html_text() %>% as.data.frame()
  track <- page %>% html_nodes('strong') %>% html_text() %>% as.data.frame()
  artist <- page %>% html_nodes('.chart-table-track span') %>% html_text() %>% as.data.frame()
  streams <- page %>% html_nodes('td.chart-table-streams') %>% html_text() %>% as.data.frame()
  dates <- page %>% html_nodes('.responsive-select~ .responsive-select+ .responsive-select .responsive-select-value') %>% html_text() %>% as.data.frame()
  #combine, name, and classify our variables
  chart <- cbind(rank, track, artist, streams, dates)
  names(chart) <- c("Rank", "Track", "Artist", "Streams", "Date")
  chart <- as_tibble(chart)
  return(chart)
}
```
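Before mapping over all 28 pages, it can help to try the scraper on a single url; a minimal sketch, assuming the site is reachable and the selectors above still match its markup:

```{r}
#illustrative test on the first day's chart only
test <- SpotifyScrape(finalurl[1])
glimpse(test)  #expect one row per charted track, with Rank, Track, Artist, Streams, Date
```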
```{r}
#run the scraper over all 28 urls; this takes a few minutes
spotify <- map_df(finalurl, SpotifyScrape)
```
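`map_df()` fires the 28 requests back to back; if you want to be gentler on the site, one option (an alternative I'm sketching here, not something the original does) is to pause briefly between requests:

```{r}
#alternative (illustrative): pause 2 seconds between requests to be polite to the server
spotify <- map_df(finalurl, function(x){
  Sys.sleep(2)
  SpotifyScrape(x)
})
```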
```{r}
#cleaning: drop the "by " prefix from Artist, strip commas from Streams and make it numeric, parse Date
spotify %<>% mutate(Artist = gsub("by ", "", Artist),
                    Streams = gsub(",", "", Streams),
                    Streams = as.numeric(Streams),
                    Date = as.Date(Date, "%m/%d/%Y"))
```
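A quick look at the cleaned tibble (an added check, not in the original) confirms the type conversions worked:

```{r}
#illustrative check: Streams should now be numeric and Date a proper Date
glimpse(spotify)
sum(is.na(spotify$Streams))  #rows where the stream count failed to parse
```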
```{r}
#top 25 artists by total streams for February
spotify %>%
  group_by(Artist) %>%
  summarise(Total = sum(Streams)) %>%
  arrange(desc(Total)) %>%
  top_n(25, Total) %>%
  ggplot() +
  geom_col(aes(x = reorder(Artist, Total), y = Total), fill = "forestgreen") +
  coord_flip() +
  scale_y_continuous(labels = unit_format(unit = "B", scale = 1e-9))
```
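Since the data is already tidy, the same dplyr + ggplot2 pattern extends to other views; for example (an illustrative extension of mine, not part of the original gist), total daily streams across the month:

```{r}
#illustrative extension: total Canadian streams per day in February 2018
spotify %>%
  group_by(Date) %>%
  summarise(Total = sum(Streams)) %>%
  ggplot(aes(x = Date, y = Total)) +
  geom_line(colour = "forestgreen") +
  scale_y_continuous(labels = unit_format(unit = "M", scale = 1e-6))
```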