Created
August 16, 2017 15:11
-
-
Save CSJCampbell/e88eb99aa63fbdf87765217979938706 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# @title A Whistle-Stop Tour of the Tidyverse | |
# @author agott <[email protected]> | |
library(tidyverse) | |
# tidy manifesto | |
## Reuse existing data structures | |
## Compose simple functions with the pipe | |
women %>% plot | |
## Embrace functional programming | |
## Design for humans | |
# data analysis workflow | |
## import | |
## tidy | |
## transform | |
## visualise | |
## model | |
## communicate | |
# Get Olympics Data with rvest | |
# Funding of Olympic sports | |
# UK Sport World Class Performance Programme | |
# Data from Sydney (2000) to Rio de Janeiro (2016) | |
library(rvest) | |
library(stringr) | |
funding <- read_html("http://www.uksport.gov.uk/our-work/investing-in-sport/historical-funding-figures") | |
catagories <- html_nodes(funding, "strong span") %>% | |
html_text() %>% | |
str_subset(pattern = "[aA-zZ]") | |
funding <- html_table(funding, header = TRUE) | |
funding <- magrittr::set_names(funding, catagories) | |
write_rds(funding, "OlympicFunding.rds") | |
write_csv(funding[[1]], "SummerOlympicsFunding.csv") | |
write_excel_csv(funding[[1]], "SummerOlympicsFunding.csv") | |
write_csv(funding[[3]], "WinterOlympicsFunding.csv") | |
write_excel_csv(funding[[3]], "WinterOlympicsFunding.csv") | |
# Import the data | |
summer <- read_csv("SummerOlympicsFunding.csv", | |
na = c("n/a", "n/a**"), | |
col_types = cols(.default = col_number(), | |
Sports = col_character())) | |
summer <- read_csv("https://gist.githubusercontent.com/CSJCampbell/d4e6edef3596206c48b208696617a93a/raw/a41ea995c152126b07277e30a691390e235d1180/SummerOlympicsFunding.csv", | |
na = c("n/a", "n/a**"), | |
col_types = cols( | |
.default = col_number(), | |
Sport = col_character()) | |
) | |
View(summer) | |
# Create a tibble for location to year map | |
years <- tribble(~Location, ~Year, ~Month, ~Day, | |
"Sydney", 2000, 9, 15, | |
"Athens", 2004, 8, 13, | |
"Beijing", 2008, 8, 8, | |
"London", 2012, 7, 27, | |
"Rio de Janeiro", 2016, 8, 5) | |
years | |
# Tidy the summer data | |
summer <- gather(summer, | |
key = Location, | |
value = Funding, | |
-Sport) %>% | |
replace_na(replace = list(Funding = 0)) | |
View(summer) | |
# Join the year data | |
summer <- full_join(summer, years) %>% | |
filter(Sport != "Total") | |
noFunding <- filter(summer, Funding == 0) | |
View(noFunding) | |
library(forcats) | |
numberNoFund <- noFunding %>% | |
count(Location) %>% | |
left_join(years) %>% | |
mutate(Location = fct_reorder(Location, Year)) | |
ggplot(data = numberNoFund, aes(x = Location, y = n)) + | |
geom_bar(stat = "identity") + | |
labs(title = "Changes in Number of Sports Not Provided UK Sport Funding", | |
subtitle = "Funding Provided by UK Sport World Class Performance Programme", | |
x = "", y = "", | |
caption = "Data taken from uksport.gov.uk") + | |
scale_x_discrete(limits = magrittr::use_series(years, Location), | |
labels = str_c(magrittr::use_series(years, Location), | |
"\n(", magrittr::use_series(years, Year), ")")) + | |
theme_classic(base_size = 14, base_family = "sans") | |
library(lubridate) | |
summer <- mutate(summer, Date = str_c(Year, Month, Day, sep = "-"), | |
Date = ymd(Date)) | |
topSports <- summer %>% | |
group_by(Sport) %>% | |
arrange(-Funding) %>% | |
slice(1) %>% | |
arrange(-Funding) %>% | |
ungroup() %>% | |
slice(1:8) %>% | |
rename(Spt = Sport) | |
fundingTime <- summer %>% | |
filter(Sport %in% magrittr::use_series(topSports, Spt)) | |
ggplot(data = fundingTime, aes(x = Date, y = Funding/1e6, | |
group = Sport, | |
colour = fct_reorder2(Sport, Date, Funding))) + | |
geom_line() + | |
geom_point(size = 2) + | |
theme_classic(base_size = 14, base_family = "sans") + | |
theme(legend.position = "right") + | |
scale_colour_brewer("Sport", palette = "Dark2") + | |
scale_x_date(date_labels = "%Y", | |
breaks = ymd(str_c(magrittr::use_series(years, Year), "-01-01"))) + | |
labs(title = "Overall Increasing Funding of Top Sports", | |
x = "", y = "Funding (Millions)", | |
caption = "Data taken from uksport.gov.uk") + | |
scale_y_continuous(labels = scales::dollar_format("£")) | |
# Model the Data | |
fundingModel <- lm(Funding ~ Sport*Year, data = fundingTime) | |
library(modelr) | |
modelGrid <- data_grid(fundingTime, Year, Sport) | |
modelGrid <- modelGrid %>% add_predictions(fundingModel) | |
ggplot(data = modelGrid, aes(x = Year, y = pred/1e6, | |
group = Sport, | |
colour = Sport)) + | |
geom_line() + | |
geom_point(data = fundingTime, aes(y = Funding/1e6)) + | |
labs(title = "Fitted Model for Top Funded Sports", | |
x = "", y = "Funding (Millions)", | |
caption = "Data taken from uksport.gov.uk") + | |
scale_y_continuous(labels = scales::dollar_format("£")) + | |
scale_x_continuous(breaks = magrittr::use_series(years, Year)) + | |
theme_classic(base_size = 14, base_family = "sans") + | |
scale_colour_brewer("Sport", palette = "Dark2") | |
fundingResid <- fundingTime %>% | |
add_residuals(fundingModel) | |
ggplot(data = fundingResid, aes(x = Year, y = resid, | |
colour = fct_reorder2(Sport, Year, Funding))) + | |
geom_point() + | |
theme_classic(base_size = 14, base_family = "sans") + | |
scale_x_continuous(breaks = magrittr::use_series(years, Year)) + | |
geom_hline(yintercept = 0, colour = "grey60", linetype = 2) + | |
labs(title = "Residual Values Suggests Further Fitting Required", | |
y = "Residual Values", x = "", | |
caption = "Data taken from uksport.gov.uk") + | |
scale_colour_brewer("Sport", palette = "Dark2") | |
# Multiple models with broom | |
library(broom) | |
tidy(fundingModel) | |
glance(fundingModel) | |
sportData <- fundingTime %>% | |
group_by(Sport) %>% | |
nest() | |
sportsModels <- map(sportData$data, ~lm(Funding ~ Year, data = .)) | |
sportResid <- map2_df(sportData$data, sportsModels, add_residuals, .id = "Sport") | |
ggplot(data = sportResid, | |
aes(Year, resid, group = Sport, | |
colour = Sport)) + | |
geom_point() + | |
theme_classic(base_size = 14, base_family = "sans") + | |
scale_x_continuous(breaks = magrittr::use_series(years, Year)) + | |
geom_hline(yintercept = 0, colour = "grey60", linetype = 2) + | |
labs(title = "Residual Values Suggests Further Fitting Required", | |
y = "Residual Values", x = "", | |
caption = "Data taken from uksport.gov.uk") + | |
scale_colour_brewer("Sport", palette = "Dark2", | |
labels = magrittr::use_series(sportData, Sport)) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment