Skip to content

Instantly share code, notes, and snippets.

@CSJCampbell
Created August 16, 2017 15:11
Show Gist options
  • Save CSJCampbell/e88eb99aa63fbdf87765217979938706 to your computer and use it in GitHub Desktop.
Save CSJCampbell/e88eb99aa63fbdf87765217979938706 to your computer and use it in GitHub Desktop.
# @title A Whistle-Stop Tour of the Tidyverse
# @author agott <[email protected]>
library(tidyverse)
# tidy manifesto
## Reuse existing data structures
## Compose simple functions with the pipe
women %>% plot
## Embrace functional programming
## Design for humans
# data analysis workflow
## import
## tidy
## transform
## visualise
## model
## communicate
# Get Olympics Data with rvest
# Funding of Olympic sports
# UK Sport World Class Performance Programme
# Data from Sydney (2000) to Rio de Janeiro (2016)
library(rvest)
library(stringr)
funding <- read_html("http://www.uksport.gov.uk/our-work/investing-in-sport/historical-funding-figures")
catagories <- html_nodes(funding, "strong span") %>%
html_text() %>%
str_subset(pattern = "[aA-zZ]")
funding <- html_table(funding, header = TRUE)
funding <- magrittr::set_names(funding, catagories)
write_rds(funding, "OlympicFunding.rds")
write_csv(funding[[1]], "SummerOlympicsFunding.csv")
write_excel_csv(funding[[1]], "SummerOlympicsFunding.csv")
write_csv(funding[[3]], "WinterOlympicsFunding.csv")
write_excel_csv(funding[[3]], "WinterOlympicsFunding.csv")
# Import the data
summer <- read_csv("SummerOlympicsFunding.csv",
na = c("n/a", "n/a**"),
col_types = cols(.default = col_number(),
Sports = col_character()))
summer <- read_csv("https://gist.githubusercontent.com/CSJCampbell/d4e6edef3596206c48b208696617a93a/raw/a41ea995c152126b07277e30a691390e235d1180/SummerOlympicsFunding.csv",
na = c("n/a", "n/a**"),
col_types = cols(
.default = col_number(),
Sport = col_character())
)
View(summer)
# Create a tibble for location to year map
years <- tribble(~Location, ~Year, ~Month, ~Day,
"Sydney", 2000, 9, 15,
"Athens", 2004, 8, 13,
"Beijing", 2008, 8, 8,
"London", 2012, 7, 27,
"Rio de Janeiro", 2016, 8, 5)
years
# Tidy the summer data
summer <- gather(summer,
key = Location,
value = Funding,
-Sport) %>%
replace_na(replace = list(Funding = 0))
View(summer)
# Join the year data
summer <- full_join(summer, years) %>%
filter(Sport != "Total")
noFunding <- filter(summer, Funding == 0)
View(noFunding)
library(forcats)
numberNoFund <- noFunding %>%
count(Location) %>%
left_join(years) %>%
mutate(Location = fct_reorder(Location, Year))
ggplot(data = numberNoFund, aes(x = Location, y = n)) +
geom_bar(stat = "identity") +
labs(title = "Changes in Number of Sports Not Provided UK Sport Funding",
subtitle = "Funding Provided by UK Sport World Class Performance Programme",
x = "", y = "",
caption = "Data taken from uksport.gov.uk") +
scale_x_discrete(limits = magrittr::use_series(years, Location),
labels = str_c(magrittr::use_series(years, Location),
"\n(", magrittr::use_series(years, Year), ")")) +
theme_classic(base_size = 14, base_family = "sans")
library(lubridate)
summer <- mutate(summer, Date = str_c(Year, Month, Day, sep = "-"),
Date = ymd(Date))
topSports <- summer %>%
group_by(Sport) %>%
arrange(-Funding) %>%
slice(1) %>%
arrange(-Funding) %>%
ungroup() %>%
slice(1:8) %>%
rename(Spt = Sport)
fundingTime <- summer %>%
filter(Sport %in% magrittr::use_series(topSports, Spt))
ggplot(data = fundingTime, aes(x = Date, y = Funding/1e6,
group = Sport,
colour = fct_reorder2(Sport, Date, Funding))) +
geom_line() +
geom_point(size = 2) +
theme_classic(base_size = 14, base_family = "sans") +
theme(legend.position = "right") +
scale_colour_brewer("Sport", palette = "Dark2") +
scale_x_date(date_labels = "%Y",
breaks = ymd(str_c(magrittr::use_series(years, Year), "-01-01"))) +
labs(title = "Overall Increasing Funding of Top Sports",
x = "", y = "Funding (Millions)",
caption = "Data taken from uksport.gov.uk") +
scale_y_continuous(labels = scales::dollar_format("£"))
# Model the Data
fundingModel <- lm(Funding ~ Sport*Year, data = fundingTime)
library(modelr)
modelGrid <- data_grid(fundingTime, Year, Sport)
modelGrid <- modelGrid %>% add_predictions(fundingModel)
ggplot(data = modelGrid, aes(x = Year, y = pred/1e6,
group = Sport,
colour = Sport)) +
geom_line() +
geom_point(data = fundingTime, aes(y = Funding/1e6)) +
labs(title = "Fitted Model for Top Funded Sports",
x = "", y = "Funding (Millions)",
caption = "Data taken from uksport.gov.uk") +
scale_y_continuous(labels = scales::dollar_format("£")) +
scale_x_continuous(breaks = magrittr::use_series(years, Year)) +
theme_classic(base_size = 14, base_family = "sans") +
scale_colour_brewer("Sport", palette = "Dark2")
fundingResid <- fundingTime %>%
add_residuals(fundingModel)
ggplot(data = fundingResid, aes(x = Year, y = resid,
colour = fct_reorder2(Sport, Year, Funding))) +
geom_point() +
theme_classic(base_size = 14, base_family = "sans") +
scale_x_continuous(breaks = magrittr::use_series(years, Year)) +
geom_hline(yintercept = 0, colour = "grey60", linetype = 2) +
labs(title = "Residual Values Suggests Further Fitting Required",
y = "Residual Values", x = "",
caption = "Data taken from uksport.gov.uk") +
scale_colour_brewer("Sport", palette = "Dark2")
# Multiple models with broom
library(broom)
tidy(fundingModel)
glance(fundingModel)
sportData <- fundingTime %>%
group_by(Sport) %>%
nest()
sportsModels <- map(sportData$data, ~lm(Funding ~ Year, data = .))
sportResid <- map2_df(sportData$data, sportsModels, add_residuals, .id = "Sport")
ggplot(data = sportResid,
aes(Year, resid, group = Sport,
colour = Sport)) +
geom_point() +
theme_classic(base_size = 14, base_family = "sans") +
scale_x_continuous(breaks = magrittr::use_series(years, Year)) +
geom_hline(yintercept = 0, colour = "grey60", linetype = 2) +
labs(title = "Residual Values Suggests Further Fitting Required",
y = "Residual Values", x = "",
caption = "Data taken from uksport.gov.uk") +
scale_colour_brewer("Sport", palette = "Dark2",
labels = magrittr::use_series(sportData, Sport))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment