Created
August 12, 2016 14:46
-
-
Save JimGrange/93a3a9f44958152fd4f5e2016ef9c94a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#------------------------------------------------------------------------------ | |
### set up | |
# clear workspace | |
rm(list = ls()) | |
# set working directory | |
setwd("D:/Work/Blog_YouTube code/Blog/Olympic Medals") | |
# load relevant packages | |
library(rvest) | |
library(stringr) | |
library(dplyr) | |
library(ggplot2) | |
# suppress warnings | |
options(warn = -1) | |
#------------------------------------------------------------------------------ | |
#------------------------------------------------------------------------------ | |
### get a list of all of the host nations | |
# set the url and extract html elements | |
host_url <- "http://www.topendsports.com/events/summer/hosts/list.htm" | |
temp <- host_url %>% | |
html %>% | |
html_nodes("table") | |
# extract the relevant table | |
hosts <- data.frame(html_table(temp[1])) | |
# remove the years that the Olympics were not held | |
hosts <- hosts[!grepl("not held", hosts$Host.City..Country), ] | |
# remove the cities from the host column | |
countries <- hosts$Host.City..Country | |
countries <- gsub(".*,", "", countries) | |
hosts$Host.City..Country <- countries | |
# remove the Olympics that are ongoing (or are yet to occur) and generally | |
# tidy the table up. Also, only select post-1948 games. | |
hosts <- hosts %>% | |
select(-Olympiad) %>% | |
select(year = Year, host = Host.City..Country) %>% | |
filter(year < 2016 & year > 1948) | |
# remove white space from the names | |
hosts$host <- gsub(" ", "", hosts$host, fixed = TRUE) | |
# change host England to Great Britain. | |
# change SouthKorea to South Korea | |
# change USSR to Russia | |
hosts$host <- gsub("England", "Great Britain", hosts$host, fixed = TRUE) | |
hosts$host <- gsub("SouthKorea", "South Korea", hosts$host, fixed = TRUE) | |
hosts$host <- gsub("USSR", "Russia", hosts$host, fixed = TRUE) | |
#------------------------------------------------------------------------------ | |
#------------------------------------------------------------------------------ | |
### get the medal tables for each year and store them in one list | |
# get a vector of all years | |
years <- hosts$year | |
# create a list to store the medal tables | |
medal_tables <- list() | |
# loop over each year and retrieve the data from Wikipedia | |
for(i in 1:length(years)){ | |
# what is the current year? | |
curr_year <- years[i] | |
# construct the relevant URL to the Wikipedia page | |
url <- paste("https://en.wikipedia.org/wiki/", curr_year, | |
"_Summer_Olympics_medal_table", sep = "") | |
# retrieve the data from this page | |
temp <- url %>% | |
html %>% | |
html_nodes("table") | |
# find the html table's position. The medal table is in a "sortable" Wiki | |
# table, so we search for this term and return its position in the list | |
position <- grep("sortable", temp) | |
# get the medal table. Add a new column storing the year | |
medals <- data.frame(html_table(temp[position], fill = TRUE)) | |
medals <- medals %>% | |
mutate(Year = curr_year) | |
# change the names of the "Nation" column, as this is not consistent between | |
# games tables | |
colnames(medals)[2] <- "Nation" | |
# remove the weird symbols from the html file (Â) | |
nations <- medals$Nation | |
nations <- gsub("[^\\x{00}-\\x{7f}]", "", nations, perl = TRUE) | |
# we need to change "Soviet Union" to USSR for consistency | |
nations <- gsub("Soviet Union(URS)", "Russia(RUS)", nations, fixed = TRUE) | |
# also change West & East Germany to "Germany" | |
nations <- gsub("East Germany(GDR)", "Germany(GER)", nations, fixed = TRUE) | |
nations <- gsub("West Germany(FRG)", "Germany(GER)", nations, fixed = TRUE) | |
medals$Nation <- nations | |
# save the medal table and move to the next games | |
medal_tables[[i]] <- medals | |
} | |
#------------------------------------------------------------------------------ | |
#------------------------------------------------------------------------------ | |
### loop over each host, then find how many medals they won in each games and | |
### store it in data frame | |
# initialise the data frame | |
final_data <- data.frame(hosts) | |
final_data[, as.character(years)] <- 0 | |
for(i in 1:length(hosts$host)){ | |
# get the current host | |
curr_host <- hosts$host[i] | |
# loop over all years, find the number of medals won by the current host, | |
# and store it in final_data frame | |
for(j in 1:length(years)){ | |
# what is the current year? | |
curr_year <- years[j] | |
# get the medal table for the current year | |
curr_medals <- medal_tables[[j]] | |
# get the row for the current host if it is present | |
curr_medals <- curr_medals %>% | |
filter(str_detect(Nation, curr_host)) | |
# collate the number of medals won if there is data | |
if(nrow(curr_medals) > 0){ | |
final_data[i, j + 2] <- sum(curr_medals$Total) | |
} else | |
final_data[i, j + 2] <- 0 | |
} # end of each year loop | |
} # end of each host loop | |
#------------------------------------------------------------------------------ | |
#------------------------------------------------------------------------------ | |
### now do some plotting | |
pdf("medals.pdf", width = 12, height = 12) | |
# change the layout of the plotting window | |
par(mfrow = c(4, 4)) | |
# loop over each hosting nation | |
for(i in 1:nrow(final_data)){ | |
# get the current host's data for all years | |
host_data <- as.numeric(final_data[i, 3:ncol(final_data)]) | |
# what is their mean number of medals won? | |
host_mean <- mean(host_data) | |
# plot the data! | |
plot(years, host_data, xlab = "Year", ylab = "Number of Medals", pch = 19, | |
type = "b", lwd = 2, | |
main = paste(hosts$host[i], "–", years[i], sep = "")) | |
abline(v = final_data$year[i], lty = "dashed", col = "blue", lwd = 1.5) | |
abline(h = host_mean, lty = "dashed", col = "red", lwd = 1.5) | |
} | |
#------------------------------------------------------------------------------ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment