Skip to content

Instantly share code, notes, and snippets.

@JimGrange
Created August 12, 2016 14:46
Show Gist options
  • Save JimGrange/93a3a9f44958152fd4f5e2016ef9c94a to your computer and use it in GitHub Desktop.
Save JimGrange/93a3a9f44958152fd4f5e2016ef9c94a to your computer and use it in GitHub Desktop.
#------------------------------------------------------------------------------
### set up
# clear workspace
rm(list = ls())
# set working directory
setwd("D:/Work/Blog_YouTube code/Blog/Olympic Medals")
# load relevant packages
library(rvest)
library(stringr)
library(dplyr)
library(ggplot2)
# suppress warnings
options(warn = -1)
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
### get a list of all of the host nations
# set the url and extract html elements
host_url <- "http://www.topendsports.com/events/summer/hosts/list.htm"
temp <- host_url %>%
html %>%
html_nodes("table")
# extract the relevant table
hosts <- data.frame(html_table(temp[1]))
# remove the years that the Olympics were not held
hosts <- hosts[!grepl("not held", hosts$Host.City..Country), ]
# remove the cities from the host column
countries <- hosts$Host.City..Country
countries <- gsub(".*,", "", countries)
hosts$Host.City..Country <- countries
# remove the Olympics that are ongoing (or are yet to occur) and generally
# tidy the table up. Also, only select post-1948 games.
hosts <- hosts %>%
select(-Olympiad) %>%
select(year = Year, host = Host.City..Country) %>%
filter(year < 2016 & year > 1948)
# remove white space from the names
hosts$host <- gsub(" ", "", hosts$host, fixed = TRUE)
# change host England to Great Britain.
# change SouthKorea to South Korea
# change USSR to Russia
hosts$host <- gsub("England", "Great Britain", hosts$host, fixed = TRUE)
hosts$host <- gsub("SouthKorea", "South Korea", hosts$host, fixed = TRUE)
hosts$host <- gsub("USSR", "Russia", hosts$host, fixed = TRUE)
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
### get the medal tables for each year and store them in one list
# get a vector of all years
years <- hosts$year
# create a list to store the medal tables
medal_tables <- list()
# loop over each year and retrieve the data from Wikipedia
for(i in 1:length(years)){
# what is the current year?
curr_year <- years[i]
# construct the relevant URL to the Wikipedia page
url <- paste("https://en.wikipedia.org/wiki/", curr_year,
"_Summer_Olympics_medal_table", sep = "")
# retrieve the data from this page
temp <- url %>%
html %>%
html_nodes("table")
# find the html table's position. The medal table is in a "sortable" Wiki
# table, so we search for this term and return its position in the list
position <- grep("sortable", temp)
# get the medal table. Add a new column storing the year
medals <- data.frame(html_table(temp[position], fill = TRUE))
medals <- medals %>%
mutate(Year = curr_year)
# change the names of the "Nation" column, as this is not consistent between
# games tables
colnames(medals)[2] <- "Nation"
# remove the weird symbols from the html file (Â)
nations <- medals$Nation
nations <- gsub("[^\\x{00}-\\x{7f}]", "", nations, perl = TRUE)
# we need to change "Soviet Union" to USSR for consistency
nations <- gsub("Soviet Union(URS)", "Russia(RUS)", nations, fixed = TRUE)
# also change West & East Germany to "Germany"
nations <- gsub("East Germany(GDR)", "Germany(GER)", nations, fixed = TRUE)
nations <- gsub("West Germany(FRG)", "Germany(GER)", nations, fixed = TRUE)
medals$Nation <- nations
# save the medal table and move to the next games
medal_tables[[i]] <- medals
}
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
### loop over each host, then find how many medals they won in each games and
### store it in data frame
# initialise the data frame
final_data <- data.frame(hosts)
final_data[, as.character(years)] <- 0
for(i in 1:length(hosts$host)){
# get the current host
curr_host <- hosts$host[i]
# loop over all years, find the number of medals won by the current host,
# and store it in final_data frame
for(j in 1:length(years)){
# what is the current year?
curr_year <- years[j]
# get the medal table for the current year
curr_medals <- medal_tables[[j]]
# get the row for the current host if it is present
curr_medals <- curr_medals %>%
filter(str_detect(Nation, curr_host))
# collate the number of medals won if there is data
if(nrow(curr_medals) > 0){
final_data[i, j + 2] <- sum(curr_medals$Total)
} else
final_data[i, j + 2] <- 0
} # end of each year loop
} # end of each host loop
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
### now do some plotting
pdf("medals.pdf", width = 12, height = 12)
# change the layout of the plotting window
par(mfrow = c(4, 4))
# loop over each hosting nation
for(i in 1:nrow(final_data)){
# get the current host's data for all years
host_data <- as.numeric(final_data[i, 3:ncol(final_data)])
# what is their mean number of medals won?
host_mean <- mean(host_data)
# plot the data!
plot(years, host_data, xlab = "Year", ylab = "Number of Medals", pch = 19,
type = "b", lwd = 2,
main = paste(hosts$host[i], "–", years[i], sep = ""))
abline(v = final_data$year[i], lty = "dashed", col = "blue", lwd = 1.5)
abline(h = host_mean, lty = "dashed", col = "red", lwd = 1.5)
}
#------------------------------------------------------------------------------
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment