Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save arthurwuhoo/ef0ab7b012e8abb48467b7fd9ef39102 to your computer and use it in GitHub Desktop.

Select an option

Save arthurwuhoo/ef0ab7b012e8abb48467b7fd9ef39102 to your computer and use it in GitHub Desktop.
########################################################################
## DAY 6: SCRAPING DATA FROM A WEBPAGE - POST-CLASS EXERCISES
########################################################################
#------------------------------------------------
# EXERCISE 1
#------------------------------------------------
# Scrape two Wikipedia tables (population and beer consumption per capita)
# and join them on country name.
library(rvest)
library(dplyr)   # needed for inner_join() and %>% below
library(stringr)

## Getting the population table: first <table> on the page.
## fill = TRUE pads ragged rows so html_table() doesn't error.
population_df <- read_html("https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)")
population_df <- population_df %>% html_nodes("table") %>% .[[1]] %>% html_table(fill = TRUE)
str(population_df)

## Getting the beer table: likewise the first <table> on the page.
beer_df <- read_html("https://en.wikipedia.org/wiki/List_of_countries_by_beer_consumption_per_capita")
beer_df <- beer_df %>% html_nodes("table") %>% .[[1]] %>% html_table(fill = TRUE)
str(beer_df)

## We're going to be doing a join across the two datasets.
## We want to make sure that "Country" is the proper id variable to join on,
## so rename the population table's second column to match.
colnames(population_df)[2] <- "Country"

## Making the join.
joined_table <- inner_join(population_df, beer_df, by = "Country")
dim(joined_table)
dim(beer_df)
dim(population_df)

## It looks like we should at least suspect 58 rows matched on the output of
## the join, not just 33. So, let's investigate the issue.
## It seems like there are some numbers in brackets (Wikipedia's footnotes)
## that can appear at the end of a country label. So, let's strip these out
## from the country columns in both dfs, and trim any leftover whitespace
## so names like "France " still match "France".
beer_df$Country <- str_trim(str_replace_all(beer_df$Country, "\\[.+?\\]", ""))
population_df$Country <- str_trim(str_replace_all(population_df$Country, "\\[.+?\\]", ""))
better_joined_table <- inner_join(population_df, beer_df, by = "Country")
dim(better_joined_table) # ah. not quite 58, but at least a lot better.
#------------------------------------------------
# EXERCISE 2 (no database normalization)
#------------------------------------------------
# [30 minutes] Scrape Eventtiming for the results of the 2015 running of the
# 25km Stella Royal. Store these data in a suitably normalised database.
runners <- read_html("http://www.eventtiming.co.za/resultsracecapture.php?link=378")
# The results live in the fourth <table> on the page; parse just that one.
result_tables <- html_nodes(runners, "table")
runners_df <- html_table(result_tables[[4]])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment