Created
June 9, 2016 12:55
-
-
Save arthurwuhoo/ef0ab7b012e8abb48467b7fd9ef39102 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
########################################################################
## DAY 6: SCRAPING DATA FROM A WEBPAGE - POST-CLASS EXERCISES
########################################################################

#------------------------------------------------
# EXERCISE 1
#------------------------------------------------
# Scrape two Wikipedia tables (population and beer consumption per country)
# and join them on the country name.
library(rvest)   # read_html(), html_nodes(), html_table(), %>%
library(dplyr)   # inner_join() -- not provided by rvest or stringr
library(stringr) # str_replace_all(), str_trim()

## Getting the population table: parse the page, then convert the first
## <table> node into a data frame.
population_df <- read_html(
  "https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)"
)
population_df <- population_df %>%
  html_nodes("table") %>%
  .[[1]] %>%
  html_table(fill = TRUE)
str(population_df)

## Getting the beer table the same way.
beer_df <- read_html(
  "https://en.wikipedia.org/wiki/List_of_countries_by_beer_consumption_per_capita"
)
beer_df <- beer_df %>%
  html_nodes("table") %>%
  .[[1]] %>%
  html_table(fill = TRUE)
str(beer_df)

## We're going to join the two datasets, so make sure "Country" is the id
## variable in both: the second column of the population table is renamed
## to match the beer table's "Country" column.
colnames(population_df)[2] <- "Country"

## First attempt at the join, with the sizes of all three tables for comparison.
joined_table <- inner_join(population_df, beer_df, by = "Country")
dim(joined_table)
dim(beer_df)
dim(population_df)

## When this was written, one would expect about 58 rows to match in the join,
## not just 33, so let's investigate the issue.
# Some country labels end with a number in square brackets (Wikipedia's
# footnote markers). Strip these from the country column of both data frames,
# and trim any whitespace left behind so the names can match exactly.
beer_df$Country <- str_trim(
  str_replace_all(beer_df$Country, "\\[.+?\\]", "")
)
population_df$Country <- str_trim(
  str_replace_all(population_df$Country, "\\[.+?\\]", "")
)

better_joined_table <- inner_join(population_df, beer_df, by = "Country")
dim(better_joined_table) # ah. not quite 58, but at least a lot better.
#------------------------------------------------
# EXERCISE 2 (no database normalization)
#------------------------------------------------
# [30 minutes] Scrape Eventtiming for the results of the 2015 running of the
# 25km Stella Royal. Store these data in a suitably normalised database.
# Only the scraping step is done here; nothing is written to a database.

# Download and parse the results page.
runners <- read_html("http://www.eventtiming.co.za/resultsracecapture.php?link=378")

# Convert the fourth <table> node on the page into a data frame
# (presumably the results table -- verify against the live page).
runners_df <- html_table(html_nodes(runners, "table")[[4]])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment