Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save arthurwuhoo/ef0ab7b012e8abb48467b7fd9ef39102 to your computer and use it in GitHub Desktop.

Select an option

Save arthurwuhoo/ef0ab7b012e8abb48467b7fd9ef39102 to your computer and use it in GitHub Desktop.
########################################################################
## DAY 6: SCRAPING DATA FROM A WEBPAGE - POST-CLASS EXERCISES
########################################################################
#------------------------------------------------
# EXERCISE 1
#------------------------------------------------
# Scrape two Wikipedia tables (population and beer consumption per capita)
# and join them on country name.
library(rvest)
library(dplyr)   # needed for inner_join() and %>% below
library(stringr)

## Getting the population table: first <table> on the page.
## fill = TRUE pads ragged rows so html_table() doesn't error.
population_df <- read_html("https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)")
population_df <- population_df %>% html_nodes("table") %>% .[[1]] %>% html_table(fill = TRUE)
str(population_df)

## Getting the beer table: likewise the first <table> on the page.
beer_df <- read_html("https://en.wikipedia.org/wiki/List_of_countries_by_beer_consumption_per_capita")
beer_df <- beer_df %>% html_nodes("table") %>% .[[1]] %>% html_table(fill = TRUE)
str(beer_df)

## We're going to be doing a join across the two datasets.
## We want to make sure that "Country" is the proper id variable to join on,
## so rename the population table's second column to match.
colnames(population_df)[2] <- "Country"

## Making the join.
joined_table <- inner_join(population_df, beer_df, by = "Country")
dim(joined_table)
dim(beer_df)
dim(population_df)

## It looks like we should at least suspect 58 rows matched on the output of
## the join, not just 33. So, let's investigate the issue.
## It seems like there are some numbers in brackets (Wikipedia's footnotes)
## that can appear at the end of a country label. So, let's strip these out
## from the country columns in both dfs, and trim any leftover whitespace
## so names like "France " still match "France".
beer_df$Country <- str_trim(str_replace_all(beer_df$Country, "\\[.+?\\]", ""))
population_df$Country <- str_trim(str_replace_all(population_df$Country, "\\[.+?\\]", ""))
better_joined_table <- inner_join(population_df, beer_df, by = "Country")
dim(better_joined_table) # ah. not quite 58, but at least a lot better.
#------------------------------------------------
# EXERCISE 2 (no database normalization)
#------------------------------------------------
# [30 minutes] Scrape Eventtiming for the results of the 2015 running of the
# 25km Stella Royal. Store these data in a suitably normalised database.
runners <- read_html("http://www.eventtiming.co.za/resultsracecapture.php?link=378")
# The results live in the fourth <table> on the page; parse just that one.
result_tables <- html_nodes(runners, "table")
runners_df <- html_table(result_tables[[4]])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment