Last active
March 8, 2016 20:33
-
-
Save sjkiss/2853524ea6d2ad5ebd6f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load two libraries
library(xml2)
library(rvest)
# Paste the digits 1 through 12 into the base URL to create 12 separate URLs,
# one for each batch. The result is kept as a list with one URL per element.
urls <- as.list(paste0(
  "http://www.chemicalsubstanceschimiques.gc.ca/challenge-defi/batch-lot-",
  seq_len(12),
  "/index-eng.php"
))
# Check
urls
# Read the HTML code from each batch home page; `batches` holds one parsed
# document per URL, in the same order as `urls`.
batches <- lapply(urls, read_html)
# Get each table whose caption contains the phrase 'that meet'. The tables
# listing the substances declared toxic are all headed with the phrase
# "Substances that meet one or more criteria in section 64 of the Canadian
# Environmental Protection Act, 1999".
# The XPath matches the caption first and then steps up ("/..") to its table.
# Store the matches, one entry per batch, in the list `toxics`.
toxics <- lapply(
  batches,
  function(page) {
    html_nodes(page, xpath = ".//table/caption[contains(., 'that meet')]/..")
  }
)
# Repeat the selection with the phrase 'that do not meet', because the tables
# listing the non-toxic substances are headed with the phrase
# "Substances that do not meet any of the criteria in section 64 of the
# Canadian Environmental Protection Act, 1999".
# Store the matches, one entry per batch, in the list `nontoxics`.
nontoxics <- lapply(
  batches,
  function(page) {
    html_nodes(page, xpath = ".//table/caption[contains(., 'that do not meet')]/..")
  }
)
# Get the rows of the toxic-substance tables: all <tr> nodes found in each
# batch's selected table(s).
toxic_rows <- lapply(toxics, function(tables) xml_find_all(tables, ".//tr"))
# Count the rows found for each batch.
toxic_row_counts <- vapply(toxic_rows, length, integer(1))
# Subtract 1 from each count to account for the header row in each table.
toxic_counts <- toxic_row_counts - 1
# Correct the last value: there were no substances found toxic in the final
# batch (batch 12), so its count is set to zero.
toxic_counts[12] <- 0
# Sum across batches and store the total number of toxic substances.
number_toxics <- sum(toxic_counts)
number_toxics
# The following steps replicate the preceding ones for the tables of the
# non-toxic substances.
# Get all <tr> nodes found in each batch's selected table(s).
non_toxic_rows <- lapply(nontoxics, function(tables) xml_find_all(tables, ".//tr"))
non_toxic_rows
# Count the rows found for each batch.
non_toxic_row_counts <- lapply(non_toxic_rows, length)
non_toxic_row_counts
# Subtract 1 from each count to account for the header row, then sum across
# batches to get the total number of non-toxic substances.
number_non_toxics <- sum(unlist(non_toxic_row_counts) - 1)
# Take the number of toxic substances as a proportion of all substances
# counted (toxic plus non-toxic). Both operands are already scalar totals.
number_toxics / (number_toxics + number_non_toxics)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment