Last active
June 2, 2016 14:02
-
-
Save ibartomeus/9bbbb605c8e391fc863c8f5bd08dde77 to your computer and use it in GitHub Desktop.
OriginR demo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#This is a quick demo performed for Sevilla R users group | |
#Elena asked about IUCN data. You can retrive this data using taxize: https://github.com/ropensci/taxize | |
#Elena also suggested vectorbase can be scrapped. Maybe for the next hackaton? | |
#package OriginR | |
library(originr) | |
#define species (don't worry about typos) | |
sp <- c("Apis mellifera", "carpobrotus edulis", "Lavandula stoechas", | |
"Anchusa arvensis", "Alkana lutea", "Rhododendron ponticum", | |
"Anopheles gambiae", "Lantana camara", "aedes aegipty") | |
#we load a wrapper for taxize functions that check for synonims, misspellings, etc... | |
source("https://gist.githubusercontent.com/ibartomeus/b61be59e317d0a1e213814b2f1a1c776/raw/6a676ce854a15a7428133d30ffbfbb2d719e06b3/clean_species") | |
#and clean data | |
sp2 <- clean_species(sp) | |
sp2 | |
sp <- as.character(sp2$matched_name2) | |
#Is one of the worst invaders? | |
?gisd | |
gisd(sp, simplify = FALSE) | |
gisd(sp, simplify = TRUE) | |
#Invasive lists are incomplete by nature. Ley's flip the question | |
?is_native | |
is_native(sp[1], where = "Continental US", region = "america") | |
is_native(sp[3], where = "Continental US", region = "america") | |
lapply(sp[c(-1,-4)], is_native, where = "Continental US", region = "america") | |
is_native(sp[6], where = "Spain", region = "europe") | |
is_native(sp[6], where = "Britain", region = "europe") | |
lapply(sp, is_native, where = "Spain", region = "europe") | |
#is_native calls flora europaea function for plants | |
flora_europaea(sp) | |
sapply(sp, flora_europaea, simplify = FALSE) | |
#EXPLORE a webscrapping FUNCTION: | |
sp = "Anopheles gambiae" | |
#the workload is done by packages httr and xml2 | |
library(httr) | |
library(xml2) | |
flora_europaea | |
#---inside the function flora_europaea | |
#First we get the url and the parameters needed | |
genus <- strsplit(sp, " ")[[1]][1] | |
species <- strsplit(sp, " ")[[1]][2] | |
url <- "http://rbg-web2.rbge.org.uk/cgi-bin/nph-readbtree.pl/feout" | |
args <- list(FAMILY_XREF = "", GENUS_XREF = genus, SPECIES_XREF = species, | |
TAXON_NAME_XREF = "", RANK = "") | |
mssg(verbose, paste("Checking", sp)) | |
url_check <- GET(url, query = args) | |
warn_for_status(url_check) | |
#then we fetch the url html code | |
doc <- xml2::read_html(content(url_check, "text", encoding = "UTF-8"), | |
encoding = "UTF-8") | |
tables <- xml2::xml_find_all(doc, "//table") | |
#here we already retrieved the html code (doc) and tables (tables) and | |
#we know that the info we want is on the 3rd table | |
if (length(tables) < 3) { | |
mssg(verbose, "Species not found") | |
NULL | |
} | |
else { | |
text <- xml_text(tables[[3]], trim = FALSE) | |
#here we use regular expresions to extract just the Distribution data | |
#it's long because you need to separate native, alien, | |
#doubtful and extinct distributions see ?grep for info on how | |
#regular expresions work. | |
if (!grepl("Distribution:", text, perl = TRUE)) { | |
mssg(verbose, "Species with no distribution. Probably not native.") | |
} | |
else { | |
m_nat <- regexpr("Distribution: [A-Za-z ()?*%,]*", | |
text, perl = TRUE) | |
distr_nat <- regmatches(text, m_nat) | |
distr_status <- regmatches(distr_nat, gregexpr("[*][A-Z][a-z]", | |
distr_nat, perl = TRUE)) | |
distr_occ <- regmatches(distr_nat, gregexpr("[?][A-Z][a-z]", | |
distr_nat, perl = TRUE)) | |
distr_ext <- regmatches(distr_nat, gregexpr("[%][A-Z][a-z]", | |
distr_nat, perl = TRUE)) | |
distr_nat <- gsub(",", " ", distr_nat) | |
distr_nat <- gsub("(", " ", distr_nat, fixed = TRUE) | |
distr_nat <- gsub(")", "", distr_nat, fixed = TRUE) | |
distr_nat <- gsub("Distribution: ", "", distr_nat) | |
nat = exo = stat = oc = ex = NA | |
if (distr_nat != "") { | |
native <- strsplit(distr_nat, " ")[[1]] | |
delete <- which(!native %in% country$short) | |
if (length(delete) > 0) | |
native <- native[-delete] | |
nat <- sapply(native, function(x) { | |
country[which(x == country$short), "long"] | |
#This just converts short country names to long ones based on a tesaurus | |
#The tesaurus is not included in this gist, see the package for it. | |
}) | |
} | |
if (length(distr_status[[1]]) > 0) { | |
status <- gsub("*", "", distr_status[[1]], fixed = TRUE) | |
stat <- sapply(status, function(x) { | |
country[which(x == country$short), "long"] | |
}) | |
} | |
if (length(distr_occ[[1]]) > 0) { | |
occ <- gsub("?", "", distr_occ[[1]], fixed = TRUE) | |
oc <- sapply(occ, function(x) { | |
country[which(x == country$short), "long"] | |
}) | |
} | |
if (length(distr_ext[[1]]) > 0) { | |
ext <- gsub("%", "", distr_ext[[1]], fixed = TRUE) | |
ex <- sapply(ext, function(x) { | |
country[which(x == country$short), "long"] | |
}) | |
} | |
m_ex <- regexpr("[[][A-Za-z ()?*%,]*", text, perl = TRUE) | |
distr_exot <- regmatches(text, m_ex) | |
if (length(distr_exot) > 0) { | |
exotic <- strsplit(gsub("[", "", distr_exot, | |
fixed = TRUE), " ")[[1]] | |
exo <- sapply(exotic, function(x) { | |
country[which(x == country$short), "long"] | |
}) | |
} | |
#return all values. | |
list(native = as.character(nat), exotic = as.character(exo), | |
status_doubtful = as.character(stat), occurrence_doubtful = as.character(oc), | |
extinct = as.character(ex)) | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment