Last active
February 18, 2018 19:29
-
-
Save valentinitnelav/ccb9b59f33ca4c6d22a6d5100a06f9ca to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ============================================================================================================= | |
# Scrapes The Plant List (TPL) web page for all angiosperm families and downloads the corresponding *.csv files.
# Then grabs the unique species names from this pile of data and builds a custom dictionary for a spell checker. | |
# ============================================================================================================= | |
# _________ Read the family names from the TPL web page (Angiosperms only) _________ #
# Adapted from http://www.stat.berkeley.edu/~spector/s133/Readexample.html
# (parsing a web page line by line).
thepage <- readLines("http://www.theplantlist.org/1.1/browse/A/")
# Family names sit on lines of this pattern:
#   <i class="family">Piperaceae</i></a>
# i.e. the name is always preceded by the HTML tag '<i class="family">' and
# followed by '</i></a>'. The capture group in `mypattern` isolates the name.
mypattern <- '<i class=\"family\">([^<]*)</i></a>'
# Keep only the lines that contain a family entry ...
families <- grep(pattern = mypattern, thepage, value = TRUE)
# ... and reduce each of them to the captured name. Anchoring the whole line
# with ".*" on both sides drops any markup before AND after the entry.
families <- sub(paste0(".*", mypattern, ".*"), "\\1", families)
# Stop early with a clear message instead of failing later on empty data.
if (length(families) == 0L) {
  stop("No family names found on the TPL page; its layout may have changed.",
       call. = FALSE)
}
# _________ Download all csv files from TPL (Angiosperms only) _________ #
# This will take some time!
# download.file() does not create missing folders, so make sure the
# destination folder exists first (nothing happens if it is already there).
dir.create(file.path(getwd(), "Angiosperms"), showWarnings = FALSE)
for (family in families) { # for each family in families
  # build the URL path to the csv file
  myURL <- paste0("http://www.theplantlist.org/1.1/browse/A/", family, "/", family, ".csv")
  # create a destination path
  myDestFile <- paste0(getwd(), "/Angiosperms/", family, ".csv")
  # download the *.csv file from the above URL to the destination path
  download.file(url = myURL, destfile = myDestFile)
}
# _________ File and Data management _________ #
# Read each csv file and pile them all up in one big data table.
library(data.table)
# Construct a character vector with the paths of all csv files, one per family.
# vapply() guarantees a character vector (sapply() would return a list when
# `families` is empty).
families.path <- vapply(
  families,
  function(x.fam) paste0(getwd(), "/Angiosperms/", x.fam, ".csv"),
  character(1)
)
# Read each path with fread() and store the resulting data tables in a list.
my.list <- lapply(
  families.path,
  function(my.path) {
    fread(input = my.path,
          sep = ",",
          header = TRUE,
          strip.white = TRUE,
          stringsAsFactors = FALSE)
  }
)
# Make a single data.table from the above list.
TPL.Angiosperms <- rbindlist(my.list)
# Note: consider saving the data table object in case of repeated use
save(TPL.Angiosperms, file = "TPL.Angiosperms.rda") # save object
# load(file = "TPL.Angiosperms.rda") # load object
# or save as csv (write.csv() always writes the header; it does not accept `col.names`)
# write.csv(TPL.Angiosperms, "TPL.Angiosperms.csv", row.names = FALSE)
# _________ Build a dictionary for Hunspell spell checker _________ #
# Create a dictionary file with species and genera names.
# "A dictionary is a text file. It has one word per line, in alphabetical order.
# The first line of the file is the number of words. The text file must be in UTF-8 format."
# from: http://producthelp.sdl.com/sdl_trados_studio_2011/client_en/Setting_Preferences/Check_Spelling/HunspellSpellChecker.htm#Dictionary_Format
# The file should have the extension *.dic (even though it is a plain text file).
# First get the unique names for genera and species from the TPL.Angiosperms data table.
# You can also select only accepted records or apply other filters;
# for that check column 'Taxonomic status in TPL' in the TPL.Angiosperms table.
# For now I want all existing names in a single dictionary file.
# Consider also storing each full species name (e.g. "Quercus robur") in one row.
myGenera <- unique(TPL.Angiosperms[, .(Genus)])   # unique genus names (as data table)
mySp <- unique(TPL.Angiosperms[, .(Species)])     # unique species epithets (as data table)
# The two tables have differently named columns ("Genus" vs "Species"), so
# stack them by position explicitly instead of relying on rbindlist()'s
# column-name check.
myNames <- rbindlist(list(myGenera, mySp), use.names = FALSE)
# Build the word list: mark as UTF-8, drop missing/empty entries, drop words
# that occur both as a genus and as an epithet, and sort alphabetically.
# method = "radix" sorts in C-locale order, i.e. independently of the
# machine's locale.
dic.words <- enc2utf8(as.character(myNames[[1L]]))
dic.words <- dic.words[!is.na(dic.words) & nzchar(dic.words)]
dic.words <- sort(unique(dic.words), method = "radix")
# The *.dic file must have the number of words in its first row.
myNames <- data.table(word = c(as.character(length(dic.words)), dic.words))
# Write the "*.dic" file to disk. useBytes = TRUE writes the UTF-8 bytes
# as they are, without re-encoding them to the native encoding.
dic.path <- paste0(getwd(), "/Angiosperms.dic")
writeLines(myNames$word, con = dic.path, useBytes = TRUE)
# Write the companion *.aff file (hunspell searches for it next to the *.dic
# file and returns an error if it is not found!).
# No affix rules are defined here; the file only declares the encoding of the
# dictionary, because Hunspell falls back to ISO8859-1 when no SET line is given.
# If you want to set rules in your *.aff file, then things get more complicated,
# check this for example: https://www.chromium.org/developers/how-tos/editing-the-spell-checking-dictionaries
writeLines("SET UTF-8", con = paste0(getwd(), "/Angiosperms.aff"))
# ____ Test the custom Angiosperms dictionary ____ #
# install.packages("hunspell")
library(hunspell)
# Take a random sample of 5 species names that don't need correction
set.seed(1)
test <- sample(TPL.Angiosperms$Species, 5)
test
# [1] "knafii" "cayennensis" "rupestris" "hispida" "dietrichiae"
# This is what happens when using the default English dictionary
hunspell_suggest(test)
# Now, use the custom Angiosperms dictionary (just mention the path).
# No need to mention the *.aff file path
# (hunspell presumes it is in the same folder as the *.dic file).
suggested <- hunspell_suggest(test, dict = dic.path)
suggested
# This will return the first suggestion only. A word without any suggestion
# yields NA instead of raising a "subscript out of bounds" error.
vapply(
  suggested,
  function(s) if (length(s) > 0L) s[[1L]] else NA_character_,
  character(1)
)
# [1] "knafii" "cayennensis" "rupestris" "hispida" "dietrichiae"
###
# One can also check a single word:
# 1) extra space at the end
hunspell_suggest("knafii ", dict = dic.path)
# [1] "knafii" "knafianum"
# 2) double letter
hunspell_suggest("nnicaraguensis", dict = dic.path)
# [1] "nicaraguensis" "nicaraguarensis" "micaraguensis" "nicaraguense" "sicariguensis"
# 3) wrong letter
hunspell_suggest("Cuercus", dict = dic.path)
# 4) get only the first suggestion
hunspell_suggest("nnicaraguensis", dict = dic.path)[[1]][1]
# [1] "nicaraguensis"
# _________ Extra: Using Peter Norvig's Spell Checker _________ #
# Store all unique names of genera and species in a single large string
myGenera2 <- paste0(unique(TPL.Angiosperms$Genus), collapse = " ")
mySp2 <- paste0(unique(TPL.Angiosperms$Species), collapse = " ")
# Join the two strings (not the data tables myGenera / mySp) with a space,
# so that the last genus and the first epithet do not fuse into one word.
myNames2 <- paste(myGenera2, mySp2)
# Build the word list for the spell checking function:
# lower-case everything, split on anything that is not a-z, and order the
# distinct words by decreasing frequency.
# check http://www.sumsar.net/blog/2014/12/peter-norvigs-spell-checker-in-two-lines-of-r/
sorted_words <- names(sort(table(strsplit(tolower(myNames2), "[^a-z]+")), decreasing = TRUE))
# Suggest a correction for a single word.
#
# word:       the (possibly misspelled) word to correct.
# dictionary: character vector of known words, ordered by preference;
#             defaults to `sorted_words`, which is ordered by decreasing
#             frequency.
#
# Returns the first dictionary word at the smallest edit distance from `word`,
# provided that distance is at most 2; otherwise returns `word` unchanged.
# Case is ignored when measuring the distance, so a capitalised query is not
# penalised against the lower-cased word list.
correct <- function(word, dictionary = sorted_words) {
  # compute the edit distances once and reuse them
  dists <- adist(word, dictionary, ignore.case = TRUE)
  cutoff <- min(dists, 2)
  c(dictionary[dists <= cutoff], word)[1]
}
# Some examples (the expected output is shown below each call)
correct("Zingiberx")
# [1] "zingiber"
correct("abcd")
# [1] "aban"
correct("Cuercus")
# [1] "quercus"
# Note that this is slower than the hunspell function!
# Also, corrections are always returned in lower case.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment