Skip to content

Instantly share code, notes, and snippets.

@valentinitnelav
Last active February 18, 2018 19:29
Show Gist options
  • Save valentinitnelav/ccb9b59f33ca4c6d22a6d5100a06f9ca to your computer and use it in GitHub Desktop.
Save valentinitnelav/ccb9b59f33ca4c6d22a6d5100a06f9ca to your computer and use it in GitHub Desktop.
# =============================================================================================================
# Scrapes the TPL web page for all families of angiosperms and downloads the corresponding *.csv files.
# Then grabs the unique species names from this pile of data and builds a custom dictionary for a spell checker.
# =============================================================================================================
# _________ Read the family names from the TPL web page (Angiosperms only) _________ #
# Adapted from http://www.stat.berkeley.edu/~spector/s133/Readexample.html (parsing web pages).
# Fetch the raw HTML of the Angiosperms index page, one element per line.
thepage <- readLines("http://www.theplantlist.org/1.1/browse/A/")
# Family names appear in lines of this pattern:
#   <i class="family">Piperaceae</i></a>
# i.e. the name is always preceded by the HTML tag <i class="family"> and
# followed by </i></a>. The capture group ([^<]*) isolates the name itself.
mypattern <- '<i class=\"family\">([^<]*)</i></a>'
# Keep only the lines that contain a family entry
families <- grep(pattern = mypattern, thepage, value = TRUE)
# Replace each matching line with the captured family name only. Anchoring on
# the full tag (instead of on any text ending in y">) strips the markup both
# before and after the name in a single step.
families <- sub(paste0(".*", mypattern, ".*"), "\\1", families)
# Fail early with a clear message if nothing matched (e.g. the page layout
# changed); otherwise the script would only break much later on.
if (length(families) == 0) {
  stop("No family names found on the TPL page; the page layout may have changed.",
       call. = FALSE)
}
# _________ Download all csv files from TPL (Angiosperms only) _________ #
# This will take some time!
# Make sure the destination folder exists: download.file() does not create
# missing directories and fails if the folder is absent.
dir.create(file.path(getwd(), "Angiosperms"), showWarnings = FALSE, recursive = TRUE)
for (family in families) { # for each family in families
  # build the URL path to the csv file
  myURL <- paste0("http://www.theplantlist.org/1.1/browse/A/", family, "/", family, ".csv")
  # create a destination path
  myDestFile <- paste0(getwd(), "/Angiosperms/", family, ".csv")
  # download the *.csv file from the above URL to the destination path
  download.file(url = myURL, destfile = myDestFile)
}
# _________ File and Data management _________ #
# Read each csv file and pile them all in one big data table
library(data.table)
# Construct a character vector with the paths of all csv files (one per family,
# named by family). vapply() guarantees a character vector, also when
# `families` is empty, whereas sapply() would return a list in that case.
families.path <- vapply(
  families,
  # for each element (x.fam) of families, construct the path to its csv file
  function(x.fam) paste0(getwd(), "/Angiosperms/", x.fam, ".csv"),
  character(1)
)
# Read each path with fread and store each data table in a list
my.list <- lapply(
  families.path, # for each path in families.path (a character vector)
  # fread the csv file at that path
  function(my.path) {
    fread(input = my.path,
          sep = ",",
          header = TRUE,
          strip.white = TRUE,
          stringsAsFactors = FALSE)
  }
)
# Make a single data.table from the above list
TPL.Angiosperms <- rbindlist(my.list)
# Note: consider saving the data table object in case of repeated use
save(TPL.Angiosperms, file = "TPL.Angiosperms.rda") # save object
# load(file = "TPL.Angiosperms.rda") # load object
# or save as csv (write.csv always writes the header; it ignores col.names)
# write.csv(TPL.Angiosperms, "TPL.Angiosperms.csv", row.names=FALSE)
# _________ Build a dictionary for Hunspell spell checker _________ #
# Create a dictionary file with species and genera names.
# "A dictionary is a text file. It has one word per line, in alphabetical order.
# The first line of the file is the number of words. The text file must be in UTF-8 format."
# from: http://producthelp.sdl.com/sdl_trados_studio_2011/client_en/Setting_Preferences/Check_Spelling/HunspellSpellChecker.htm#Dictionary_Format
# The file should have the extension *.dic (even though it is actually a text file).
# First get the unique names for genera and species from the TPL.Angiosperms data table.
# You can also select only accepted records or apply other filters;
# for that check column 'Taxonomic status in TPL' in the TPL.Angiosperms table.
# For now I want all existing names in a single dictionary file.
# Store all unique names of genera and species in a single one-column data table.
# Consider also storing each unique species (e.g. "Quercus robur") in one row.
myGenera <- unique(TPL.Angiosperms[, .(Genus)]) # unique names of genera (as data table)
mySp <- unique(TPL.Angiosperms[, .(Species)])   # unique names of species (as data table)
# Pool both sets of names into one vector. Working on plain vectors avoids
# binding two tables whose column names differ (Genus vs. Species), and lets us
# drop names that occur in both columns as well as missing/empty entries, so
# every line of the dictionary is a real word and the word count is accurate.
all.names <- unique(c(myGenera$Genus, mySp$Species))
all.names <- all.names[!is.na(all.names) & nzchar(all.names)]
myNames <- data.table(Genus = all.names) # one-column data table of words
# It is important for the *.dic file to have the number of words in the first row,
# therefore add a row indicating the number of words (rows).
myNames <- rbindlist(list(
  # the nrow() output must be wrapped in list() for rbindlist() to accept it
  list(nrow(myNames)),
  myNames),
  # the count item has no column name, so bind strictly by position
  use.names = FALSE)
# Write the desired "*.dic" file to disk with write.table
dic.path <- paste0(getwd(), "/Angiosperms.dic")
write.table(myNames, dic.path, quote = FALSE, row.names = FALSE, col.names = FALSE)
# Write an empty *.aff file (the hunspell function will search for such a file
# and return an error if it is not found!).
# If you want to set rules in your *.aff file, then things get more complicated,
# check this for example: https://www.chromium.org/developers/how-tos/editing-the-spell-checking-dictionaries
# Here I am fine with no rules and just a simple empty *.aff file:
file.create(paste0(getwd(), "/Angiosperms.aff"))
# ____ Test the custom Angiosperms dictionary ____ #
# install.packages("hunspell")
library(hunspell)
# Take a random sample of 5 species names that don't need correction
set.seed(1)
test <- sample(TPL.Angiosperms$Species, 5)
test
# Output recorded by the author (the exact sample depends on the downloaded
# data and may differ between R versions):
# [1] "knafii" "cayennensis" "rupestris" "hispida" "dietrichiae"
# This is what happens when using the default English dictionary
hunspell_suggest(test)
# Now, use the custom Angiosperms dictionary (just mention the path).
# No need to mention the *.aff file path
# (hunspell presumes it is in the same folder as the *.dic file).
# Compute the suggestions once, show them, then reuse them below.
suggested <- hunspell_suggest(test, dict = dic.path)
suggested
# This will return the first suggestion only. Indexing with [1] yields NA for a
# word without any suggestion, where "[[" would stop with a subscript error.
vapply(suggested, function(s) s[1], character(1))
# [1] "knafii" "cayennensis" "rupestris" "hispida" "dietrichiae"
###
# One can also check a single word:
# 1) extra space at the end
hunspell_suggest("knafii ", dict = dic.path)
# [1] "knafii" "knafianum"
# 2) double letter
hunspell_suggest("nnicaraguensis", dict = dic.path)
# [1] "nicaraguensis" "nicaraguarensis" "micaraguensis" "nicaraguense" "sicariguensis"
# 3) wrong letter
hunspell_suggest("Cuercus", dict = dic.path)
# 4) get only the first suggestion
hunspell_suggest("nnicaraguensis", dict = dic.path)[[1]][1]
# [1] "nicaraguensis"
# _________ Extra: Using Peter Norvig's Spell Checker _________ #
# Store all unique names of genera and species in a single large character string
myGenera2 <- paste0(unique(TPL.Angiosperms$Genus), collapse = " ")
mySp2 <- paste0(unique(TPL.Angiosperms$Species), collapse = " ")
# Join the two strings built above. They must be separated by a space, otherwise
# the last genus and the first species would fuse into one bogus word.
myNames2 <- paste(myGenera2, mySp2, sep = " ")
# Build the word list for the spell checking function
# check http://www.sumsar.net/blog/2014/12/peter-norvigs-spell-checker-in-two-lines-of-r/
# Lower-case everything, split on any run of non-letters, count each word and
# keep the distinct words ordered from most to least frequent.
sorted_words <- names(sort(table(strsplit(tolower(myNames2), "[^a-z]+")), decreasing = TRUE))
# Splitting can yield an empty string (e.g. text starting with a non-letter);
# it is not a word, so never offer it as a correction.
sorted_words <- sorted_words[nzchar(sorted_words)]
# Return the best correction for `word` out of the global vector `sorted_words`:
# the first entry whose edit distance to `word` equals the smallest distance
# found, provided that distance is at most 2; otherwise `word` itself is returned.
correct <- function(word) {
  # edit distance between `word` and every known word
  distances <- adist(word, sorted_words)
  # accept only the closest matches, and never anything further than 2 edits away
  cutoff <- min(distances, 2)
  candidates <- sorted_words[distances <= cutoff]
  # fall back to the unchanged input when there is no candidate
  c(candidates, word)[1]
}
# Some examples of calling correct(); the commented lines show the output
# recorded by the author.
# 1) one extra trailing letter
correct("Zingiberx")
# [1] "zingiber"
# 2) a string that is not a plant name still gets mapped to a nearby known word
correct("abcd")
# [1] "aban"
# 3) wrong first letter
correct("Cuercus")
# [1] "quercus"
# Note that this is slower than the hunspell function!
# Also, corrections are always returned in lower case, because the word list
# is built from lower-cased names.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment