valentinitnelav · February 18, 2018 19:29
diff --git a/Dictionary of Angiosperms.R b/Dictionary of Angiosperms.R
 # =============================================================================================================
 # Scrapes the TPL web page for all families of angiosperms and downloads the corresponding *.csv files.
 # Then grabs the unique species names from this pile of data and builds a custom dictionary for a spell checker. 
 # =============================================================================================================

 # _________ Read the family names from the TPL web page (Angiosperms only) _________ #
 # Adapted from http://www.stat.berkeley.edu/~spector/s133/Readexample.html (parsing web pages).
 # Fetch the raw HTML of the angiosperm index page, one vector element per line.
 thepage <- readLines("http://www.theplantlist.org/1.1/browse/A/")
 # Family names sit on lines that contain markup of this form:
 #   <i class="family">Piperaceae</i></a>
 # i.e. the name is preceded by the tag '<i class="family">' and followed by '</i></a>'.
 # The capture group ([^<]*) in the pattern below is the family name itself.
 mypattern <- '<i class=\"family\">([^<]*)</i></a>'
 # Keep only the lines that contain a family entry.
 families <- grep(pattern = mypattern, thepage, value = TRUE)
 # Replace every matching line by its capture group. Extracting the group directly
 # (instead of deleting the text before and after it in two steps) also drops any
 # markup that follows '</i></a>' on the same line, so the names are safe to reuse in URLs.
 families <- sub(paste0(".*", mypattern, ".*"), "\\1", families)

 # _________ Download all csv files from TPL (Angiosperms only) _________ #
 # This will take some time!
 # Every file is stored in an "Angiosperms" folder inside the working directory.
 # download.file() does not create missing folders, so make sure the folder exists first
 # (showWarnings = FALSE keeps this quiet when the folder is already there).
 myDestDir <- file.path(getwd(), "Angiosperms")
 dir.create(myDestDir, showWarnings = FALSE, recursive = TRUE)
 for (family in families) { # one csv file per family
   # URL of the csv file: <base>/<family>/<family>.csv
   myURL <- paste0("http://www.theplantlist.org/1.1/browse/A/", family, "/", family, ".csv")
   # local destination: <working dir>/Angiosperms/<family>.csv
   myDestFile <- file.path(myDestDir, paste0(family, ".csv"))
   # download the *.csv file from the above URL to the destination path
   download.file(url = myURL, destfile = myDestFile)
 }

 # _________ File and Data management _________ #
 # Read each downloaded csv file and pile them up in one big data table.
 library(data.table)
 # Character vector with the paths of all csv files. file.path() and paste0() are
 # vectorized, so no sapply() wrapper is needed and the result is always a
 # character vector (also when `families` is empty).
 families.path <- file.path(getwd(), "Angiosperms", paste0(families, ".csv"))
 # Read each path with fread and store the resulting data tables in a list.
 my.list <- lapply(
   families.path,
   function(my.path) {
     fread(
       input = my.path,
       sep = ",",
       header = TRUE,
       strip.white = TRUE,
       stringsAsFactors = FALSE
     )
   }
 )
 # Stack the per-family tables into a single data.table.
 TPL.Angiosperms <- rbindlist(my.list)
 # Note: consider saving the data table object in case of repeated use
 save(TPL.Angiosperms, file = "TPL.Angiosperms.rda") # save object
 # load(file = "TPL.Angiosperms.rda") # load object
 # or save as csv (write.csv always writes the header; col.names cannot be set)
 # write.csv(TPL.Angiosperms, "TPL.Angiosperms.csv", row.names = FALSE)

 # _________ Build a dictionary for Hunspell spell checker  _________ #
 # Goal: a dictionary file holding the genus names and the species names.
 # Format, quoting the SDL documentation:
 # "A dictionary is a text file. It has one word per line, in alphabetical order.
 # The first line of the file is the number of words. The text file must be in UTF-8 format."
 # from: http://producthelp.sdl.com/sdl_trados_studio_2011/client_en/Setting_Preferences/Check_Spelling/HunspellSpellChecker.htm#Dictionary_Format
 # The file carries the extension *.dic even though it is a plain text file.
 # All names found in TPL.Angiosperms are used here. To keep only accepted records
 # (or apply other filters), see column 'Taxonomic status in TPL' of TPL.Angiosperms.

 # Collect the distinct genus names and the distinct species names, each as a
 # one-column data table, and stack them into a single one-column data table.
 # (One could also store each full species name, e.g. "Quercus robur", in one row.)
 myGenera <- unique(TPL.Angiosperms[, list(Genus)])
 mySp <- unique(TPL.Angiosperms[, list(Species)])
 myNames <- rbindlist(list(myGenera, mySp))

 # The first row of a *.dic file must be the number of words, so put the row
 # count on top. rbindlist() expects list-like items, hence the count is wrapped in list().
 word.count <- list(nrow(myNames))
 myNames <- rbindlist(list(word.count, myNames))

 # Write the "*.dic" file into the working directory: one entry per line,
 # without quotes, row names or a header.
 dic.path <- file.path(getwd(), "Angiosperms.dic")
 write.table(
   x = myNames,
   file = dic.path,
   quote = FALSE,
   row.names = FALSE,
   col.names = FALSE
 )

 # hunspell looks for an *.aff file as well and returns an error if none is found.
 # Affix rules make things more complicated, see for example
 # https://www.chromium.org/developers/how-tos/editing-the-spell-checking-dictionaries
 # No rules are needed here, so an empty *.aff file is created:
 aff.path <- file.path(getwd(), "Angiosperms.aff")
 file.create(aff.path)

 # ____ Test the custom Angiosperms dictionary ____ #
 # install.packages("hunspell")
 library(hunspell)
 # Take a reproducible random sample of 5 species names that don't need correction
 set.seed(1)
 test <- sample(TPL.Angiosperms$Species, 5)
 test
 # [1] "knafii"      "cayennensis" "rupestris"   "hispida"     "dietrichiae"

 # This is what happens when using the default English dictionary
 hunspell_suggest(test)

 # Now use the custom Angiosperms dictionary (just pass the path of the *.dic file).
 # No need to mention the *.aff file path
 # (hunspell presumes it is in the same folder as the *.dic file).
 hunspell_suggest(test, dict = dic.path)
 # Keep only the first suggestion for each word. `[` is used instead of `[[`
 # so that a word without any suggestion gives NA rather than a
 # "subscript out of bounds" error.
 suggested <- hunspell_suggest(test, dict = dic.path)
 vapply(suggested, function(candidates) candidates[1], character(1))
 # [1] "knafii"      "cayennensis" "rupestris"   "hispida"     "dietrichiae"

 ###
 # A single word can be checked as well:
 # 1) extra space at the end
 hunspell_suggest("knafii ", dict = dic.path)
 # [1] "knafii"    "knafianum"

 # 2) double letter
 hunspell_suggest("nnicaraguensis", dict = dic.path)
 # [1] "nicaraguensis"   "nicaraguarensis" "micaraguensis"   "nicaraguense"    "sicariguensis"

 # 3) wrong letter
 hunspell_suggest("Cuercus", dict = dic.path)

 # 4) get only the first suggestion
 hunspell_suggest("nnicaraguensis", dict = dic.path)[[1]][1]
 # [1] "nicaraguensis"


 # _________ Extra: Using Peter Norvig's Spell Checker  _________ #
 # Store all unique names of genera and species in a single large string,
 # with the words separated by spaces.
 myGenera2 <- paste0(unique(TPL.Angiosperms$Genus), collapse = " ")
 mySp2 <- paste0(unique(TPL.Angiosperms$Species), collapse = " ")
 # Join the two strings built just above (myGenera2 / mySp2). paste() puts a
 # space between them, so the last genus and the first species stay separate words.
 myNames2 <- paste(myGenera2, mySp2)

 # Build the vocabulary for the spell checking function
 # check http://www.sumsar.net/blog/2014/12/peter-norvigs-spell-checker-in-two-lines-of-r/
 # Lower-case the text, split it on anything that is not a-z, and order the
 # resulting words from most to least frequent.
 sorted_words <- names(sort(table(strsplit(tolower(myNames2), "[^a-z]+")), decreasing = TRUE))
 # Suggest a correction for `word` from the global vocabulary `sorted_words`.
 # Returns the first vocabulary entry (in the order of `sorted_words`) whose edit
 # distance to `word` equals the smallest distance found, provided that distance
 # is at most 2; if no entry is that close, `word` itself is returned unchanged.
 correct <- function(word) {
   # edit distance from `word` to every vocabulary entry, computed only once
   word_dists <- adist(word, sorted_words)
   # allowed distance: the best one found, but never more than 2
   closest <- sorted_words[word_dists <= min(word_dists, 2)]
   # appending `word` makes it the fallback when nothing is close enough
   c(closest, word)[1]
 }

 # Some examples (the commented lines show the output recorded by the author)
 correct("Zingiberx")
 # [1] "zingiber"
 correct("abcd")
 # [1] "aban"
 correct("Cuercus")
 # [1] "quercus"
 # Author's note: this is slower than the hunspell function!
 # Also, the vocabulary is lower-cased when it is built, so the corrections
 # taken from it are always in lower case.
	# =============================================================================================================
	# Scrapes the TPL web page for all families of angiosperms and downloads the corresponding *.csv files.
	# Then grabs the unique species names from this pile of data and builds a custom dictionary for a spell checker.
	# =============================================================================================================

	# _________ Read the family names from the TPL web page (Angiosperms only) _________ #
	# Adapted from http://www.stat.berkeley.edu/~spector/s133/Readexample.html (parsing web pages).
	# Fetch the raw HTML of the angiosperm index page, one vector element per line.
	thepage <- readLines("http://www.theplantlist.org/1.1/browse/A/")
	# Family names sit on lines that contain markup of this form:
	#   <i class="family">Piperaceae</i></a>
	# i.e. the name is preceded by the tag '<i class="family">' and followed by '</i></a>'.
	# The capture group ([^<]*) in the pattern below is the family name itself.
	mypattern <- '<i class=\"family\">([^<]*)</i></a>'
	# Keep only the lines that contain a family entry.
	families <- grep(pattern = mypattern, thepage, value = TRUE)
	# Replace every matching line by its capture group. Extracting the group directly
	# (instead of deleting the text before and after it in two steps) also drops any
	# markup that follows '</i></a>' on the same line, so the names are safe to reuse in URLs.
	families <- sub(paste0(".*", mypattern, ".*"), "\\1", families)

	# _________ Download all csv files from TPL (Angiosperms only) _________ #
	# This will take some time!
	# Every file is stored in an "Angiosperms" folder inside the working directory.
	# download.file() does not create missing folders, so make sure the folder exists first
	# (showWarnings = FALSE keeps this quiet when the folder is already there).
	myDestDir <- file.path(getwd(), "Angiosperms")
	dir.create(myDestDir, showWarnings = FALSE, recursive = TRUE)
	for (family in families) { # one csv file per family
	  # URL of the csv file: <base>/<family>/<family>.csv
	  myURL <- paste0("http://www.theplantlist.org/1.1/browse/A/", family, "/", family, ".csv")
	  # local destination: <working dir>/Angiosperms/<family>.csv
	  myDestFile <- file.path(myDestDir, paste0(family, ".csv"))
	  # download the *.csv file from the above URL to the destination path
	  download.file(url = myURL, destfile = myDestFile)
	}

	# _________ File and Data management _________ #
	# Read each downloaded csv file and pile them up in one big data table.
	library(data.table)
	# Character vector with the paths of all csv files. file.path() and paste0() are
	# vectorized, so no sapply() wrapper is needed and the result is always a
	# character vector (also when `families` is empty).
	families.path <- file.path(getwd(), "Angiosperms", paste0(families, ".csv"))
	# Read each path with fread and store the resulting data tables in a list.
	my.list <- lapply(
	  families.path,
	  function(my.path) {
	    fread(
	      input = my.path,
	      sep = ",",
	      header = TRUE,
	      strip.white = TRUE,
	      stringsAsFactors = FALSE
	    )
	  }
	)
	# Stack the per-family tables into a single data.table.
	TPL.Angiosperms <- rbindlist(my.list)
	# Note: consider saving the data table object in case of repeated use
	save(TPL.Angiosperms, file = "TPL.Angiosperms.rda") # save object
	# load(file = "TPL.Angiosperms.rda") # load object
	# or save as csv (write.csv always writes the header; col.names cannot be set)
	# write.csv(TPL.Angiosperms, "TPL.Angiosperms.csv", row.names = FALSE)

	# _________ Build a dictionary for Hunspell spell checker _________ #
	# Goal: a dictionary file holding the genus names and the species names.
	# Format, quoting the SDL documentation:
	# "A dictionary is a text file. It has one word per line, in alphabetical order.
	# The first line of the file is the number of words. The text file must be in UTF-8 format."
	# from: http://producthelp.sdl.com/sdl_trados_studio_2011/client_en/Setting_Preferences/Check_Spelling/HunspellSpellChecker.htm#Dictionary_Format
	# The file carries the extension *.dic even though it is a plain text file.
	# All names found in TPL.Angiosperms are used here. To keep only accepted records
	# (or apply other filters), see column 'Taxonomic status in TPL' of TPL.Angiosperms.

	# Collect the distinct genus names and the distinct species names, each as a
	# one-column data table, and stack them into a single one-column data table.
	# (One could also store each full species name, e.g. "Quercus robur", in one row.)
	myGenera <- unique(TPL.Angiosperms[, list(Genus)])
	mySp <- unique(TPL.Angiosperms[, list(Species)])
	myNames <- rbindlist(list(myGenera, mySp))

	# The first row of a *.dic file must be the number of words, so put the row
	# count on top. rbindlist() expects list-like items, hence the count is wrapped in list().
	word.count <- list(nrow(myNames))
	myNames <- rbindlist(list(word.count, myNames))

	# Write the "*.dic" file into the working directory: one entry per line,
	# without quotes, row names or a header.
	dic.path <- file.path(getwd(), "Angiosperms.dic")
	write.table(
	  x = myNames,
	  file = dic.path,
	  quote = FALSE,
	  row.names = FALSE,
	  col.names = FALSE
	)

	# hunspell looks for an *.aff file as well and returns an error if none is found.
	# Affix rules make things more complicated, see for example
	# https://www.chromium.org/developers/how-tos/editing-the-spell-checking-dictionaries
	# No rules are needed here, so an empty *.aff file is created:
	aff.path <- file.path(getwd(), "Angiosperms.aff")
	file.create(aff.path)

	# ____ Test the custom Angiosperms dictionary ____ #
	# install.packages("hunspell")
	library(hunspell)
	# Take a reproducible random sample of 5 species names that don't need correction
	set.seed(1)
	test <- sample(TPL.Angiosperms$Species, 5)
	test
	# [1] "knafii" "cayennensis" "rupestris" "hispida" "dietrichiae"

	# This is what happens when using the default English dictionary
	hunspell_suggest(test)

	# Now use the custom Angiosperms dictionary (just pass the path of the *.dic file).
	# No need to mention the *.aff file path
	# (hunspell presumes it is in the same folder as the *.dic file).
	hunspell_suggest(test, dict = dic.path)
	# Keep only the first suggestion for each word. `[` is used instead of `[[`
	# so that a word without any suggestion gives NA rather than a
	# "subscript out of bounds" error.
	suggested <- hunspell_suggest(test, dict = dic.path)
	vapply(suggested, function(candidates) candidates[1], character(1))
	# [1] "knafii" "cayennensis" "rupestris" "hispida" "dietrichiae"

	###
	# A single word can be checked as well:
	# 1) extra space at the end
	hunspell_suggest("knafii ", dict = dic.path)
	# [1] "knafii" "knafianum"

	# 2) double letter
	hunspell_suggest("nnicaraguensis", dict = dic.path)
	# [1] "nicaraguensis" "nicaraguarensis" "micaraguensis" "nicaraguense" "sicariguensis"

	# 3) wrong letter
	hunspell_suggest("Cuercus", dict = dic.path)

	# 4) get only the first suggestion
	hunspell_suggest("nnicaraguensis", dict = dic.path)[[1]][1]
	# [1] "nicaraguensis"


	# _________ Extra: Using Peter Norvig's Spell Checker _________ #
	# Store all unique names of genera and species in a single large string,
	# with the words separated by spaces.
	myGenera2 <- paste0(unique(TPL.Angiosperms$Genus), collapse = " ")
	mySp2 <- paste0(unique(TPL.Angiosperms$Species), collapse = " ")
	# Join the two strings built just above (myGenera2 / mySp2). paste() puts a
	# space between them, so the last genus and the first species stay separate words.
	myNames2 <- paste(myGenera2, mySp2)

	# Build the vocabulary for the spell checking function
	# check http://www.sumsar.net/blog/2014/12/peter-norvigs-spell-checker-in-two-lines-of-r/
	# Lower-case the text, split it on anything that is not a-z, and order the
	# resulting words from most to least frequent.
	sorted_words <- names(sort(table(strsplit(tolower(myNames2), "[^a-z]+")), decreasing = TRUE))
	# Suggest a correction for `word` from the global vocabulary `sorted_words`.
	# Returns the first vocabulary entry (in the order of `sorted_words`) whose edit
	# distance to `word` equals the smallest distance found, provided that distance
	# is at most 2; if no entry is that close, `word` itself is returned unchanged.
	correct <- function(word) {
	  # edit distance from `word` to every vocabulary entry, computed only once
	  word_dists <- adist(word, sorted_words)
	  # allowed distance: the best one found, but never more than 2
	  closest <- sorted_words[word_dists <= min(word_dists, 2)]
	  # appending `word` makes it the fallback when nothing is close enough
	  c(closest, word)[1]
	}

	# Some examples (the commented lines show the output recorded by the author)
	correct("Zingiberx")
	# [1] "zingiber"
	correct("abcd")
	# [1] "aban"
	correct("Cuercus")
	# [1] "quercus"
	# Author's note: this is slower than the hunspell function!
	# Also, the vocabulary is lower-cased when it is built, so the corrections
	# taken from it are always in lower case.