Last active
July 31, 2022 08:32
-
-
Save kueda/0d84165aba157563cade to your computer and use it in GitHub Desktop.
Script to parse Esslinger's A Cumulative Checklist for the Lichen-forming, Lichenicolous and Allied Fungi of the Continental United States and Canada into machine-readable CSV
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#encoding: utf-8 | |
# | |
# Script to parse Esslinger's A Cumulative Checklist for the Lichen-forming, | |
# Lichenicolous and Allied Fungi of the Continental United States and Canada | |
# into machine-readable CSV. | |
# | |
# Esslinger's checklist (e.g. | |
# http://www.ndsu.edu/pubweb/~esslinge/chcklst/chcklst7.htm) is considered | |
# authoritative for North American lichens, but it's authored with MS Word and | |
# has inconsistent formatting. This script attempts to smooth that out and
# makes CSV suitable for machine processing. | |
# | |
# Usage: | |
# | |
# ruby esslinger.rb "https://www.ndsu.edu/pubweb/~esslinge/chcklst/chcklst7.htm" | |
# | |
require 'rubygems' | |
require 'open-uri' | |
require 'biodiversity' | |
require 'nokogiri' | |
require 'csv' | |
# Recursively flatten a Nokogiri node into a lightly-marked-up string.
# Bold runs (<strong>/<b>) are kept as <b>...</b>, paragraphs and <br>
# elements become "<br>", anchors are dropped, and runs of whitespace in
# text nodes collapse to a single space.
def parse(node)
  # MS Word emits space runs with odd encodings marked by this style;
  # normalize them to a plain space before anything else.
  return " " if node['style'] =~ /mso-spacerun/

  if node.children.size > 0
    text = node.children.map { |child| parse(child) }.join('')
    text = "<b>#{text}</b>" if %w(strong b).include?(node.name.downcase)
    text << "<br>" if node.name == 'p'
    text
  elsif node.name == 'br'
    "<br>"
  elsif node.name == 'a'
    # links carry no taxon text we want
    ""
  else
    node.inner_text.gsub(/\s+/, ' ')
  end
end
# Fetch the checklist page and flatten every <p> into one long string of
# lightly-marked-up text, dropping paragraphs that are only whitespace.
start = Time.now
url = ARGV[0]
# Kernel#open with a URL (via open-uri's monkey-patch) was deprecated in
# Ruby 2.7 and removed in Ruby 3.0; URI.open is the supported entry point.
html = Nokogiri.HTML(URI.open(url), nil, "UTF-8")
paragraphs = html.search('p')
text = paragraphs.map { |p|
  parsed = parse(p)
  parsed =~ /^\s*$/ ? nil : parsed
}.compact.join("")
# ScientificNameParser (biodiversity gem) turns a name string into a
# structured hash under the :scientificName key.
parser = ScientificNameParser.new
# Output accumulators; the first row of each is the CSV header.
synonyms = [%w(synonym verbatim current current_verbatim)]
names = [%w(name canonical verbatim lichenization)]
failures = []
# Current genus, carried forward from the most recent all-caps genus heading.
genus = nil
# Phrases marking the start of Esslinger's free-text annotations; everything
# from the first match onward is trimmed off before name parsing.
stop_words = [
  "record",
  "report",
  "in eastern",
  "apparently",
  "many old",
  "this name",
  "treated",
  "probably",
  "excluded",
  "north america",
  "misidentification",
  "not known",
  "type not",
  "identity uncertain",
  "but not",
  "erroneously listed",
  "a european",
  "may not",
  "identity not"
]
# Whether the previous line was a synonym entry. NOTE(review): used below to
# skip the line that follows a synonym — presumably annotation spillover from
# the synonym entry; confirm against the source document's formatting.
previous_was_synonym = false
# Walk the flattened document line by line ("<br>" is the line separator
# produced by parse), classifying each line as a genus heading, a synonym
# entry, or a current-name entry.
text.split("<br>").each do |line|
  next if line.strip.size == 0
  # The following means we're done with names and on to the citations
  break if line =~ /appendix.*specimen citations/i
  # Drop trailing "Syn.:"/"Syns.:" clauses and collapse whitespace.
  line = line.sub(/Syns?\.?\:.+$/, '').gsub(/\s+/, ' ').strip
  puts
  puts line.inspect
  # genus is all caps with at least 4 letters
  if new_genus = line[/^(<b>)?([A-Z]{4,})/, 2]
    genus = new_genus.capitalize
    puts "\tnew genus: #{genus}"
    previous_was_synonym = false
    next
  end
  # Species lines before any genus heading cannot be attributed to a genus.
  unless genus
    failures << line
    puts "\tNo genus, skipping..."
    next
  end
  # Try to determine if this was a current name based on bolding. Note that this
  # is imperfect. Esslinger's list contains maddening things like <strong><span
  # style="color:blue;font-weight:normal;mso-bidi-font-
  # weight:bold">pinguis</span></strong> which is intended to be a synonym
  is_current_name = false
  if line =~ /<b>.+?<\/b>/
    is_current_name = true
  end
  line = line.gsub( /<\/?b>/, "" )
  # Leading sigil encodes lichenization status: * lichenicolous, + saprophyte,
  # # uncertain; no sigil means an ordinary lichen. The sigil is stripped.
  lichenization = if new_line = line[/^\*(.+)/, 1]
    line = new_line
    "lichenicolous"
  elsif new_line = line[/^\+(.+)/, 1]
    line = new_line
    "saprophyte"
  elsif new_line = line[/^\#(.+)/, 1]
    line = new_line
    "uncertain"
  else
    "lichen"
  end
  # "old = current" lines are synonym entries.
  if line =~ /=/
    old_name, current_verbatim = line.split('=')
    current_verbatim ||= "UNPARSED"
    puts "\tSynonym: was #{genus} #{old_name}, now #{current_verbatim}"
    # Trim free-text annotations (see stop_words) before parsing.
    old_name = "#{genus} #{old_name}".split(/#{stop_words.join('|')}/i)[0]
    # The biodiversity parser can raise NoMethodError on malformed input;
    # treat that as an unparsed name rather than aborting.
    parsed_synonym = begin
      parser.parse(old_name)
    rescue NoMethodError
      nil
    end
    current_name = current_verbatim.split( /#{stop_words.join('|')}/i )[0]
    parsed_current = begin
      parser.parse( current_name )
    rescue NoMethodError
      nil
    end
    # Prefer the parser's canonical form; fall back to the raw string.
    synonym = if parsed_synonym && parsed_synonym[:scientificName] && parsed_synonym[:scientificName][:parsed]
      parsed_synonym[:scientificName][:canonical]
    else
      old_name
    end
    current = if parsed_current && parsed_current[:scientificName] && parsed_current[:scientificName][:parsed]
      canonical = parsed_current[:scientificName][:canonical]
      # Expand an abbreviated genus ("X.") to the current genus name.
      canonical.sub( /^#{genus[0]}\./, genus )
    else
      "UNPARSED"
    end
    synonyms << [synonym, old_name, current, current_verbatim]
    previous_was_synonym = true
    next
  end
  # Skip the line that immediately follows a synonym entry.
  next if previous_was_synonym
  next unless is_current_name
  # Esslinger often adds extra annotations after the authority that screw
  # things up, so this is a lame way to deal with them
  line = line.split(/#{stop_words.join('|')}/i)[0]
  name = "#{genus} #{line}"
  puts "\tName: #{name}"
  begin
    if (parsed_name = parser.parse(name)) && parsed_name[:scientificName] && parsed_name[:scientificName][:parsed]
      puts "\tCanonical: #{parsed_name[:scientificName][:canonical]}"
      # A canonical equal to the bare genus means the epithet was lost.
      if parsed_name[:scientificName][:canonical].strip == genus
        puts "\tSpecies was blank, skipping..."
        failures << line
        next
      end
      names << [parsed_name[:scientificName][:normalized], parsed_name[:scientificName][:canonical], name, lichenization]
    end
  rescue NoMethodError => e
    failures << line
    puts "\tFailed to parse scientific name, skipping..."
  end
  previous_was_synonym = false
end
# Write the accumulated rows (header row first) to CSV files named after the
# source document, then print a summary of what was parsed.
basename = File.basename(url)
synonyms_filename = "esslinger.#{basename}.synonyms.csv"
names_filename = "esslinger.#{basename}.names.csv"

CSV.open(synonyms_filename, 'w') do |csv|
  synonyms.each { |row| csv << row.map(&:strip) }
end
CSV.open(names_filename, 'w') do |csv|
  names.each { |row| csv << row.map(&:strip) }
end

puts
puts "#{failures.size} failed lines:"
failures.each do |line|
  puts "\t#{line}"
end
puts
puts "Parsed #{names.size} names, #{synonyms.size} synonyms, #{failures.size} failures in #{Time.now - start} s"
puts "Names written to #{names_filename}"
puts "Synonyms written to #{synonyms_filename}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment