Enriching ASFIS with Wikidata or FishBase
#!/usr/bin/env ruby
#
# We start with the list at:
# http://www.fao.org/fishery/collection/asfis
# Then we query FishBase (and SeaLifeBase) for the species id and the
# English and Dutch (NL) common names. Results are written to
# asfis_fishbase.csv.
#
# Before running this script, make sure you have the ASFIS spreadsheet,
# and make sure you've created the CACHEDIR directory (it caches every
# response, so you can re-run the script without repeating the requests).
#
# More sources:
# http://www.fishbase.org/
# http://www.sealifebase.org/
# http://www.fao.org/fishery/species/search
#
require 'csv'
require 'net/http'
require 'nokogiri'
require 'roo'
FAO_SHEET = 'ASFIS_sp_2019.xlsx'
CACHEDIR = './fishbase-cache'
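# If you'd rather have the script create the cache directory itself, you
# could uncomment these two lines (FileUtils is part of the standard library):
# require 'fileutils'
# FileUtils.mkdir_p(CACHEDIR)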
out = CSV.open('asfis_fishbase.csv', 'w')
fao = Roo::Spreadsheet.open(FAO_SHEET)
# Run the block and cache its result under CACHEDIR, keyed on cachekeys;
# on later runs the cached file is returned without running the block.
def with_cache(cachekeys)
  cachekey = cachekeys.join('-').gsub(/[^-a-z0-9]+/i, '_')
  cachefile = File.join(CACHEDIR, cachekey + '.html')
  if File.exist?(cachefile)
    File.read(cachefile)
  else
    body = yield
    File.write(cachefile, body)
    body
  end
end
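# Example (illustrative): the first call below performs the HTTP request and
# writes fishbase-cache/demo-key.html; later calls with the same keys read
# that file instead of hitting the network.
#   with_cache(['demo', 'key']) { Net::HTTP.get(URI('https://example.org/')) }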
def http_post_form(uri, params, cachekeys)
  with_cache(cachekeys) do
    sleep 0.5
    Net::HTTP.post_form(uri, params).body
  end
end

def http_get(uri, cachekeys)
  with_cache(cachekeys) do
    sleep 0.5
    Net::HTTP.get_response(uri).body
  end
end
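# Both helpers sleep half a second before each uncached request, as a small
# courtesy rate limit; cached responses come straight from CACHEDIR.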
def find_fishbase(s, baseurl = 'https://www.fishbase.se/', idkey = 'fishbase_id')
  result = {}
  genus, species = s.split(/\s+/, 2).map(&:strip)
  # first get the species id
  search_uri = URI.join(baseurl, '/Nomenclature/ScientificNameSearchList.php')
  body = http_post_form(search_uri, {
    'crit1_fieldname': 'SYNONYMS.SynGenus',
    'crit1_fieldtype': 'CHAR',
    'crit2_fieldname': 'SYNONYMS.SynSpecies',
    'crit2_fieldtype': 'CHAR',
    'crit1_operator': 'EQUAL',
    'crit1_value': genus,
    'crit2_operator': 'EQUAL',
    'crit2_value': species
  }, [idkey, 'lookup', genus, species])
  if body =~ /SpeciesSummary.php\?ID=(\d+)\b/
    result[idkey] = id = $1.to_i
    # then get the common names
    body = http_get(URI.join(baseurl, "/ComNames/CommonNamesList.php?ID=#{id}"), [idkey, 'names', id])
    doc = Nokogiri::HTML.parse(body)
    if cell = doc.at_css('td:contains("English")')
      result['label_en'] = cell.ancestors('tr').first.css('td:first-child').text.strip
    end
    if cell = doc.at_css('td:contains("Dutch")')
      result['label_nl'] = cell.ancestors('tr').first.css('td:first-child').text.strip
    end
  elsif idkey != 'sealifebase_id'
    # not found on FishBase: retry the same lookup on SeaLifeBase
    result = find_fishbase(s, 'https://www.sealifebase.se/', 'sealifebase_id')
  end
  result
end
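# Example (illustrative): find_fishbase('gadus morhua') searches with genus
# 'gadus' and species 'morhua' and, if FishBase knows the species, returns
# something like
#   { 'fishbase_id' => 69, 'label_en' => 'Atlantic cod', 'label_nl' => 'Kabeljauw' }
# (the exact id and names come from the live FishBase pages).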
def process_row(row, out)
  taxocode = row['TAXOCODE']
  alpha = row['3A_CODE']
  species = row['Scientific_name']
  family = row['Family']
  order = row['Order']&.capitalize
  name_en = row['English_name']
  w_name_en = nil
  w_name_nl = nil
  fishbase_id = nil
  sealifebase_id = nil
  if wds = find_fishbase(species.strip.downcase)
    w_name_en = wds['label_en']
    w_name_nl = wds['label_nl']
    fishbase_id = wds['fishbase_id']
    sealifebase_id = wds['sealifebase_id']
    # drop common names that merely repeat the scientific name
    w_name_en = nil if w_name_en&.downcase == species.downcase
    w_name_nl = nil if w_name_nl&.downcase == species.downcase
  end
  out << [alpha, taxocode, fishbase_id, sealifebase_id, name_en, w_name_en, w_name_nl, species, family, order]
end
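# The CSV has no header row; if you want one, you could emit it once before
# the loop below, with names matching the columns above, e.g.:
#   out << %w[3A_CODE TAXOCODE fishbase_id sealifebase_id English_name
#             name_en name_nl Scientific_name Family Order]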
row_count = fao.sheet(0).count
fao.sheet(0).parse(headers: true).each_with_index do |row, i|
  next if i == 0  # with headers: true, Roo yields the header row itself first
  #next unless row['English_name']
  STDERR.puts "Row #{i} of #{row_count}" if i % 20 == 0
  process_row(row, out)
end
#!/usr/bin/env ruby
#
# We start with the list at:
# http://www.fao.org/fishery/collection/asfis
# Save the spreadsheet as ASFIS_sp_2019.xlsx and run this script,
# which queries Wikidata for the English and Dutch (NL) names and the
# FishBase id. Data is stored in asfis_wikidata.csv.
#
# Note that the Wikidata SPARQL endpoint can sometimes time out, so
# I'd go for the FishBase approach instead.
#
# More sources:
# https://www.fishbase.org/
# http://www.fao.org/fishery/species/search
#
require 'csv'
require 'roo'
require 'sparql/client'
FAO_SHEET = 'ASFIS_sp_2019.xlsx'
BATCH_SIZE = 35
OUT_CSV = 'asfis_wikidata.csv'
def query(sparql, terms)
  result = sparql.query(<<-EOS)
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX wikibase: <http://wikiba.se/ontology#>
    SELECT ?item ?taxon_name ?label_en ?label_nl ?fishbase_id
    WHERE
    {
      ?item wdt:P225 ?taxon_name .
      OPTIONAL { ?item wdt:P938 ?fishbase_id . }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en". ?item rdfs:label ?label_en }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "nl". ?item rdfs:label ?label_nl. hint:Prior hint:runLast false. }
      FILTER(?taxon_name IN (#{terms.map {|t| '"' + t + '"' }.join(',')}))
    }
  EOS
  result.each_solution do |q|
    yield(q)
  end
end
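# Example (illustrative): query(sparql, ['Gadus morhua']) { |q| p q['taxon_name'] }
# yields one solution per matching item; when a label is missing in a language,
# the label service binds the label variable to the item's Q-id instead.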
sparql = SPARQL::Client.new('https://query.wikidata.org/sparql', headers: {'User-Agent': 'QMWikidataGatherer/0.0 (https://www.thequestionmark.org/)'})
out = CSV.open(OUT_CSV, 'w')
fao = Roo::Spreadsheet.open(FAO_SHEET)
batch = []
def process_batch(sparql, batch, out)
  all_species = batch.map {|b| b['Scientific_name'] }.uniq
  wd_by_species = {}
  query(sparql, all_species) do |r|
    wd_by_species[r['taxon_name'].to_s] = {
      'label_en' => r['label_en'].to_s.strip,
      'label_nl' => r['label_nl'].to_s.strip,
      # keep the id nil (not 0) when Wikidata has no FishBase id
      'fishbase_id' => r['fishbase_id'] && r['fishbase_id'].to_s.to_i
    }
  end
  batch.each do |row|
    taxocode = row['TAXOCODE']
    alpha = row['3A_CODE']
    species = row['Scientific_name']
    family = row['Family']
    order = row['Order']&.capitalize
    name_en = row['English_name']
    w_name_en = nil
    w_name_nl = nil
    fishbase_id = nil
    if wds = wd_by_species[species]
      w_name_en = wds['label_en']
      w_name_nl = wds['label_nl']
      fishbase_id = wds['fishbase_id']
      # drop labels that merely repeat the scientific name
      w_name_en = nil if w_name_en&.downcase == species.downcase
      w_name_nl = nil if w_name_nl&.downcase == species.downcase
    end
    out << [alpha, taxocode, fishbase_id, name_en, w_name_en, w_name_nl, species, family, order]
  end
end
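# Note: lookups are keyed on the exact ASFIS Scientific_name string, so it
# must match Wikidata's P225 taxon name exactly (same case and spacing).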
row_count = fao.sheet(0).count
fao.sheet(0).parse(headers: true).each_with_index do |row, i|
  next if i == 0  # skip the header row that Roo yields first
  batch << row
  if batch.length >= BATCH_SIZE
    STDERR.puts "Row #{i} of #{row_count}"
    process_batch(sparql, batch, out)
    batch = []
    sleep 1  # small pause between SPARQL queries
  end
end
process_batch(sparql, batch, out)  # flush the final partial batch