Last active
March 3, 2020 12:27
-
-
Save wvengen/ea4b810261aa59b65a481f358b3f6cc8 to your computer and use it in GitHub Desktop.
Enriching ASFIS with Wikidata or FishBase
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# | |
# We start with the list at: | |
# http://www.fao.org/fishery/collection/asfis | |
# Then we query FishBase (and SeaLifeBase) for the ID and NL name. | |
# | |
# Before running this script, make sure you have the ASFIS spreadsheet, | |
# and make sure you've created the CACHEDIR directory (which stores all | |
# requests, so you can run this again without having to wait for the requests). | |
# | |
# More sources: | |
# http://www.fishbase.org/ | |
# http://www.sealifebase.org/ | |
# http://www.fao.org/fishery/species/search | |
# | |
require 'net/http' | |
require 'nokogiri' | |
require 'roo' | |
FAO_SHEET = 'ASFIS_sp_2019.xlsx' | |
CACHEDIR = './fishbase-cache' | |
out = CSV.open('asfis_fishbase.csv', 'w') | |
fao = Roo::Spreadsheet.open(FAO_SHEET) | |
batch = [] | |
def with_cache(cachekeys) | |
cachekey = cachekeys.join('-').gsub(/[^-a-z0-9]+/i, '_') | |
cachefile = File.join(CACHEDIR, cachekey + '.html') | |
if File.exists?(cachefile) | |
File.read(cachefile) | |
else | |
body = yield | |
File.write(cachefile, body) | |
body | |
end | |
end | |
def http_post_form(uri, params, cachekeys) | |
with_cache(cachekeys) do | |
sleep 0.5 | |
Net::HTTP.post_form(uri, params).body | |
end | |
end | |
def http_get(uri, cachekeys) | |
with_cache(cachekeys) do | |
sleep 0.5 | |
Net::HTTP.get_response(uri).body | |
end | |
end | |
def find_fishbase(s, baseurl = 'https://www.fishbase.se/', idkey='fishbase_id') | |
result = {} | |
genus, species = s.split(/\s+/, 2).map(&:strip) | |
# first get id | |
search_uri = URI::join(baseurl, '/Nomenclature/ScientificNameSearchList.php') | |
body = http_post_form(search_uri, { | |
'crit1_fieldname': 'SYNONYMS.SynGenus', | |
'crit1_fieldtype': 'CHAR', | |
'crit2_fieldname': 'SYNONYMS.SynSpecies', | |
'crit2_fieldtype': 'CHAR', | |
'crit1_operator': 'EQUAL', | |
'crit1_value': genus, | |
'crit2_operator': 'EQUAL', | |
'crit2_value': species | |
}, [idkey, 'lookup', genus, species]) | |
if body =~ /SpeciesSummary.php\?ID=(\d+)\b/ | |
result[idkey] = id = $1.to_i | |
# then get common names | |
body = http_get(URI::join(baseurl, "/ComNames/CommonNamesList.php?ID=#{id}"), [idkey, 'names', id]) | |
doc = Nokogiri::HTML.parse(body) | |
if cell = doc.at_css('td:contains("English")') | |
result['label_en'] = cell.ancestors('tr').first.css('td:first-child').text.strip | |
end | |
if cell = doc.at_css('td:contains("Dutch")') | |
result['label_nl'] = cell.ancestors('tr').first.css('td:first-child').text.strip | |
end | |
elsif idkey != 'sealifebase_id' | |
result = find_fishbase(s, 'https://www.sealifebase.se/', 'sealifebase_id') | |
end | |
result | |
end | |
def process_row(row, out) | |
taxocode = row['TAXOCODE'] | |
alpha = row['3A_CODE'] | |
species = row['Scientific_name'] | |
family = row['Family'] | |
order = row['Order'].capitalize | |
name_en = row['English_name'] | |
w_name_en = nil | |
w_name_nl = nil | |
fishbase_id = nil | |
sealifebase_id = nil | |
if wds = find_fishbase(species.strip.downcase) | |
w_name_en = wds['label_en'] | |
w_name_nl = wds['label_nl'] | |
fishbase_id = wds['fishbase_id'] | |
sealifebase_id = wds['sealifebase_id'] | |
w_name_en = nil if w_name_en&.downcase == species.downcase | |
w_name_nl = nil if w_name_nl&.downcase == species.downcase | |
end | |
out << [alpha, taxocode, fishbase_id, sealifebase_id, name_en, w_name_en, w_name_nl, species, family, order] | |
end | |
row_count = fao.sheet(0).count | |
fao.sheet(0).parse(headers: true).each_with_index do |row, i| | |
next if i == 0 | |
#next unless row['English_name'] | |
STDERR.puts "Row #{i} of #{row_count}" if i%20 == 0 | |
process_row(row, out) | |
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# | |
# We start with the list at: | |
# http://www.fao.org/fishery/collection/asfis | |
# Save the spreadsheet as ASFIS_sp_2019.xlsx and run this script, | |
# which queries Wikidata for the Dutch (NL) name and fishbase_id. | |
# Data is stored in asfis_wikidata.csv. | |
# | |
# Note that the Wikidata SPARQL endpoint can sometimes timeout, so | |
# I'd go for the FishBase approach instead. | |
# | |
# More sources: | |
# https://www.fishbase.org/ | |
# http://www.fao.org/fishery/species/search | |
# | |
require 'roo' | |
require 'sparql/client' | |
FAO_SHEET = 'ASFIS_sp_2019.xlsx' | |
BATCH_SIZE = 35 | |
OUT_CSV = 'asfis_wikidata.csv' | |
def query(sparql, terms) | |
result = sparql.query(<<-EOS) | |
PREFIX wd: <http://www.wikidata.org/entity/> | |
PREFIX wdt: <http://www.wikidata.org/prop/direct/> | |
PREFIX wikibase: <http://wikiba.se/ontology#> | |
SELECT | |
?item | |
?taxon_name | |
?label_en | |
?label_nl | |
?fishbase_id | |
WHERE | |
{ | |
?item wdt:P225 ?taxon_name . | |
OPTIONAL { ?item wdt:P938 ?fishbase_id . } | |
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". ?item rdfs:label ?label_en } | |
SERVICE wikibase:label { bd:serviceParam wikibase:language "nl". ?item rdfs:label ?label_nl. hint:Prior hint:runLast false. } | |
FILTER(?taxon_name IN (#{terms.map {|t| '"' + t + '"'}.join(',')})) | |
} | |
EOS | |
result.each_solution do |q| | |
yield(q) | |
end | |
end | |
sparql = SPARQL::Client.new('https://query.wikidata.org/sparql', headers: {'User-Agent': 'QMWikidataGatherer/0.0 (https://www.thequestionmark.org/)'}) | |
out = CSV.open(OUT_CSV, 'w') | |
fao = Roo::Spreadsheet.open(FAO_SHEET) | |
batch = [] | |
def process_batch(sparql, batch, out) | |
all_species = batch.map {|b| b['Scientific_name'] }.uniq | |
wd_by_species = {} | |
query(sparql, all_species) do |r| | |
wd_by_species[r['taxon_name'].to_s] = { | |
'label_en' => r['label_en'].to_s&.strip, | |
'label_nl' => r['label_nl'].to_s&.strip, | |
'fishbase_id' => r['fishbase_id'].to_s&.to_i | |
} | |
end | |
batch.each do |row| | |
taxocode = row['TAXOCODE'] | |
alpha = row['3A_CODE'] | |
species = row['Scientific_name'] | |
family = row['Family'] | |
order = row['Order'].capitalize | |
name_en = row['English_name'] | |
w_name_en = nil | |
w_name_nl = nil | |
firebase_id = nil | |
if wds = wd_by_species[species] | |
w_name_en = wds['label_en'] | |
w_name_nl = wds['label_nl'] | |
fishbase_id = wds['fishbase_id'] | |
w_name_en = nil if w_name_en&.downcase == species.downcase | |
w_name_nl = nil if w_name_nl&.downcase == species.downcase | |
end | |
out << [alpha, taxocode, fishbase_id, name_en, w_name_en, w_name_nl, species, family, order] | |
end | |
end | |
row_count = fao.sheet(0).count | |
fao.sheet(0).parse(headers: true).each_with_index do |row, i| | |
next if i == 0 | |
batch << row | |
if batch.length > BATCH_SIZE | |
STDERR.puts "Row #{i} of #{row_count}" | |
process_batch(sparql, batch, out) | |
batch = [] | |
sleep 1 | |
end | |
end | |
process_batch(sparql, batch, out) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment