Created
October 25, 2016 12:24
-
-
Save dsager/8b9ba414e92778177bda515bf4af6127 to your computer and use it in GitHub Desktop.
world-bank theme classifier
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# Script to load project data from the World Bank and generate a training set
# for a classifier.
#
# Get a list of projects from the World Bank and filter out projects without abstracts and themes:
#
#   curl -s 'http://search.worldbank.org/api/v2/projects?format=json&fl=project_name,project_abstract,theme_namecode&source=IBRD&rows=50000' \
#     | jq '[.projects[] | select(.project_abstract? and .theme_namecode?)] | map({"text": [.project_name, .project_abstract.cdata] | join(" - "), "themes": .theme_namecode | map(.code)})' \
#     > wb-projects.json
#
require 'yaml'
require 'json'
require 'csv'

# Input and output locations, relative to the directory the script is run from.
THEMES_FILE = 'config/data/wb_themes.yml'
PROJECTS_FILE = 'tmp/wb-projects-2.json'
TRAINING_SET_FILE = 'tmp/wb_themes_training_set.csv'

# Upper bound on the number of samples collected for a single sub-theme.
MAX_SAMPLES_PER_THEME = 300

# Builds the sub-theme lookup from the themes YAML file.
#
# The YAML is expected to map a top-level theme code to a hash with a 'name'
# and a 'children' hash (sub-theme code => hash with a 'name').
#
# @param path [String] path of the themes YAML file
# @return [Hash{Integer => Hash}] sub-theme code => { name:, full_name:, count:, samples: }
def load_themes(path)
  YAML.load_file(path).each_with_object({}) do |(code, theme), themes|
    # A top-level theme without sub-themes contributes nothing instead of
    # aborting the whole run with a NoMethodError on nil.
    (theme['children'] || {}).each do |sub_code, sub_theme|
      label = "[#{sub_code}] #{sub_theme['name']}"
      themes[sub_code.to_i] = {
        name: label,
        full_name: "[#{code}] #{theme['name']}/#{label}",
        count: 0,
        samples: []
      }
    end
  end
end

# Assigns each project's text to every known sub-theme it is tagged with,
# stopping once a sub-theme has reached +limit+ samples. Mutates +themes+.
#
# @param themes [Hash{Integer => Hash}] lookup produced by load_themes
# @param projects [Array<Hash>] entries with 'text' and 'themes' keys
# @param limit [Integer] maximum number of samples kept per sub-theme
# @return [void]
def collect_samples(themes, projects, limit: MAX_SAMPLES_PER_THEME)
  projects.each do |project|
    # Array() tolerates a project entry that has no 'themes' list.
    Array(project['themes']).each do |theme_code|
      theme = themes[theme_code.to_i]
      # Codes missing from the YAML and already-full themes are skipped.
      next unless theme && theme[:count] < limit

      theme[:count] += 1
      theme[:samples] << project['text']
    end
  end
end

# Prints one row per sub-theme, ordered by ascending sample count:
# padded name, padded count, and a bar with one dot per ten samples.
#
# @param themes [Hash{Integer => Hash}] lookup filled by collect_samples
# @return [void]
def print_sample_counts(themes)
  themes.sort_by { |_code, theme| theme[:count] }.each do |_code, theme|
    bar = '.' * (theme[:count] / 10) # integer division, no conversion needed
    puts "#{theme[:name].ljust(80)}| #{theme[:count].to_s.ljust(5)}| #{bar.ljust(32)}"
  end
end

# Writes one CSV row per collected sample: [sample text, full theme name].
#
# @param themes [Hash{Integer => Hash}] lookup filled by collect_samples
# @param path [String] destination CSV path (overwritten if present)
# @return [void]
def write_training_set(themes, path)
  CSV.open(path, 'wb') do |csv|
    themes.each_value do |theme|
      theme[:samples].each { |sample| csv << [sample, theme[:full_name]] }
    end
  end
end

themes = load_themes(THEMES_FILE)

# File.read closes the file before parsing; the previous File.open handle was
# never closed. JSON.parse is the strict parser for plain JSON documents.
wb_projects = JSON.parse(File.read(PROJECTS_FILE))
puts "available samples: #{wb_projects.count}"

collect_samples(themes, wb_projects)

puts 'samples per theme:'
print_sample_counts(themes)

puts 'writing training set to CSV file'
write_training_set(themes, TRAINING_SET_FILE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment