Created
December 5, 2016 10:57
-
-
Save yoursdearboy/132a56e79869f7e5e42de8bb93f6eeee to your computer and use it in GitHub Desktop.
Class and script to generate JSON/CSV from the ICF classification
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# The input is the HTML page saved from http://apps.who.int/classifications/icfbrowser/Default.aspx
# with the whole tree unfolded.
# The page uses Ajax to fetch nested entries, so unfold each node manually or use the JS console to auto-click them.
require 'optparse' | |
require 'nokogiri' | |
require 'json' | |
require 'csv' | |
# One node of the classification tree parsed from the browser's HTML.
#
# Members:
#   id       - sequential Integer assigned while parsing (parents get their id
#              before their children)
#   code     - leading code of the node label (e.g. "b110"), or nil when the
#              label does not start with one
#   title    - label text after the code (the whole label when there is no code)
#   children - Array of ICFEntry built from the <div> that directly follows the
#              node's <table>, or nil when no such <div> exists
class ICFEntry < Struct.new(:id, :code, :title, :children)
  # A label is "<code> <title>", where the code is one non-space character
  # followed by optional digits. The pattern is anchored to the start of the
  # label so that a label without a code is not split somewhere in the middle.
  LABEL_PATTERN = /\A(\S\d*)\s(.*)/

  class << self
    # Builds the tree from the element that contains the tree markup.
    #
    # src - object responding to `css` (a Nokogiri node in the script below).
    #
    # Returns the first top-level ICFEntry, or nil when no <table> was found.
    def make(src)
      make_node(src.css('> div')).first
    end

    private

    # Returns the next sequential id (1, 2, 3, ...). The counter lives on the
    # class and keeps counting across calls to `make`.
    def gen_id
      @last_id = (@last_id || 0) + 1
    end

    # Turns every <table> child of `node` into an ICFEntry. A <div> that is the
    # very next child after a <table> holds that entry's children.
    def make_node(node)
      siblings = node.children
      tables = siblings.each_with_index.select { |child, _| child.name == 'table' }
      tables.map do |tbl, position|
        id = gen_id # Assigned before recursing so ids follow document order
        code, title = split_label(tbl.text.strip)
        follower = siblings[position + 1]
        children = make_node(follower) if follower && follower.name == 'div'
        ICFEntry.new(id, code, title, children)
      end
    end

    # Splits a label into [code, title]; code is nil when the label has none.
    def split_label(label)
      matched = LABEL_PATTERN.match(label)
      matched ? matched.captures : [nil, label]
    end
  end

  # Full nested representation: {code:, title:} plus :children when present.
  def to_h
    out = { code: code, title: title }
    out[:children] = children.map(&:to_h) unless children.nil?
    out
  end

  # Short nested representation: a leaf is just its "code title" String, a
  # node with children is {title: "code title", children: [...]}.
  def to_sh
    return code_title if children.nil?

    { title: code_title, children: children.map(&:to_sh) }
  end

  # Flattens the subtree depth-first into rows of [code, title, parent id].
  #
  # parent - the ICFEntry this node hangs under (nil for the root)
  # acc    - Array the rows are appended to
  #
  # Returns acc.
  def to_arr(parent = nil, acc = [])
    acc << [code, title, parent && parent.id]
    (children || []).each { |child| child.to_arr(self, acc) }
    acc
  end

  # "code title", or just the title when the entry has no code.
  def code_title
    [code, title].compact.join(' ')
  end

  # CSV text with a header row and one row per entry of the subtree.
  # NOTE(review): the "parent" column holds the parent's id, but no "id" column
  # is written, so rows cannot be joined back to their parent from the CSV
  # alone. Left as is to keep the output format stable.
  def to_csv
    CSV.generate(write_headers: true, headers: %w[code title parent]) do |csv|
      to_arr.each { |row| csv << row }
    end
  end

  # JSON text of the subtree.
  #
  # short - pass `true` for the short representation (see to_sh). Any other
  #         value gives the full one: the JSON generator calls
  #         `to_json(state)` on nested objects, and that state object must not
  #         be mistaken for the flag.
  def to_json(short = false, *)
    (short == true ? to_sh : to_h).to_json
  end
end
# Command-line entry point: parses the options, reads the saved HTML page,
# builds the ICFEntry tree and prints it to stdout as CSV or JSON.
options = { short: false }
options_parser = OptionParser.new do |opts|
  opts.banner = "Usage: convert.rb [options] html-source-file"
  opts.on("-t [TO]", "--to [TO]", [:csv, :json], "Output format") do |v|
    options[:to] = v
  end
  opts.on("-s", "--short", "Use short json representation") do
    options[:short] = true
  end
end

begin
  options_parser.parse!
rescue OptionParser::ParseError => e
  # E.g. an unsupported --to value: report it with the usage text instead of a backtrace.
  abort "#{e.message}\n#{options_parser}"
end

filename = ARGV.pop
# Without an input file or an output format there is nothing to do.
unless filename && options[:to]
  puts options_parser
  exit(1)
end

TREE_ID = "ctl00_ContentPlaceHolder1_tree"

# File.open with a block closes the handle and, unlike Kernel#open on older
# Rubies, never treats a leading "|" in the argument as a command to run.
begin
  src = File.open(filename) { |file| Nokogiri::HTML(file) }
rescue SystemCallError => e
  abort "Cannot read #{filename}: #{e.message}"
end

tree = src.css("##{TREE_ID}").first
abort "Element ##{TREE_ID} not found in #{filename}" if tree.nil?

dict = ICFEntry.make(tree)
abort "No entries found under ##{TREE_ID} in #{filename}" if dict.nil?

case options[:to]
when :csv
  puts dict.to_csv
when :json
  puts dict.to_json(options[:short])
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment