Last active
March 6, 2022 09:13
-
-
Save amake/8987426c54ad79b94619a5f3aa350837 to your computer and use it in GitHub Desktop.
Unravel IDS data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# unravel.rb
#
# Expand entries in ids.txt
# (https://github.com/cjkvi/cjkvi-ids/blob/master/ids.txt) to make fully
# "unraveled" decompositions.
#
# Usage: ruby unravel.rb ids.txt
#
# Note that some unravelings are not "valid", in that they contain subcomponents
# from different regions and thus represent a decomposition not used in any
# region. These are marked with [🙅]. Other bracketed annotations follow the
# convention of ids.txt.
# Read an ids.txt-style stream into a lookup table keyed by character.
#
# Each data line is split on whitespace into a codepoint, the character itself
# and zero or more decomposition tokens; every token is turned into a Decomp.
# Lines starting with '#' are skipped, as are lines that carry no character
# field (blank lines, or a lone codepoint) so they cannot create a nil key.
#
# @param io [IO] anything responding to #each_line (File, StringIO, ARGF, ...)
# @return [Hash] character => [codepoint, Array<Decomp>]
def load_db(io)
  db = {}
  io.each_line do |line|
    next if line.start_with?('#')

    codepoint, char, *decomps = line.split
    # A blank or truncated line yields no character field; storing it would
    # add a bogus db[nil] entry that later shows up in the output.
    next unless char

    db[char] = [codepoint, decomps.map { |d| Decomp.parse(d, represents: char) }]
  end
  db
end
# Unravel every decomposition in the database in place, then replace each
# entry's decomposition list with its fully expanded alternatives.
#
# @param db [Hash] character => [codepoint, Array<Decomp>], as built by load_db
# @return [Hash] the same hash, with expanded decomposition lists
def unravel!(db)
  # Keep sweeping over every decomposition until a full pass changes nothing.
  loop do
    every_decomp = db.values.flat_map(&:last)
    # map (rather than any? with a block) so that each decomposition is
    # visited on every pass, even after a change has already been seen.
    changed = every_decomp.map { |decomp| unravel_one!(db, decomp) }.any?
    break unless changed
  end

  # Expansions that mix regions are kept on purpose; they are flagged when
  # rendered rather than filtered out here.
  db.transform_values! do |(codepoint, decomps)|
    [codepoint, decomps.flat_map(&:expand)]
  end
end
# Replace, in place, each component of +decomp+ that has its own entry in the
# database with that entry's decomposition(s).
#
# A component with exactly one decomposition is replaced by that Decomp; a
# component with several is replaced by the Array of alternatives. Components
# that are absent from the database, that only decompose to themselves, or
# whose entry lists no decompositions at all are left untouched.
#
# @param db [Hash] character => [codepoint, Array<Decomp>]
# @param decomp [Decomp] decomposition whose #chars array is mutated
# @return [Boolean] modified or not
def unravel_one!(db, decomp)
  modified = false
  decomp.chars.map! do |c|
    # Elements substituted on an earlier pass are Decomps or Arrays; only
    # plain characters can be database keys, so skip the lookup for the rest.
    next c unless c.is_a?(String) && db.key?(c) && !identity_decomp?(db, c)

    sub_decomps = db[c].last
    # Substituting an empty list of alternatives would make the enclosing
    # decomposition expand to nothing; keep such a character as a leaf.
    next c if sub_decomps.empty?

    modified = true
    sub_decomps.length == 1 ? sub_decomps.first : sub_decomps
  end
  modified
end
# Whether +c+ is listed in the database with exactly one decomposition that
# consists solely of +c+ itself (i.e. it cannot be broken down further).
#
# @param db [Hash] character => [codepoint, Array<Decomp>]
# @param c [String] character to look up
# @return [Boolean] false when +c+ has no entry in +db+
def identity_decomp?(db, c)
  _codepoint, decomps = db[c]
  # decomps is nil when c is not in the database; treat that as "not identity"
  # instead of raising NoMethodError.
  return false unless decomps&.length == 1

  decomps.first.chars == [c]
end
# One decomposition alternative of a character.
#
# +chars+ starts out as an array of single-character strings. While the
# database is being unraveled, elements may be replaced in place by a nested
# Decomp (the component has a single decomposition) or by an Array of Decomps
# (the component has several alternatives). A Decomp without any Array
# anywhere in its tree is called "expanded".
class Decomp
  class << self
    # Build a Decomp from one whitespace-free token, e.g. "abc[GT]".
    #
    # @param str [String] decomposition, optionally followed by bracketed A-Z tags
    # @param represents [String] the character being decomposed
    # @return [Decomp]
    def parse(str, represents:)
      # With the regexp literal on the left of =~, the named groups are
      # assigned to the local variables `decomp` and `tags`.
      /^(?<decomp>.*?)(?:\[(?<tags>[A-Z]+)\])?$/ =~ str
      new(decomp.chars, tags&.chars, represents)
    end
  end

  # @return [Array] components: Strings, nested Decomps, or Arrays of Decomps
  attr_reader :chars
  # @return [Array,nil] tag letters attached to this decomposition, if any
  attr_reader :tags
  # @return [String] the character this decomposition stands for
  attr_reader :represents

  # @param chars [Array]
  # @param tags [Array,nil]
  # @param represents [String]
  # @raise [RuntimeError] if +tags+ is given but empty
  def initialize(chars, tags, represents)
    raise 'Tags cannot be empty' if tags&.empty?

    @chars = chars # Don't freeze: unraveling substitutes elements in place
    @tags = tags.freeze
    @represents = represents.freeze
  end

  # Debug rendering; safe to call on unexpanded decompositions as well.
  #
  # @return [String]
  def inspect
    to_s(debug: true)
  end

  # Render the decomposition as text.
  #
  # Nested Decomps are rendered without their own tag suffix. When
  # +include_tags+ is true and the decomposition is expanded, the effective
  # tags are appended in brackets, or 🙅 if no tag is common to the whole tree.
  # In debug mode each Decomp is wrapped in braces, its own tags are shown in
  # parentheses, and the effective tags use parentheses instead of brackets.
  #
  # @param include_tags [Boolean] append the effective-tag suffix
  # @param debug [Boolean] use the verbose debug format
  # @return [String]
  def to_s(include_tags = true, debug: false)
    parts = chars.map { |c| c.is_a?(Decomp) ? c.to_s(false, debug: debug) : c }
    parts << "(#{tags.join})" if tags && debug
    # Effective tags are only defined for expanded decompositions; skipping
    # them otherwise keeps #inspect/#to_s from raising on intermediate state.
    if include_tags && expanded?
      eff_tags = effective_tags
      if eff_tags.is_a?(Array)
        tag_part = eff_tags.empty? ? '🙅' : eff_tags.join
        parts << (debug ? "(#{tag_part})" : "[#{tag_part}]")
      end
    end
    s = parts.join
    debug ? "{#{s}}" : s
  end

  # Structural equality: same class, components, tags and represented character.
  def ==(other)
    other.instance_of?(Decomp) &&
      chars == other.chars && tags == other.tags &&
      represents == other.represents
  end

  # @return [Boolean] whether this decomposition is "expanded", meaning it
  #   represents a single decomposition with no "branches"
  def expanded?
    chars.none? { |c| c.is_a?(Array) || (c.is_a?(Decomp) && !c.expanded?) }
  end

  # Enumerate every branch-free decomposition this one stands for.
  #
  # @return [Array<Decomp>] list of expanded decompositions
  def expand
    return [self] if expanded?

    # Each entry of `results` is the component list of one partial expansion.
    results = [[]]
    chars.each do |c|
      case c
      when Array
        # Alternatives: expand each one, then pair every expansion with every
        # partial result built so far.
        results = c.flat_map { |c_| c_.is_a?(Decomp) ? c_.expand : c_ }
                   .flat_map { |c_| results.map { |r| r + [c_] } }
      when Decomp
        results = c.expand.flat_map { |c_| results.map { |r| r + [c_] } }
      else
        results.each { |r| r << c }
      end
    end
    # Invalid (region-mixing) expansions are kept; they are flagged by #to_s.
    results.flat_map { |r| Decomp.new(r, tags, represents).expand }
  end

  # Intersect the tags of this decomposition and of every nested Decomp.
  #
  # @param acc [Hash] accumulator shared across the recursion
  # @return [Array,nil] nil means untagged; empty array means no valid set of
  #   tags can apply (this decomposition is invalid)
  # @raise [RuntimeError] if the decomposition is not expanded
  def effective_tags(acc = { result: nil })
    raise 'Effective tags are only defined for expanded decompositions' unless expanded?

    if tags
      # The first tagged node seeds the set; every later one narrows it.
      acc[:result] ||= tags.dup
      acc[:result].select! { |t| tags.include?(t) }
    end
    chars.each { |c| c.effective_tags(acc) if c.is_a?(Decomp) }
    acc[:result]
  end

  # @return [Boolean] whether this decomposition represents a consistent set of
  #   regional variations
  def valid?
    eff_tags = effective_tags
    eff_tags.nil? || eff_tags.any?
  end
end
# Script entry point: read the IDS data named on the command line (or stdin),
# unravel it, and print one tab-separated line per character:
# codepoint, character, then each expanded decomposition.
database = load_db(ARGF)
unravel!(database)
database.each do |char, (codepoint, decomps)|
  rendered = decomps.map { |decomp| decomp.to_s(debug: false) }
  # The nested array is flattened by join using the same tab separator.
  puts([codepoint, char, rendered].join("\t"))
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment