Last active
March 13, 2018 00:04
-
-
Save wareya/abce9ca092c516493aa36f774f2180b5 to your computer and use it in GitHub Desktop.
search for han characters based on a list of components
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!python | |
# coding=utf-8 | |
# Get ids.txt from https://github.com/cjkvi/cjkvi-ids/ and place it in the directory you run this script from (it is opened as "ids.txt" relative to the current working directory).
# ~requires python 3.6 or newer on windows~ | |
# note: depends on the accuracy of ids.txt. for some characters, like 祭, it's pretty bad. | |
# see also: http://www.chise.org/ids-find | |
# Lookup table filled in by the script's main section: each key is a single
# character and each value is the set of characters related to it (characters
# whose decomposition mentions the key, or, with --reverse, the characters
# appearing in the key's own decomposition).
contains = dict()
def is_descriptor(c):
    """Return True when *c* is a description operator rather than a component.

    The check is purely on the code point: anything in the inclusive range
    U+2FF0..U+2FFB counts as a descriptor, everything else does not.
    """
    return 0x2FF0 <= ord(c) <= 0x2FFB
import argparse, sys, re | |
def find_recursive(c, first=True):
    """Collect every character reachable from *c* through ``contains``.

    c     -- the character to start from.
    first -- True for the root of a lookup; the root itself is not part of
             the result.  With ``first=False`` the start character is
             included in the result.

    Returns a new set.  A character that has no entry in ``contains`` yields
    ``{c}``.  When ``args.norecurse`` is set only the direct entries of the
    root are returned (and a non-root call returns just ``{c}``).

    The walk is iterative and remembers what it has already collected, so a
    cyclic table (A -> B -> A) terminates instead of recursing until
    RecursionError, and shared sub-entries are expanded only once.
    """
    # Characters without an entry stand for themselves.
    if c not in contains:
        return {c}
    # A non-root node is only expanded when recursion is enabled.
    if not first and args.norecurse:
        return {c}

    found = set() if first else {c}
    pending = list(contains[c])
    while pending:
        node = pending.pop()
        if node in found:
            # Already collected (and expanded, if applicable); skipping it
            # here is what breaks cycles.
            continue
        found.add(node)
        # args is consulted lazily, only for nodes that could be expanded.
        if node in contains and not args.norecurse:
            pending.extend(contains[node])
    return found
def find_string(string):
    """Return the characters common to the lookups of every character in *string*.

    The input is stripped of surrounding whitespace, each remaining character
    is resolved with ``find_recursive``, and the resulting sets are
    intersected.

    Returns a new set.  An empty or whitespace-only query returns an empty
    set (previously this raised IndexError because there was no first set to
    start the intersection from).
    """
    query = string.strip()
    if not query:
        return set()
    per_char = [find_recursive(ch) for ch in query]
    # set.intersection with one or more sets always builds a fresh set.
    return set.intersection(*per_char)
def force_print(string):
    """Write *string* to standard output as raw UTF-8 bytes.

    The bytes go straight to ``sys.stdout.buffer``, so the text layer's own
    encoding is not involved.  No newline is appended.
    """
    payload = string.encode("utf-8")
    sys.stdout.buffer.write(payload)
if __name__ == "__main__":
    # Command line: one positional query string plus two boolean switches.
    cli = argparse.ArgumentParser(description="Ideographic Description Sequence tool")
    cli.add_argument("lookup_char", help="String of components to find in characters.")
    cli.add_argument("-r", "--reverse", dest="reverse", action="store_true", help="Decompose instead of compose.")
    cli.add_argument("-n", "--norecurse", dest="norecurse", action="store_true", help="First level of recursion only.")
    # Must remain a module-level name: find_recursive reads args.norecurse.
    args = cli.parse_args()

    bracketed = re.compile(r"\[[^\]]*\]")
    with open("ids.txt", encoding="utf-8") as table:
        for line in table:
            # Remove [...] annotations, then split the row on tabs.
            columns = bracketed.sub("", line).split("\t")
            if len(columns) < 3:
                continue
            whole = columns[1].strip()
            # Columns 2+ are each particular decompositions; their characters
            # are pooled together.
            for part in "".join(columns[2:]).strip():
                # Skip structure operators and self-references.
                if is_descriptor(part) or part == whole:
                    continue
                if args.reverse:
                    # Decompose: character -> its components.
                    key, member = whole, part
                else:
                    # Compose: component -> characters that use it.
                    key, member = part, whole
                contains.setdefault(key, set()).add(member)

    force_print("\n".join(sorted(find_string(args.lookup_char))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment