Last active
February 21, 2024 11:47
-
-
Save lynn/07f7ce3c314223d2aca19ec2bb0540cd to your computer and use it in GitHub Desktop.
Inject rules into the rikaichan/rikaikun `deinflect.dat` file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# usage: python patch-deinflect.py
# (reads deinflect.dat in same folder, writes to new-deinflect.dat)
import fileinput | |
# Bitmask values for the `mask` field of a rule.
#
# The CONJ_* flags live in the low byte of the mask; each BASE_* flag is the
# matching CONJ_* flag shifted left by 8 bits, so it lives in the high byte.
CONJ_RU_VERB = 0x0001
CONJ_U_VERB = 0x0002
CONJ_I_ADJ = 0x0004
CONJ_KURU = 0x0008
CONJ_SURU = 0x0010
CONJ_OTHER = 0x0080

BASE_RU_VERB = 0x0100
BASE_U_VERB = 0x0200
BASE_I_ADJ = 0x0400
BASE_KURU = 0x0800
BASE_SURU = 0x1000
class Rule:
    """A rule in a rikaichan deinflect rules file.

    A rule corresponds to one line of the file made of four tab-separated
    fields: ``conj``, ``base``, ``mask`` and ``form_index``.
    """

    def __init__(self, conj, base, mask, form_index):
        """Create a rule from the four fields of a rule line.

        ``mask`` and ``form_index`` may be passed as strings (as read from
        the file) or as integers; both are stored as ``int``.
        """
        # NOTE(review): presumably `conj` is the inflected ending and `base`
        # the ending it maps back to -- confirm against the file format.
        self.conj = conj
        self.base = base
        self.mask = int(mask)
        self.form_index = int(form_index)

    def __repr__(self):
        """Return a constructor-like representation for debugging."""
        return 'Rule({!r}, {!r}, {!r}, {!r})'.format(
            self.conj, self.base, self.mask, self.form_index)

    def order(self):
        """Return a value that acts as a key for ordering rules.

        Rules with a longer ``conj`` sort first; rules whose ``conj`` has the
        same length are ordered by the ``conj`` string itself.
        """
        return (-len(self.conj), self.conj)

    def build(self):
        """Build the tab-separated file line for this rule."""
        return '{}\t{}\t{}\t{}'.format(
            self.conj, self.base, self.mask, self.form_index)
class Deinflect:
    """The contents of a rikaichan deinflect rules file.

    Attributes:
        header: the line containing 'Deinflect Rules', or None if the input
            had no such line.
        forms: the single-field lines, in file order; a rule's
            ``form_index`` is compared against positions in this list.
        rules: the objects built from the four-field lines.
    """

    def __init__(self, line_iter):
        """Parse a deinflect rules file from an iterator of lines.

        Each line is stripped of surrounding whitespace and classified by its
        number of tab-separated fields.

        Raises:
            ValueError: if a line is neither the header, a single-field form
                line, nor a four-field rule line.
        """
        self.header = None
        self.forms = []
        self.rules = []
        for line_number, raw_line in enumerate(line_iter, 1):
            line = raw_line.strip()
            fields = line.split('\t')
            if 'Deinflect Rules' in line:
                self.header = line
            elif len(fields) == 1:
                self.forms.append(line)
            elif len(fields) == 4:
                self.rules.append(Rule(*fields))
            else:
                # Report where the problem is so a broken file can be fixed.
                raise ValueError(
                    'invalid deinflect line {}: {!r}'.format(line_number, line))

    def build(self):
        """Yield lines forming a deinflect rules file.

        The header (when one was parsed) comes first, then the forms in list
        order, then the rules sorted by their ``order()`` key.
        """
        # A missing header would otherwise be yielded as None and break any
        # caller that joins the lines into text.
        if self.header is not None:
            yield self.header
        yield from self.forms
        for rule in sorted(self.rules, key=lambda rule: rule.order()):
            yield rule.build()

    def form_index(self, form):
        """
        Return the index of a form in the forms list.

        If the given form is new, it is added to the list, and the new index
        is returned.
        """
        try:
            return self.forms.index(form)
        except ValueError:
            index = len(self.forms)
            self.forms.append(form)
            return index
if __name__ == '__main__':
    # Read the existing rules; the `with` block closes the input file once
    # parsing is done.
    with open('deinflect.dat', encoding='utf-8') as source:
        d = Deinflect(source)

    # Every rule already tagged with the '-te' form is used as a stem for the
    # helper endings below.
    te_index = d.form_index('-te')
    te_rules = [rule for rule in d.rules if rule.form_index == te_index]

    # (ending appended after the -te form, form name, CONJ_* flag of the ending)
    # https://en.wikipedia.org/wiki/Japanese_verb_conjugation#Usage_3
    te_helpers = [
        ('いる', 'progressive', CONJ_RU_VERB),
        ('る', 'progressive', CONJ_RU_VERB),
        ('おる', 'progressive', CONJ_U_VERB),
        ('おく', 'preparatory', CONJ_U_VERB),
        # ある conjugates as a u-verb (あった, あって), not as a ru-verb.
        ('ある', 'resultant', CONJ_U_VERB),
        ('しまう', '-te shimau', CONJ_U_VERB),
        ('みる', 'try', CONJ_RU_VERB),
        ('いく', 'go', CONJ_U_VERB),
        ('行く', 'go', CONJ_U_VERB),
        ('く', 'go', CONJ_U_VERB),
        ('くる', 'come', CONJ_KURU),
        ('来る', 'come', CONJ_KURU),
        ('ください', 'please do', CONJ_OTHER),
        ('はいけない', 'no good', CONJ_OTHER),
        ('もいい', "it's OK to", CONJ_OTHER),
        ('もよかった', "it was OK to", CONJ_OTHER),
        ('も良い', "it's OK to", CONJ_OTHER),
        ('も良かった', "it was OK to", CONJ_OTHER),
        ('もかまわない', "don't mind if", CONJ_I_ADJ),
        ('も構わない', "don't mind if", CONJ_I_ADJ),
        ('もかまいません', "don't mind if", CONJ_I_ADJ),
        ('も構いません', "don't mind if", CONJ_I_ADJ),
        ('ほしい', "I want you to", CONJ_I_ADJ),
        ('欲しい', "I want you to", CONJ_I_ADJ),
        ('すみません', 'sorry for', CONJ_OTHER),
        ('くれてありがとう', 'thanks for', CONJ_OTHER),
        ('くれる', 'favor to me', CONJ_RU_VERB),
        ('あげる', 'favor to other', CONJ_RU_VERB),
        ('もらう', 'receive favor', CONJ_U_VERB),
        ('いただく', 'receive favor', CONJ_U_VERB),
    ]

    for verb, form, bit in te_helpers:
        fi = d.form_index(form)
        # te_rules is a separate list, so appending to d.rules while looping
        # over it is safe and newly added rules are not themselves extended.
        for rule in te_rules:
            # Keep everything above the low byte of the -te rule's mask and
            # replace the low byte with the helper ending's CONJ_* flag.
            new_mask = (rule.mask & ~0xFF) | bit
            d.rules.append(Rule(rule.conj + verb, rule.base, new_mask, fi))

    # newline='\n' keeps the '\n' separators untranslated on every platform,
    # matching what a binary write of the UTF-8 encoded text would produce.
    with open('new-deinflect.dat', 'w', encoding='utf-8', newline='\n') as f:
        f.write('\n'.join(d.build()))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment