Created
May 9, 2021 13:01
-
-
Save zverok/c574b7a9c42cc17bdc2aa396e3edd21a to your computer and use it in GitHub Desktop.
"Unmunch" (linearize) word list from Hunspell dictionary with the help of Spylls library
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is "unmunching" script for Hunspell dictionaries, based on Spylls (full Python port of Hunspell): | |
# https://github.com/zverok/spylls | |
# | |
# "Unmunching" (Hunspell's term) is a process of turning affix-compressed dictionary into plain list | |
# of all language's words. E.g. for English, in the dictionary we have "spell/JSMDRZG" (stem + flags | |
# declaring what suffixes and prefixes it might have), and we can run this script: | |
# | |
# python unmunch.py path/to/en_US spell | |
# | |
# Which will produce this list: | |
# | |
# spell | |
# spell's | |
# spelled | |
# speller | |
# spellers | |
# spelling | |
# spellings | |
# spells | |
# | |
# Running without second argument will unmunch the entire dictionary. | |
# | |
# WARNINGS! | |
# | |
# 1. The script is not extensively tested, just a demo for discussion in https://github.com/zverok/spylls/issues/10 | |
# (see the issue for more discussion) | |
# 2. It doesn't try to produce all possible words for compounding, because the list is potentiall infinite. | |
# | |
import sys | |
from spylls.hunspell.dictionary import Dictionary | |
dictionary = Dictionary.from_files(sys.argv[1]) | |
def unmunch(word, aff): | |
result = set() | |
if aff.FORBIDDENWORD and aff.FORBIDDENWORD in word.flags: | |
return result | |
if not (aff.NEEDAFFIX and aff.NEEDAFFIX in word.flags): | |
result.add(word.stem) | |
suffixes = [ | |
suffix | |
for flag in word.flags | |
for suffix in aff.SFX.get(flag, []) | |
if suffix.cond_regexp.search(word.stem) | |
] | |
prefixes = [ | |
prefix | |
for flag in word.flags | |
for prefix in aff.PFX.get(flag, []) | |
if prefix.cond_regexp.search(word.stem) | |
] | |
for suffix in suffixes: | |
root = word.stem[0:-len(suffix.strip)] if suffix.strip else word.stem | |
suffixed = root + suffix.add | |
if not (aff.NEEDAFFIX and aff.NEEDAFFIX in suffix.flags): | |
result.add(suffixed) | |
secondary_suffixes = [ | |
suffix2 | |
for flag in suffix.flags | |
for suffix2 in aff.SFX.get(flag, []) | |
if suffix2.cond_regexp.search(suffixed) | |
] | |
for suffix2 in secondary_suffixes: | |
root = suffixed[0:-len(suffix2.strip)] if suffix2.strip else suffixed | |
result.add(root + suffix2.add) | |
for prefix in prefixes: | |
root = word.stem[len(prefix.strip):] | |
prefixed = prefix.add + root | |
if not (aff.NEEDAFFIX and aff.NEEDAFFIX in prefix.flags): | |
result.add(prefixed) | |
if prefix.crossproduct: | |
additional_suffixes = [ | |
suffix | |
for flag in prefix.flags | |
for suffix in aff.SFX.get(flag, []) | |
if suffix.crossproduct and not suffix in suffixes and suffix.cond_regexp.search(prefixed) | |
] | |
for suffix in suffixes + additional_suffixes: | |
root = prefixed[0:-len(suffix.strip)] if suffix.strip else prefixed | |
suffixed = root + suffix.add | |
result.add(suffixed) | |
secondary_suffixes = [ | |
suffix2 | |
for flag in suffix.flags | |
for suffix2 in aff.SFX.get(flag, []) | |
if suffix2.crossproduct and suffix2.cond_regexp.search(suffixed) | |
] | |
for suffix2 in secondary_suffixes: | |
root = suffixed[0:-len(suffix2.strip)] if suffix2.strip else suffixed | |
result.add(root + suffix2.add) | |
return result | |
result = set() | |
if len(sys.argv) > 2: | |
lookup = sys.argv[2] | |
print(f"Unmunching only words with stem {lookup}") | |
else: | |
lookup = None | |
print(f"Unmunching the whole dictionary") | |
print('') | |
for word in dictionary.dic.words: | |
if not lookup or word.stem == lookup: | |
if lookup: | |
print(f"Unmunching {word}") | |
result.update(unmunch(word, dictionary.aff)) | |
print('') | |
for word in sorted(result): | |
print(word) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment