Created
October 27, 2014 09:51
-
-
Save ksomemo/102a1e9bb733972f3e88 to your computer and use it in GitHub Desktop.
形態素一覧から、辞書にマッチする単語を作成する(最長を優先)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| import pytest | |
| def extract_matching_word(keep_words): | |
| if len(keep_words) == 0: | |
| return None | |
| words = map(lambda (k,v): (k, ''.join(v), len(v)), keep_words.iteritems()) | |
| fullwords = filter(lambda v: v[0] == v[1], words) | |
| max_len = max(map(lambda v: v[2], fullwords)) | |
| max_len_words = filter(lambda v: v[2] == max_len, fullwords) | |
| if len(max_len_words) > 0: | |
| da = map(lambda v: v[0], max_len_words) | |
| if len(da) > 0: | |
| return da[0] | |
| return None | |
| def make_matching_words(morphemes_words, extract_dict): | |
| words = [] | |
| keep = {} | |
| for w in morphemes_words: | |
| if len(keep) == 0: | |
| # マッチしていない場合、辞書から見つける | |
| keep = {k: [w] for k, v in extract_dict.iteritems() if extract_dict[k][0] == w} | |
| else: | |
| # マッチ済みから探す | |
| match = False | |
| for k in keep: | |
| if len(extract_dict[k]) > len(keep[k]) and extract_dict[k][len(keep[k])] == w: | |
| # 合った | |
| keep[k].append(w) | |
| match = True | |
| if not match: | |
| # 今までの最高の単語数マッチ | |
| matching_word = extract_matching_word(keep) | |
| if matching_word: | |
| words.append(matching_word) | |
| keep = {k: [w] for k, v in extract_dict.iteritems() if extract_dict[k][0] == w} | |
| matching_word = extract_matching_word(keep) | |
| if matching_word: | |
| words.append(matching_word) | |
| return words | |
| class TestFunc(object): | |
| @pytest.mark.parametrize(("input", "expected"), [ | |
| (['b'], []), | |
| (['b', 'a'], ['a']), | |
| (['b', 'a', 'd'], ['a']), | |
| (['b', 'a', 'd', 'a'], ['a', 'a']), | |
| (['b', 'a', 'd', 'a', 'b'], ['a', 'ab']), | |
| (['b', 'a', 'd', 'a', 'b', 'c'], ['a', 'ab', 'c']), | |
| ]) | |
| def test_make_matching_words(self, input, expected): | |
| extract_dict = { | |
| 'c': ['c'], | |
| 'a': ['a'], | |
| 'ab': ['a', 'b'], | |
| } | |
| assert make_matching_words(input, extract_dict) == expected | |
| if __name__ == '__main__': | |
| pytest.main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment