Last active
April 6, 2017 22:12
-
-
Save rma2015/feee2ec1cc4450d91d50 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from enum import Enum | |
from itertools import chain | |
import re | |
import pprint | |
# Path of the input corpus; the script entry point opens it as UTF-16 text.
filepath = "chukchi_texts.txt"
def suffix(word, affix_recessive, affix_dominant="\0"):
    """Return True if *word* ends with either variant of an affix.

    Args:
        word: The word form to test.
        affix_recessive: First variant of the affix.
        affix_dominant: Optional second variant (presumably the
            vowel-harmony alternant -- confirm).  The default ``"\\0"``
            means "no second variant".

    Returns:
        bool: True when the word ends with one of the supplied variants.
    """
    if affix_dominant == "\0":
        # The NUL default is only a placeholder; it must never take part in
        # matching, otherwise a word ending in NUL would match any affix.
        return word.endswith(affix_recessive)
    return word.endswith(affix_recessive) or word.endswith(affix_dominant)
def prefix(word, affix_recessive, affix_dominant="\0"):
    """Return True if *word* starts with either variant of an affix.

    Args:
        word: The word form to test.
        affix_recessive: First variant of the affix.
        affix_dominant: Optional second variant (presumably the
            vowel-harmony alternant -- confirm).  The default ``"\\0"``
            means "no second variant".

    Returns:
        bool: True when the word starts with one of the supplied variants.
    """
    if affix_dominant == "\0":
        # The NUL default is only a placeholder; it must never take part in
        # matching, otherwise a word starting with NUL would match any affix.
        return word.startswith(affix_recessive)
    return word.startswith(affix_recessive) or word.startswith(affix_dominant)
def in_affix(word, affix_recessive, affix_dominant="\0"):
    """Return True if either variant of an affix occurs anywhere in *word*.

    Args:
        word: The word form to test.
        affix_recessive: First variant of the affix.
        affix_dominant: Optional second variant (presumably the
            vowel-harmony alternant -- confirm).  The default ``"\\0"``
            means "no second variant".

    Returns:
        bool: True when one of the supplied variants is a substring of the
        word.
    """
    if affix_dominant == "\0":
        # The NUL default is only a placeholder; it must never take part in
        # matching, otherwise a word containing NUL would match any affix.
        return affix_recessive in word
    return affix_recessive in word or affix_dominant in word
# Verb prefix inventory: one set per slot, listed left to right.
# segment_word strips at most one prefix per slot, in this order, and
# is_verb consults slot 0 only.
verb_prefixes = [
    {
        'mət', 'mən', 'mənʔ', 'ʔən', 'nʔ', 'mʔ', 'ne', 'na',
        'ɣe', 'ɣa', 't', 'm', 'n', 'q',
    },
    {'re', 'ra', 'r'},
    {'ine', 'ena', 'in', 'en'},
    {'r', 'n'},
]
# Verb suffix inventory: one set per slot, listed left to right.
# segment_word walks the slots from the last one back to the first,
# stripping at most one suffix per slot from the right edge of the word.
verb_suffixes = [
    {'et', 'at', 'ew', 'aw'},
    {'tku', 'tko'},
    {'ŋŋo', 'pɬətku'},
    {
        'ɣe', 'ɣi', 'nin', 'nen', 'ɬin', 'ɬen', 'qin', 'qen', 'jɣəm',
        'qenat', 'ninet', 'ɬinet', 'ɬenat', 'nenat', 'qinet',
        'rkəni', 'rkəne', 'ɣʔe', 'ɣʔa', 'ɣʔi', 'rk',
    },
    {
        'n', 't', 'ɣəm', 'ɣət', 'muri', 'more', 'turi', 'tore', 'k',
        'rkən',
    },
]
# Word-final endings that is_verb accepts as verbal.
# True  -> the word must also start with a prefix from verb_prefixes[0];
# False -> the ending alone is enough to call the word a verb.
verb_endings_require_prefix = {
    'k': True,
    'nin': False,
    'nen': False,
    'ɬinet': False,
    'ɬenat': False,
    'ɬin': False,
    'ɬen': False,
    'qin': True,
    'qen': True,
    'qinet': True,
    'qenat': True,
    'jɣəm': True,
    'ɣe': False,
    'ɣi': False,
    'ninet': False,
    'nenat': False,
    't': True,
    'ɣəm': True,
    'ɣət': True,
    'muri': True,
    'more': True,
    'turi': True,
    'tore': True,
    'ɣʔe': False,
    'ɣʔa': False,
    'rkən': False,
    'rkəne': False,
    'rkəni': False,
}
# Noun prefix inventory: one set per slot, listed left to right.
noun_prefixes = [
    {'ɣe', 'ɣa'},
    {'taŋ', 'mejŋ'},
]
# Noun suffix inventory: one set per slot, listed left to right.
# NOTE(review): 'jekwe', 'cəku' and 'cəko' appear both in slot 2 and in
# slot 3 -- confirm that the duplication is intended.
noun_suffixes = [
    {'ɬʔ', 'ɣərɣ'},
    {'cɣ'},
    {'cəku', 'cəko', 'jekwe', 'curm', 'corm'},
    {
        'n', 't', 'ŋə', 'te', 'ta', 'e', 'a', 'etə',
        'ɣtə', 'jpə', 'ɣəpə', 'epə', 'ɣjit', 'ɣjet',
        'nu', 'no', 'k', 'jikwi', 'jekwe', 'cəku', 'cəko',
    },
]
# Flatten the per-slot inventories into one set per category.
all_verb_prefixes = set().union(*verb_prefixes)
all_verb_suffixes = set().union(*verb_suffixes)
all_noun_prefixes = set().union(*noun_prefixes)
all_noun_suffixes = set().union(*noun_suffixes)

# Affixes that occur in the noun tables but in none of the verb tables;
# part_of_speech uses these to recognise nouns.
only_noun_prefixes = all_noun_prefixes - all_verb_prefixes
only_noun_suffixes = all_noun_suffixes - all_verb_suffixes

# Every prefix / suffix known to the script, regardless of word class.
all_prefixes = all_verb_prefixes | all_noun_prefixes
all_suffixes = all_verb_suffixes | all_noun_suffixes
# Gloss labels for verb prefixes: one dict per prefix slot, in the same
# slot order as verb_prefixes.
# NOTE(review): 'ɣe' and 'ɣa' are listed in verb_prefixes[0] but have no
# entry here, so they are glossed as '???' -- confirm.
verb_prefix_glosses = [
    {
        't': '1SG.S/A.IND',
        'mət': '1PL.S/A.IND',
        'm': '1SG.S/A.INT',
        'n': '3.S/A.INT',
        'q': '2.S/A.INT',
        'mən': '1PL.S/A.INT',
        'mʔ': '1SG.S/A.COND',
        'mənʔ': '1PL.S/A.COND',
        'nʔ': 'N1.S/A.COND',
        'ne': '3A',
        'na': '3A',
        'ʔən': '3A.INT',
    },
    {'re': 'FUT', 'ra': 'FUT', 'r': 'FUT'},
    {'ine': '1SG.O', 'ena': '1SG.O', 'in': '1SG.O', 'en': '1SG.O'},
    {'r': 'CS', 'n': 'CS'},
]
# Gloss labels for verb suffixes, looked up by list position.
# NOTE(review): there are only two tables here but five slots in
# verb_suffixes, and the second table holds endings that verb_suffixes
# places in its last two slots -- the position-based lookup will therefore
# mostly yield '???' or '*'.  Confirm the intended layout.
verb_suffix_glosses = [
    {'et': 'TH', 'at': 'TH', 'ew': 'TH', 'aw': 'TH'},
    {
        'k': '1SG.S/A.IND//INF',
        'nin': '3SG.S>3SG.O',
        'nen': '3SG.S>3SG.O',
        'ɣʔe': 'TH',
        'ɣʔa': 'TH',
        'tək': '2PL',
        'tkə': '2PL.A>3.O',
        'ɣəm': '1SG.S/A',
        'ɣət': '2SG.S/A',
        'muri': '1PL.S/A',
        'more': '1PL.S/A',
        'turi': '2PL.S/A',
        'tore': '2PL.S/A',
    },
]
# Gloss labels for noun prefixes: one dict per prefix slot, in the same
# slot order as noun_prefixes.
# NOTE(review): 'mejŋ' is listed in noun_prefixes[1] but has no entry here,
# so it is glossed as '???' -- confirm.
noun_prefix_glosses = [
    {'ɣe': 'COM', 'ɣa': 'COM'},
    {'taŋ': 'INTS'},
]
# Gloss labels for noun suffixes, looked up by list position.
# NOTE(review): there is a single table here, at position 0, while
# noun_suffixes lists these case endings in its last slot; 'jtə' and 'ɣpə'
# also have no counterpart in noun_suffixes (which has 'etə' and 'epə').
# Confirm the intended layout and spellings.
noun_suffix_glosses = [
    {
        'ŋə': 'ABS',
        'te': 'ERG',
        'ta': 'ERG',
        'e': 'ERG',
        'a': 'ERG',
        'jtə': 'ALL',
        'ɣtə': 'ALL',
        'jpə': 'ABL',
        'ɣəpə': 'ABL',
        'ɣpə': 'ABL',
        'ɣjit': 'ORIENT',
        'ɣjet': 'ORIENT',
        'nu': 'EQU',
        'no': 'EQU',
    },
]
# The only_noun_prefixes / only_noun_suffixes sets hold the affixes that occur
# in nouns but never in verbs; part_of_speech uses them to recognise nouns.
class WordType(Enum):
    """Coarse word-class label assigned to a word form."""

    # Neither of the recognised classes.
    other = 0
    noun = 1
    verb = 2
    # NOTE(review): not produced by any code visible in this file.
    adjective = 3
def prefix_in_word(word, prefixes, must_start=False):
    """Report whether any of *prefixes* occurs at the start of *word*.

    Args:
        word: The word form to test.
        prefixes: Iterable of candidate prefix strings.
        must_start: When True, a prefix must sit exactly at the beginning of
            the word.  When False (default), a prefix may occur anywhere
            inside the leading window whose length equals that of the
            longest candidate.

    Returns:
        bool: True if a candidate was found, False otherwise (including when
        *prefixes* is empty).
    """
    # Materialise once: the collection is traversed more than once below.
    candidates = tuple(prefixes)
    if must_start:
        return any(word.startswith(candidate) for candidate in candidates)
    if not candidates:
        # max() would raise on an empty collection; nothing can match.
        return False
    window = min(len(word), max(len(candidate) for candidate in candidates))
    head = word[:window]
    return any(candidate in head for candidate in candidates)
def suffix_in_word(word, suffixes, must_end=False):
    """Report whether any of *suffixes* occurs at the end of *word*.

    Args:
        word: The word form to test.
        suffixes: Iterable of candidate suffix strings.
        must_end: When True, a suffix must sit exactly at the end of the
            word.  When False (default), a suffix may occur anywhere inside
            the trailing window whose length equals that of the longest
            candidate.

    Returns:
        bool: True if a candidate was found, False otherwise (including when
        *suffixes* is empty).
    """
    # Materialise once: the collection is traversed more than once below.
    candidates = tuple(suffixes)
    if must_end:
        return any(word.endswith(candidate) for candidate in candidates)
    if not candidates:
        # max() would raise on an empty collection; nothing can match.
        return False
    window = min(len(word), max(len(candidate) for candidate in candidates))
    # word[-0:] would be the whole word, so spell the empty window out.
    tail = word[-window:] if window else ""
    return any(candidate in tail for candidate in candidates)
def is_verb(word):
    """Decide whether *word* looks like a verb form.

    A word counts as a verb when it ends with one of the endings in
    ``verb_endings_require_prefix`` and, if that ending is flagged as
    requiring a prefix, also starts with a prefix from ``verb_prefixes[0]``.

    Every matching ending is considered, so the answer does not depend on
    the iteration order of the endings table.

    Args:
        word: The word form to test.

    Returns:
        bool: True if the word is classified as a verb.
    """
    # The prefix test is the same for every ending, so evaluate it once.
    has_slot0_prefix = word.startswith(tuple(verb_prefixes[0]))
    return any(
        word.endswith(ending) and (has_slot0_prefix or not needs_prefix)
        for ending, needs_prefix in verb_endings_require_prefix.items()
    )
def part_of_speech(word):
    """Classify *word* as a verb, a noun, or neither.

    The verb test takes precedence.  A non-verb is a noun when a noun-only
    prefix occurs near its start or a noun-only suffix near its end.

    Args:
        word: The word form to classify.

    Returns:
        WordType: ``verb``, ``noun`` or ``other``.
    """
    if is_verb(word):
        return WordType.verb
    if prefix_in_word(word, only_noun_prefixes):
        return WordType.noun
    if suffix_in_word(word, only_noun_suffixes):
        return WordType.noun
    return WordType.other
def segment_word(word, prefixes, suffixes, epenthetic='ə'):
    """Split *word* into prefixes, a root and suffixes.

    Prefix slots are processed left to right and suffix slots right to left.
    In each slot at most one affix is stripped; candidates are tried longest
    first (ties broken alphabetically), so the result is deterministic and a
    short affix never pre-empts a longer one that also matches.  If the
    remainder then begins (after a prefix) or ends (after a suffix) with the
    epenthetic string, that string is stripped as a separate morpheme.

    NOTE: an epenthetic morpheme is an extra list element, so positions in
    the returned affix lists equal slot numbers only up to the first
    epenthetic entry.

    Args:
        word: The word form to segment.
        prefixes: Sequence of prefix slots, each an iterable of strings.
        suffixes: Sequence of suffix slots, each an iterable of strings.
        epenthetic: The epenthetic string to split off next to an affix.

    Returns:
        tuple: ``(prefix_list, [root], suffix_list)``.  A slot without a
        match contributes ``''`` to its list; both lists read left to right.
    """
    def longest_first(slot):
        # Sets iterate in hash order, which varies between runs.
        return sorted(slot, key=lambda affix: (-len(affix), affix))

    ep_len = len(epenthetic)
    my_prefixes, my_suffixes = [], []

    for prefix_slot in prefixes:
        for prefix in longest_first(prefix_slot):
            if prefix and word.startswith(prefix):
                my_prefixes.append(prefix)
                word = word[len(prefix):]
                # check for an epenthetic vowel
                if epenthetic and word.startswith(epenthetic):
                    my_prefixes.append(epenthetic)
                    word = word[ep_len:]
                # no other prefixes can occur in this slot
                break
        else:
            my_prefixes.append('')

    # do the suffixes in reverse order
    for suffix_slot in reversed(suffixes):
        for suffix in longest_first(suffix_slot):
            if suffix and word.endswith(suffix):
                my_suffixes.append(suffix)
                word = word[:-len(suffix)]
                if epenthetic and word.endswith(epenthetic):
                    my_suffixes.append(epenthetic)
                    word = word[:-ep_len]
                # no other suffixes can occur in this slot
                break
        else:
            my_suffixes.append('')

    # flip the order of suffixes back to left-to-right
    my_suffixes.reverse()
    # the remaining word is probably the root
    return (my_prefixes, [word], my_suffixes)
def break_up_word(word):
    """Segment *word* with the affix tables that match its word class.

    Args:
        word: The word form to analyse.

    Returns:
        ``None`` when the word is neither a noun nor a verb; otherwise a
        tuple ``(morphemes, pos, dashed_word)`` where *morphemes* is the
        result of ``segment_word``, *pos* is the ``WordType`` and
        *dashed_word* is the non-empty morphemes joined with ``'-'``.
    """
    # determine whether this is a noun or a verb
    pos = part_of_speech(word)
    if pos is WordType.noun:
        affix_tables = (noun_prefixes, noun_suffixes)
    elif pos is WordType.verb:
        affix_tables = (verb_prefixes, verb_suffixes)
    else:
        # this is neither a noun nor a verb
        return None

    morphemes = segment_word(word, *affix_tables)
    # join the morphemes with a dash, skipping the '' placeholders that
    # stand for unfilled slots
    dashed_word = '-'.join(filter(None, chain.from_iterable(morphemes)))
    return morphemes, pos, dashed_word
def find_glosses_affixes(affixes, glosses):
    """Look up a gloss for every non-empty entry of *affixes*.

    The entry at list position ``n`` is looked up in ``glosses[n]``.

    NOTE(review): the lookup is purely positional, so any extra element in
    *affixes* (such as an epenthetic vowel inserted by ``segment_word``)
    shifts the entries after it onto the wrong table -- confirm whether that
    is acceptable.

    Args:
        affixes: List of affix strings; ``''`` marks an unfilled slot and is
            skipped.
        glosses: Sequence of dicts mapping affix to gloss, indexed by
            position.

    Returns:
        list: One gloss per non-empty affix: the table entry, ``'???'`` when
        the affix is missing from its table, or ``'*'`` when there is no
        table at that position.
    """
    my_glosses = []
    for position, affix in enumerate(affixes):
        if not affix:
            continue
        try:
            gloss = glosses[position][affix]
        except KeyError:
            # this affix is not in the glosses
            gloss = '???'
        except IndexError:
            # there is no gloss table for this position
            gloss = '*'
        my_glosses.append(gloss)
    return my_glosses
def gloss_word(word_morphemes, pos):
    """Build the gloss line for a segmented noun or verb.

    Args:
        word_morphemes: ``(prefixes, root, suffixes)`` as returned by
            ``segment_word``.
        pos: ``WordType.noun`` or ``WordType.verb``; selects the gloss
            tables.

    Returns:
        list: Prefix glosses, then the root element as received, then suffix
        glosses.

    Raises:
        ValueError: If *pos* is neither noun nor verb (no gloss tables exist
            for other word classes).
    """
    prefixes, root, suffixes = word_morphemes
    if pos is WordType.noun:
        prefix_table, suffix_table = noun_prefix_glosses, noun_suffix_glosses
    elif pos is WordType.verb:
        prefix_table, suffix_table = verb_prefix_glosses, verb_suffix_glosses
    else:
        raise ValueError('no gloss tables for word type {!r}'.format(pos))

    prefix_glosses = find_glosses_affixes(prefixes, prefix_table)
    # The root is passed through unglossed; replace with something else
    # (like '???') if you want to.
    # NOTE(review): segment_word hands the root over as a one-element list,
    # so it ends up nested inside the result -- confirm that is intended.
    root_gloss = root
    suffix_glosses = find_glosses_affixes(suffixes, suffix_table)
    return prefix_glosses + [root_gloss] + suffix_glosses
def dash_insertion_reduplicate(word, length):
    """Insert a dash in front of the last *length* characters of *word*.

    Args:
        word: The word form to split.
        length: Number of trailing characters that go after the dash.

    Returns:
        str: The word with one ``'-'`` inserted.  When *length* is at least
        ``len(word)`` the dash ends up in front of the whole word.
    """
    if length == 0:
        # word[-0:] is the whole word, which would put the dash at the
        # front; zero trailing characters means the dash goes at the end.
        return word + "-"
    return word[:-length] + "-" + word[-length:]
def format_epenthetic(word, epenthetic='ə'):
    """Set off epenthetic vowels that touch a dash as their own segment.

    The word is only rewritten when an epenthetic string sits directly next
    to a dash and no already-isolated ``-<epenthetic>-`` segment is present.
    In that case a dash-separated segment loses a leading epenthetic string
    unless it is the first segment, and a trailing one unless it is the last
    segment, so the word-initial and word-final positions are left alone.
    A segment is never reduced to the empty string.

    Args:
        word: A dash-segmented word, e.g. ``'t-əpa-k'``.
        epenthetic: The epenthetic string to isolate.

    Returns:
        ``None`` if *word* is empty; otherwise the (possibly rewritten)
        word.
    """
    if not word:
        # this is not a word
        return None
    if not epenthetic:
        # nothing to isolate
        return word

    touches_dash = ('-' + epenthetic in word) or (epenthetic + '-' in word)
    already_isolated = '-{}-'.format(epenthetic) in word
    if not touches_dash or already_isolated:
        return word

    segments = word.split('-')
    last = len(segments) - 1
    width = len(epenthetic)
    pieces = []
    for index, segment in enumerate(segments):
        leading = trailing = False
        # Requiring len(segment) > width keeps a bare epenthetic segment
        # from being turned into an empty piece (which would print as '--').
        if (index > 0 and len(segment) > width
                and segment.startswith(epenthetic)):
            leading = True
            segment = segment[width:]
        if (index < last and len(segment) > width
                and segment.endswith(epenthetic)):
            trailing = True
            segment = segment[:-width]
        if leading:
            pieces.append(epenthetic)
        pieces.append(segment)
        if trailing:
            pieces.append(epenthetic)
    return '-'.join(pieces)
if __name__ == '__main__':
    # Read the whole corpus; the input file is UTF-16 encoded.
    with open(filepath, encoding='utf-16') as text_file:
        original_text = text_file.read()

    # Raw string: '\w' in a plain literal is an invalid escape sequence.
    word_expression = r'\w+'
    words = re.findall(word_expression, original_text)

    # break_up_word returns None for words that are neither noun nor verb;
    # those stay in morphed_words but are skipped when glossing.
    morphed_words = [break_up_word(word) for word in words if word]
    to_gloss = ((word[0], word[1]) for word in morphed_words if word)
    glossed_words = [gloss_word(morphemes, pos) for morphemes, pos in to_gloss]

    # Explicit encoding: the output contains IPA characters that a platform
    # default code page may not be able to represent.
    with open('output.txt', 'w', encoding='utf-8') as output_file:
        pprint.pprint(morphed_words, stream=output_file)
        # print('*****GLOSSES******', file=output_file)
        # pprint.pprint(glossed_words, stream=output_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment