Skip to content

Instantly share code, notes, and snippets.

@anna-hope
Forked from rma2015/chkparser2
Created August 18, 2015 18:45
Show Gist options
  • Save anna-hope/41f1b94cac6d00b17f99 to your computer and use it in GitHub Desktop.
Save anna-hope/41f1b94cac6d00b17f99 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
from enum import Enum
from itertools import chain
import re
import pprint
# Path of the input text file; the script entry point at the bottom of this
# file opens it and tokenizes its contents into words.
filepath = "chukchi_texts.txt"
#pprint.pprint(words)
def suffix(word, affix_recessive, affix_dominant="\0"):
    """Return True if *word* ends with either variant of an affix.

    ``affix_dominant`` defaults to a NUL character, so a call that supplies
    only ``affix_recessive`` tests just that variant unless *word* itself
    ends with NUL.
    """
    for variant in (affix_recessive, affix_dominant):
        if word.endswith(variant):
            return True
    return False
def prefix(word, affix_recessive, affix_dominant="\0"):
    """Return True if *word* starts with either variant of an affix.

    ``affix_dominant`` defaults to a NUL character, so a call that supplies
    only ``affix_recessive`` tests just that variant unless *word* itself
    starts with NUL.
    """
    variants = (affix_recessive, affix_dominant)
    return any(word.startswith(variant) for variant in variants)
def in_affix(word, affix_recessive, affix_dominant="\0"):
    """Return True if either variant of an affix occurs anywhere in *word*.

    ``affix_dominant`` defaults to a NUL character, so a call that supplies
    only ``affix_recessive`` tests just that variant unless *word* itself
    contains NUL.
    """
    if affix_recessive in word:
        return True
    return affix_dominant in word
# Affix inventory used for segmentation, keyed first by part of speech
# ('nouns', 'verbs') and then by affix type ('prefixes', 'suffixes').
# Each affix type holds a list of slots in left-to-right order, and each slot
# is a list of alternative affixes.  segment_word() strips at most one affix
# per slot, and sort_affixes() reorders every slot longest-first before the
# table is used for segmentation.
AFFIXES = {'nouns': {'prefixes': [['ɣe', 'ɣa'], ['mejŋ', 'taŋ']],
'suffixes': [['ɣərɣ', 'ɬʔ'],
['cɣ'],
['jekwe', 'cəku', 'cəko', 'corm', 'curm'],
['jikwi',
'jekwe',
'ɣəpə',
'ɣjit',
'cəko',
'cəku',
'ɣjet',
'ɣtə',
'jpə',
'etə',
'epə',
'nu',
'te',
'no',
'ŋə',
'ta',
't',
'a',
'e',
'n',
'k']]},
'verbs': {'prefixes': [['mənʔ',
'mət',
'mən',
'ʔən',
'nʔ',
'mʔ',
'ne',
'na',
'ɣe',
'ɣa',
't',
'm',
'n',
'q'],
['re', 'ra', 'r'],
['ine', 'ena', 'in', 'en'],
['r', 'n']],
'suffixes': [['et', 'at', 'ew', 'aw'],
['tku', 'tko'],
['pɬətku', 'ŋŋo'],
['qenat',
'ninet',
'ɬinet',
'ɬenat',
'nenat',
'qinet',
'rkəni',
'rkəne',
'jɣəm',
'nin',
'nen',
'ɬin',
'ɬen',
'qin',
'qen',
'ɣʔe',
'ɣʔa',
'ɣʔi',
'ɣe',
'ɣi',
'rk'],
['muri',
'more',
'turi',
'tore',
'rkən',
'ɣəm',
'ɣət',
'n',
't',
'k']]}}
# Verb endings recognised by is_verb(), mapped to a flag:
#   True  -- the word must also start with one of the first-slot verb prefixes
#            to be classified as a verb;
#   False -- the ending alone is sufficient.
# is_verb() walks this dict in iteration order and decides on the first ending
# the word ends with, so the order of the entries matters: longer endings such
# as 'ɬinet' are listed before the shorter 't' they also end in.
verb_endings_require_prefix = {'k': True,
'nin': False,
'nen': False,
'ɬinet': False,
'ɬenat': False,
'ɬin': False,
'ɬen': False,
'qin': True,
'qen': True,
'qinet': True,
'qenat': True,
'jɣəm': True,
'ɣe': False,
'ɣi': False,
'ninet': False,
'nenat': False,
't': True,
'ɣəm': True,
'ɣət': True,
'muri': True,
'more': True,
'turi': True,
'tore': True,
'ɣʔe': False,
'ɣʔa': False,
'rkən': False,
'rkəne': False,
'rkəni': False}
# Flatten the per-slot affix lists of AFFIXES into one set per part of speech
# and affix type.
all_verb_prefixes = {affix
                     for slot in AFFIXES['verbs']['prefixes']
                     for affix in slot}
all_verb_suffixes = {affix
                     for slot in AFFIXES['verbs']['suffixes']
                     for affix in slot}
all_noun_prefixes = {affix
                     for slot in AFFIXES['nouns']['prefixes']
                     for affix in slot}
all_noun_suffixes = {affix
                     for slot in AFFIXES['nouns']['suffixes']
                     for affix in slot}
# Affixes listed for nouns that are not also listed for verbs.
only_noun_prefixes = all_noun_prefixes - all_verb_prefixes
only_noun_suffixes = all_noun_suffixes - all_verb_suffixes
# Every affix listed for either part of speech.
all_prefixes = all_verb_prefixes | all_noun_prefixes
all_suffixes = all_verb_suffixes | all_noun_suffixes
# Gloss tables.  Each table is a list of dicts mapping an affix to its gloss
# label.  find_glosses_affixes() looks an affix up in the dict whose list index
# equals the affix's position in the segmented prefix/suffix list; an affix
# missing from that dict is glossed '???' and a position past the end of the
# table is glossed '*'.
# NOTE(review): verb_suffix_glosses has two dicts while
# AFFIXES['verbs']['suffixes'] has five slots, noun_suffix_glosses has one dict
# for four noun suffix slots, and some keys below (e.g. 'tək', 'jtə', 'ɣpə') do
# not appear in AFFIXES -- confirm the intended slot alignment.

# Glosses for verb prefixes (four dicts).
verb_prefix_glosses = [{'t': '1SG.S/A.IND',
'mət': '1PL.S/A.IND',
'm': '1SG.S/A.INT',
'n': '3.S/A.INT',
'q': '2.S/A.INT',
'mən': '1PL.S/A.INT',
'mʔ': '1SG.S/A.COND',
'mənʔ': '1PL.S/A.COND',
'nʔ': 'N1.S/A.COND',
'ne': '3A',
'na': '3A',
'ʔən': '3A.INT'},
{'re': 'FUT', 'ra': 'FUT', 'r': 'FUT'},
{'ine': '1SG.O',
'ena': '1SG.O',
'in': '1SG.O',
'en': '1SG.O'},
{'r': 'CS', 'n': 'CS'}]
# Glosses for verb suffixes (two dicts).
verb_suffix_glosses = [{'et': 'TH', 'at': 'TH', 'ew': 'TH', 'aw': 'TH'},
{'k': '1SG.S/A.IND//INF', 'nin': '3SG.S>3SG.O', 'nen':'3SG.S>3SG.O',
'ɣʔe': 'TH', 'ɣʔa': 'TH', 'tək': '2PL', 'tkə': '2PL.A>3.O',
'ɣəm': '1SG.S/A','ɣət': '2SG.S/A','muri': '1PL.S/A','more': '1PL.S/A',
'turi': '2PL.S/A','tore': '2PL.S/A'}]
# Glosses for noun prefixes (two dicts).
noun_prefix_glosses = [{'ɣe':'COM', 'ɣa':'COM'},
{'taŋ':'INTS'}]
# Glosses for noun suffixes (one dict).
noun_suffix_glosses = [{'ŋə':'ABS', 'te': 'ERG', 'ta':'ERG', 'e':'ERG', 'a':'ERG',
'jtə':'ALL', 'ɣtə':'ALL', 'jpə':'ABL', 'ɣəpə':'ABL',
'ɣpə':'ABL', 'ɣjit':'ORIENT', 'ɣjet':'ORIENT', 'nu':'EQU', 'no':'EQU'}]
# The only_noun_* sets computed above hold the affixes that occur in the noun
# tables but not in the verb tables; part_of_speech() uses them to recognise
# nouns once is_verb() has ruled out a verb.
class WordType(Enum):
    """Part-of-speech labels that part_of_speech() assigns to a word."""
    # returned when a word is classified as neither a verb nor a noun
    other = 0
    noun = 1
    verb = 2
    # declared here, but part_of_speech() in this file never returns it
    adjective = 3
def prefix_in_word(word, prefixes, must_start=False):
    """Report whether any of *prefixes* occurs at or near the start of *word*.

    Args:
        word: The string to inspect.
        prefixes: A collection (list, set, ...) of candidate prefix strings.
        must_start: When true, a candidate counts only if *word* starts with
            it.  When false, a candidate counts if it occurs anywhere inside
            the leading window of *word* whose width is the length of the
            longest candidate.

    Returns:
        True if some candidate matches, False otherwise -- including when
        *prefixes* is empty.
    """
    if not prefixes:
        # Nothing can match; also avoids max() raising ValueError below.
        return False
    if must_start:
        return any(word.startswith(candidate) for candidate in prefixes)
    longest = max(len(candidate) for candidate in prefixes)
    window = word[:min(len(word), longest)]
    return any(candidate in window for candidate in prefixes)
def suffix_in_word(word, suffixes, must_end=False):
    """Report whether any of *suffixes* occurs at or near the end of *word*.

    Args:
        word: The string to inspect.
        suffixes: A collection (list, set, ...) of candidate suffix strings.
        must_end: When true, a candidate counts only if *word* ends with it.
            When false, a candidate counts if it occurs anywhere inside the
            trailing window of *word* whose width is the length of the
            longest candidate.

    Returns:
        True if some candidate matches, False otherwise -- including when
        *suffixes* is empty.
    """
    if not suffixes:
        # Nothing can match; also avoids max() raising ValueError below.
        return False
    if must_end:
        return any(word.endswith(candidate) for candidate in suffixes)
    longest = max(len(candidate) for candidate in suffixes)
    width = min(len(word), longest)
    # word[-0:] would be the whole word, so spell out the zero-width window.
    window = word[-width:] if width else ''
    return any(candidate in window for candidate in suffixes)
def is_verb(word):
    """Decide whether *word* looks like a verb from its ending and prefix.

    The endings in ``verb_endings_require_prefix`` are tried in the dict's
    iteration order, and the first ending that *word* ends with settles the
    answer: if that ending does not require a prefix the word is a verb;
    otherwise it is a verb only when it also starts with one of the prefixes
    in the first verb prefix slot of ``AFFIXES``.

    Args:
        word: The word to classify.

    Returns:
        True if the word is classified as a verb, False otherwise (including
        when it ends with none of the known verb endings).
    """
    for ending, requires_prefix in verb_endings_require_prefix.items():
        if not word.endswith(ending):
            continue
        if not requires_prefix:
            # a suffix is sufficient to identify this as a verb
            return True
        # Only membership is tested, so the unsorted slot from AFFIXES gives
        # the same answer as the length-sorted copy and keeps this function
        # usable when the module is imported rather than run as a script.
        first_slot = AFFIXES['verbs']['prefixes'][0]
        return any(word.startswith(candidate) for candidate in first_slot)
    # this word does not end with any verb suffixes and so is not a verb
    return False
def part_of_speech(word):
    """Classify *word* as a verb, a noun, or neither.

    The verb test (is_verb) takes precedence.  Failing that, the word is a
    noun when a noun-only prefix is found by prefix_in_word() or a noun-only
    suffix is found by suffix_in_word(), both in their default windowed mode.

    Returns:
        WordType.verb, WordType.noun or WordType.other.
    """
    if is_verb(word):
        return WordType.verb
    looks_like_noun = (prefix_in_word(word, only_noun_prefixes)
                       or suffix_in_word(word, only_noun_suffixes))
    return WordType.noun if looks_like_noun else WordType.other
def segment_word(word, affixes, epenthetic='ə'):
    """Peel slot-ordered prefixes and suffixes off *word*.

    Prefix slots are processed left to right and suffix slots right to left.
    Within a slot the candidates are tried in the order given and the first
    match is stripped, so callers should list longer candidates first.

    Args:
        word: The word to segment.
        affixes: Mapping with 'prefixes' and 'suffixes' keys, each a list of
            slots; each slot is a list of candidate affix strings.
        epenthetic: Epenthetic vowel.  When it directly follows a stripped
            prefix or directly precedes a stripped suffix it is split off as
            a separate entry next to that affix.  It may be longer than one
            character; an empty string disables this handling.

    Returns:
        A tuple ``(prefixes, [root], suffixes)``.  Both affix lists read left
        to right; a slot without a match contributes ``''``.
    """
    found_prefixes, found_suffixes = [], []
    vowel_width = len(epenthetic)

    for slot in affixes['prefixes']:
        for candidate in slot:
            if word.startswith(candidate):
                found_prefixes.append(candidate)
                word = word[len(candidate):]
                # check for an epenthetic vowel
                if epenthetic and word.startswith(epenthetic):
                    found_prefixes.append(epenthetic)
                    # drop the whole vowel, however many characters it spans
                    word = word[vowel_width:]
                # no other prefixes can occur in this slot
                break
        else:
            found_prefixes.append('')

    # do the suffixes in reverse order
    for slot in reversed(affixes['suffixes']):
        for candidate in slot:
            # an empty candidate must never strip: word[:-0] is ''
            if candidate and word.endswith(candidate):
                found_suffixes.append(candidate)
                word = word[:-len(candidate)]
                if epenthetic and word.endswith(epenthetic):
                    found_suffixes.append(epenthetic)
                    word = word[:-vowel_width]
                # no other suffixes can occur in this slot
                break
        else:
            found_suffixes.append('')

    # flip the order of suffixes back to left-to-right
    found_suffixes.reverse()
    # the remaining word is probably the root
    return (found_prefixes, [word], found_suffixes)
def break_up_word(word):
    """Classify *word* and split it into dash-separated morphemes.

    Reads the module-level ``sorted_affixes`` table, which the script entry
    point assigns before calling this function.

    Returns:
        None when the word is neither a noun nor a verb; otherwise a tuple
        ``(morphemes, pos, dashed_word)`` where ``morphemes`` is the result of
        segment_word(), ``pos`` is the WordType, and ``dashed_word`` joins all
        non-empty morphemes with '-'.
    """
    # determine whether this is a noun or a verb
    pos = part_of_speech(word)
    table_key = {WordType.noun: 'nouns', WordType.verb: 'verbs'}.get(pos)
    if table_key is None:
        # this is neither a noun nor a verb
        return None
    morphemes = segment_word(word, sorted_affixes[table_key])
    # join the morphemes with a dash, skipping the empty slot placeholders
    dashed_word = '-'.join(filter(None, chain.from_iterable(morphemes)))
    return morphemes, pos, dashed_word
def find_glosses_affixes(affixes, glosses):
    """Look up a gloss for every non-empty entry of *affixes*.

    Args:
        affixes: List of affix strings; empty strings are skipped.
        glosses: List of dicts; the entry at position ``n`` of *affixes* is
            looked up in ``glosses[n]``.

    Returns:
        A list with one gloss per non-empty affix: the table value, ``'???'``
        when the affix is missing from its dict, or ``'*'`` when the position
        lies beyond the end of *glosses*.
    """
    def lookup(position, affix):
        try:
            return glosses[position][affix]
        except KeyError:
            # this affix is not in the glosses
            return '???'
        except IndexError:
            return '*'

    return [lookup(position, affix)
            for position, affix in enumerate(affixes)
            if affix]
def gloss_word(word_morphemes, pos):
    """Build the gloss sequence for a segmented word.

    Args:
        word_morphemes: Tuple ``(prefixes, root, suffixes)`` as returned by
            segment_word().
        pos: WordType.noun or WordType.verb; selects which gloss tables are
            consulted.

    Returns:
        A list made of the prefix glosses, then the root element exactly as
        received (segment_word() supplies it as a one-element list, so it
        appears nested), then the suffix glosses.

    Raises:
        ValueError: If *pos* is neither WordType.noun nor WordType.verb, since
            no gloss tables exist for other parts of speech.
    """
    prefixes, root, suffixes = word_morphemes
    if pos is WordType.noun:
        prefix_table, suffix_table = noun_prefix_glosses, noun_suffix_glosses
    elif pos is WordType.verb:
        prefix_table, suffix_table = verb_prefix_glosses, verb_suffix_glosses
    else:
        # fail with a clear message instead of an unbound-variable error
        raise ValueError(
            'no gloss tables for part of speech: {!r}'.format(pos))
    prefix_labels = find_glosses_affixes(prefixes, prefix_table)
    root_gloss = root  # replace with something else (like ???) if you want to
    suffix_labels = find_glosses_affixes(suffixes, suffix_table)
    return prefix_labels + [root_gloss] + suffix_labels
def dash_insertion_reduplicate(word, length):
    """Insert a dash in front of the last *length* characters of *word*.

    For a positive *length* no larger than the word this yields
    ``head-tail``; when *length* exceeds the word the dash ends up in front
    of the whole word.
    """
    head, tail = word[:-length], word[-length:]
    return "-".join((head, tail))
def format_epenthetic(word, epenthetic='ə'):
    """Set off epenthetic vowels that touch a morpheme boundary with dashes.

    *word* is a dash-separated morpheme string.  If an epenthetic vowel sits
    directly next to a dash, and no vowel is already isolated as ``-V-``,
    every segment is inspected: a vowel at the start of a non-initial segment
    or at the end of a non-final segment is split off into its own segment.
    A vowel at the very start or very end of the word is left attached, a
    segment consisting only of the vowel is kept as is, and no empty segments
    are introduced.

    Args:
        word: Dash-separated word, or an empty value.
        epenthetic: The epenthetic vowel; may be longer than one character.
            An empty string leaves *word* unchanged.

    Returns:
        None if *word* is empty, otherwise the (possibly re-dashed) word.
    """
    if not word:
        # this is not a word
        return None
    if not epenthetic:
        return word
    touches_boundary = ('-' + epenthetic in word) or (epenthetic + '-' in word)
    already_isolated = '-{}-'.format(epenthetic) in word
    if not touches_boundary or already_isolated:
        return word

    segments = word.split('-')
    last = len(segments) - 1
    width = len(epenthetic)
    pieces = []
    for index, segment in enumerate(segments):
        core = segment
        leading = trailing = False
        if segment != epenthetic:
            # the vowel is split off only when it is not word-initial ...
            if index > 0 and core.startswith(epenthetic):
                leading, core = True, core[width:]
            # ... and not word-final
            if index < last and core.endswith(epenthetic):
                trailing, core = True, core[:-width]
        if leading:
            pieces.append(epenthetic)
        # keep untouched segments verbatim; skip a core emptied by splitting
        if core or not (leading or trailing):
            pieces.append(core)
        if trailing:
            pieces.append(epenthetic)
    return '-'.join(pieces)
def sort_affixes(affixes):
    """Return a copy of an affix table with every slot sorted longest-first.

    Args:
        affixes: Mapping of part of speech to a mapping of affix type to a
            list of slots, each slot being a list of affix strings.

    Returns:
        A new mapping of the same shape in which each slot is a new list
        ordered by descending length; affixes of equal length keep their
        original relative order.  The input is not modified.
    """
    def longest_first(slot):
        return sorted(slot, key=len, reverse=True)

    return {
        pos_name: {
            kind: [longest_first(slot) for slot in slots]
            for kind, slots in by_kind.items()
        }
        for pos_name, by_kind in affixes.items()
    }
if __name__ == '__main__':
    # Script entry point: tokenize the input text, segment every word, and
    # pretty-print the segmentations to output.txt.
    # The affix tables are non-ASCII, so read and write explicitly as UTF-8
    # instead of relying on the platform's default encoding.
    with open(filepath, encoding='utf-8') as text_file:
        original_text = text_file.read()
    # raw string so that \w reaches the regex engine as written
    word_expression = r'\w+'
    words = re.findall(word_expression, original_text)
    # assigned at module level on purpose: break_up_word() reads this global
    sorted_affixes = sort_affixes(AFFIXES)
    # break_up_word() returns None for words that are neither nouns nor verbs
    morphed_words = [break_up_word(word) for word in words if word]
    to_gloss = ((word[0], word[1]) for word in morphed_words if word)
    glossed_words = [gloss_word(morphemes, pos) for morphemes, pos in to_gloss]
    with open('output.txt', 'w', encoding='utf-8') as output_file:
        pprint.pprint(morphed_words, stream=output_file)
        # print('*****GLOSSES******', file=output_file)
        # pprint.pprint(glossed_words, stream=output_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment