Created
October 4, 2017 03:00
-
-
Save rgov/ade0094bc4f0e5c54813eabd8b925bfe to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
''' | |
This module attempts to implement syllabification for Greek words. | |
The consonant clusters are derived from "Greek: A Comprehensive Grammar of the | |
Modern Language" by D. Holton, et al. | |
''' | |
import phonemes | |
import re | |
__all__ = [ 'syllabificate', 'get_stressed_syllable', 'get_vowel', | |
'get_prevowel_sound' ] | |
# Stress marks that may precede a vowel in the phonemic transcription.
vowelmodifiers = list(u'ˌˈ')
# Single-character vowel phonemes.
vowels = list(u'ieaouɪə')
# Palatal consonants; they are also part of `consonants` below.
palatals = list(u'jçɲʎ')
# Every single-character consonant phoneme.
consonants = list(u'ptkfθxvδɣszlrmn') + palatals

# Consonant groups accepted at the start of a word: every single consonant
# plus the clusters listed here.
# NOTE: each entry needs its own trailing comma. Python silently concatenates
# adjacent string literals, which would fuse two clusters into one bogus entry.
wordinitial = list(consonants) + [
    u'ps', u'ts', u'ks', u'pr', u'tr', u'kr',
    u'pl', u'pn', u'kl', u'kn',
    u'tm',
    u'mn',
    u'fr', u'fl', u'θl', u'θr', u'xl', u'xr', u'vl', u'δr', u'ɣl', u'ɣr',
    u'sp', u'st', u'sk', u'sf', u'sθ', u'sx',
    u'sn', u'zn', u'sl', u'zl',
    u'zb',
    u'ɣn', u'θn', u'xn', u'zm',
    u'ft', u'fk', u'xt',
    u'vδ', u'vɣ', u'ɣδ', u'zv', u'zɣ',
    u'fθ', u'xθ', u'kt', u'pt',
    u'bl', u'br', u'dr', u'gr', u'gl',
    # NOTE(review): the second entry below appears to start with a Greek nu
    # rather than a Latin letter like the other tables use -- confirm which
    # phoneme was intended.
    u'δj', u'νj', u'pç',
    u'spl', u'spr', u'skl', u'str', u'skr',
    u'skn',
    u'sfr',
]

# Consonant groups accepted inside a word: everything allowed word-initially
# plus the additional clusters listed here.
wordmedial = list(wordinitial) + [
    u'tn', u'fn', u'vn', u'θm', u'vm',
    u'rt', u'rδ', u'rf', u'rk', u'rx', u'rn', u'rm', u'rɣ', u'lm', u'lp', u'lk',
    u'lt', u'lɣ',
    u'nθ', u'nx', u'mf', u'ns', u'nδ', u'mv', u'nɣ', u'nz',
    u'sxr', u'sxn', u'sθm', u'ptr',
    u'ktr', u'kst', u'kδr', u'ngl', u'mbr', u'ftr', u'fst', u'vɣl', u'rtr',
    u'kstr', u'fstr',
]
def greedy_split(phonemes):
    '''
    Joins `phonemes` into one string and splits it with a single regular
    expression into consonant groups and (optionally stress-marked) vowels.

    The alternation tries, in order: word-initial clusters anchored at the
    start of the string, then word-medial clusters, then vowels; within each
    cluster table longer clusters come first so the longest one wins.

    Returns the list of non-empty pieces in their original order. Because the
    pattern is a capturing group, the matched pieces are kept, and any text
    the pattern does not match is returned as well.
    '''
    def longest_first(clusters):
        return sorted(clusters, key=len, reverse=True)

    stress_prefix = '[' + ''.join(vowelmodifiers) + ']*'

    alternatives = ['^' + cluster for cluster in longest_first(wordinitial)]
    alternatives += longest_first(wordmedial)
    alternatives += [stress_prefix + vowel for vowel in vowels]

    splitter = '(' + '|'.join(alternatives) + ')'
    pieces = re.split(splitter, ''.join(phonemes), flags=re.UNICODE)
    return [piece for piece in pieces if piece != '']
def is_vowel(syllable):
    '''
    Returns True when `syllable`, after phonemes.strip_modifiers() has been
    applied to it, is exactly one of the entries of `vowels`.
    '''
    bare = phonemes.strip_modifiers(syllable)
    return bare in vowels
def is_consonant_cluster(syllable):
    '''
    Returns True when no individual character of `syllable` is a vowel
    according to is_vowel(). An empty string therefore counts as a cluster.
    '''
    return not any(is_vowel(sound) for sound in syllable)
def form_syllables(l):
    '''
    Assembles syllables from a list of groups, where each group is either a
    consonant cluster or a vowel (the output of greedy_split()).

    Rules:
      * a consonant cluster directly before a vowel opens that vowel's
        syllable;
      * a consonant cluster at the end of the word, or one followed by another
        consonant cluster, closes the previous syllable;
      * a vowel always starts a new syllable.

    If there is no previous syllable to close yet (the word begins with
    several consonant groups), the consonants are kept as part of the onset
    of the first syllable instead of raising IndexError. If the input has no
    vowel at all, the consonants are returned as a single syllable.

    Groups that are neither a consonant cluster nor a vowel are ignored.

    Returns a tuple of syllable strings.
    '''
    syllables, temp = [], ''
    last = len(l) - 1
    for i, group in enumerate(l):
        if is_consonant_cluster(group):
            # The cluster belongs to the previous syllable when nothing
            # vocalic follows it immediately.
            closes_previous = i == last or is_consonant_cluster(l[i + 1])
            if closes_previous and syllables:
                syllables[-1] += group
            else:
                # Either a vowel follows, or there is no syllable to attach
                # to yet: hold the consonants for the next vowel.
                temp += group
            continue
        if is_vowel(group):
            syllables.append(temp + group)
            temp = ''
    if temp and not syllables:
        # Vowel-less input: keep the consonants rather than dropping them.
        syllables.append(temp)
    return tuple(syllables)
def syllabificate(phonemes):
    '''
    Splits `phonemes` into consonant/vowel groups with greedy_split() and
    assembles those groups into syllables with form_syllables().

    Returns whatever form_syllables() returns for those groups.
    '''
    groups = greedy_split(phonemes)
    return form_syllables(groups)
def get_stressed_syllable(syllables):
    '''
    Returns a negative index for the stressed syllable (i.e., last is -1).

    The syllables are scanned from the end of the word backwards, and the
    first one containing the primary stress mark (ˈ) wins.

    Raises Exception if no syllable carries the primary stress mark.
    '''
    # enumerate/reversed instead of xrange so the function runs unchanged on
    # both Python 2 and Python 3.
    for distance_from_end, syllable in enumerate(reversed(syllables), 1):
        if u'ˈ' in syllable:
            return -distance_from_end
    raise Exception('stressed syllable not found')
def get_vowel(syllable):
    '''
    Returns the first character of `syllable` that is_vowel() accepts.

    Raises Exception if the syllable contains no such character.
    '''
    first_vowel = next((sound for sound in syllable if is_vowel(sound)), None)
    if first_vowel is None:
        raise Exception('vowel not found')
    return first_vowel
def get_prevowel_sound(syllable):
    '''
    Returns everything that precedes the first vowel of `syllable`, after
    phonemes.strip_modifiers() has been applied to it.

    Returns an empty string when the syllable starts with a vowel or contains
    no vowel at all.
    '''
    syllable = phonemes.strip_modifiers(syllable)
    for i, c in enumerate(syllable):
        if is_vowel(c):
            # Slice up to (not including) the vowel. Slicing to i - 1 would
            # drop the consonant right before the vowel, and for i == 0 would
            # return a slice that contains the vowel itself.
            return syllable[:i]
    return ''
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment