Last active
September 25, 2024 17:16
-
-
Save planetis-m/0b5df9b61de0ff972f1306cf2ab0de4c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def lower_first_if_title(word):
    """Lower-case the first character of a title-cased word.

    Words that are entirely upper case (``str.isupper``) and words that are
    not title-cased (``str.istitle``) are returned unchanged.
    """
    # The upper-case test comes first: a one-letter capital is both "upper"
    # and "title" and must be left alone.
    if word.isupper() or not word.istitle():
        return word
    head, tail = word[0], word[1:]
    return head.lower() + tail
def count_greek_syllables(word):
    """Estimate the number of syllables in a Modern Greek word.

    The word is scanned left to right, and each vowel group that is read as
    one unit adds one syllable:

    * an unstressed 'ι' or 'υ' merged with a following vowel digraph or a
      following α/ο/ε (stressed or not), e.g. 'ια' in 'για', 'ιού' in 'παλιού';
    * 'οι' / 'ει' merged the same way, e.g. 'οια' in 'ποια';
    * a two-letter combination from the ``diphthongs`` table;
    * any other single vowel.

    The input is NFC-normalised and lower-cased before scanning, because the
    tables below contain only precomposed lowercase letters; without that
    step capital vowels would not be counted at all.

    This is a spelling-based heuristic. It cannot tell apart words where the
    same letters are pronounced as one or as two syllables, so some words are
    miscounted (the word list that accompanies this function marks several).

    Args:
        word: The word to analyse, in any letter case and normalisation form.

    Returns:
        The estimated syllable count; 0 for a string without Greek vowels.
    """
    import unicodedata

    word = unicodedata.normalize('NFC', word).lower()

    vowels = 'αάεέηήιίοόυύωώϊϋΐΰ'
    # Two-letter vowel groups counted as a single syllable.
    diphthongs = {
        'αι', 'ει', 'οι', 'υι', 'αυ', 'ευ', 'ου',
        'αί', 'εί', 'οί', 'υί', 'αύ', 'εύ', 'ού',
        'αη', 'αϊ', 'οη', 'όη', 'οϊ', 'άι', 'όι', 'εϊ',
    }
    # Unstressed letters that may merge with the vowel(s) after them.
    glides = 'ιυ'
    long_glides = {'οι', 'ει'}
    # What a glide may merge with: a vowel digraph, or one of these vowels.
    vowel_digraphs = {
        'αι', 'ει', 'οι', 'ου', 'υι',
        'αί', 'εί', 'οί', 'ού', 'υί',
    }
    glide_followers = 'αοεάόέ'

    length = len(word)
    syllables = 0
    pos = 0
    # Slices that run past the end of the word are shorter than two letters
    # and therefore never match a table entry, so they need no bounds check.
    while pos < length:
        if pos < length - 1 and word[pos] in glides:
            # 'ι' / 'υ' that is not the last letter of the word.
            if word[pos + 1:pos + 3] in vowel_digraphs:
                pos += 3
            elif word[pos + 1] in glide_followers:
                pos += 2
            else:
                pos += 1
            syllables += 1
        elif pos < length - 2 and word[pos:pos + 2] in long_glides:
            # 'οι' / 'ει' with at least one more letter after it.
            if word[pos + 2:pos + 4] in vowel_digraphs:
                pos += 4
            elif word[pos + 2] in glide_followers:
                pos += 3
            else:
                pos += 2
            syllables += 1
        elif word[pos:pos + 2] in diphthongs:
            syllables += 1
            pos += 2
        elif word[pos] in vowels:
            syllables += 1
            pos += 1
        else:
            # Consonant or any other character: contributes nothing.
            pos += 1
    return syllables
# Manual check: print the syllable count computed for each sample word.
# The trailing comments are the author's own notes on which groups the
# heuristic counts correctly; entries (including duplicates) are kept in
# their original order so the printed output stays comparable.
test_words = ['καλημέρα', 'άνθρωπος', 'ευχαριστώ', 'δίφθογγος', 'αίμα', 'είναι', 'ούτε',
              # author's note: "special" cases start here
              'Σάββατο', 'κόκκινος', 'βλέμμα', 'ιππότης', 'κόλλα', 'υπάλληλος', 'θάρρος',
              'σύννεφο', 'Σάββας', 'γράμμα', 'μαλλί', 'παππούς', 'συλλαβή', 'άρρωστος',
              'επίρρημα', 'κόμμα', 'εννοώ', 'λάκκος', 'άμμος', 'βύσσινο', 'γλώσσα', 'θάλασσα',
              'κρεμμύδι', 'κύτταρο', 'εννιά', 'μέλισσα', 'εκκλησία', 'ελάττωμα', 'γενναίος',
              'ήλιος', 'ελιές', 'παλιού', 'ποια', 'ποιοι', 'ποιες', 'ποιους', 'άδειες',
              'άδειοι', 'δίχτυα', 'διχτυού', 'Γιάννα', 'ποιος', 'άδειασε', 'γυάλα', 'αηδόνι',
              'βόηθα', 'αϊτός', 'γάιδαρος', 'κοροϊδεύω', 'ρόιδι',  # author's note: ok up to this point
              'αδειάζω', 'αδειανός', 'άδειασμα', 'άδειος', 'μπάνιο',  # author's note: ok
              'αγάπη', 'ποίημα', 'δουλειά', 'αδειάζω', 'κάποιος', 'κρυώνω', 'ησυχία',
              'ανθόκηπος', 'άλογο', 'τσουγκράνα', 'αυτοκίνητο', 'Ιωάννα', 'ευτυχία',
              'κόκκινος', 'λαχανόκηπος', 'κινδυνεύω', 'πίεσα', 'βόλεϊ',
              'Αττική', 'Υμηττός', 'Έλληνας', 'Άννα', 'Αλβανία',  # author's note: ok
              'κορόιδο', 'κορόιδεμα', 'κοροϊδευτικός', 'κοροϊδεύω', 'κοροϊδία', 'κοροϊδίστικος',  # author's note: ok
              'μία', 'δύο', 'βίος', 'ή', 'πού', 'πώς',  # author's note: ok
              # Expected to be one syllable each. The first entry originally
              # began with U+00B5 MICRO SIGN instead of the Greek letter mu.
              'μια', 'δυο', 'για', 'γεια', 'πια', 'ποιος', 'γιος', 'νιος', 'πιες', 'πιε',
              'άδεια', 'αδειοδότηση', 'αδειοδοτικός', 'αδειοδοτώ', 'αδειοδωρόσημο', 'αδειούχος',  # author's note: all counted wrongly
              'διαβατήριο', 'Εύβοια', 'πιω']  # author's note: three more counted wrongly
for word in test_words:
    print(f"{word}: {count_greek_syllables(lower_first_if_title(word))} syllables")
# Misc | |
def is_diaeresis_correct(word):
    """Check whether the diaeresis marks in ``word`` pass a set of spelling rules.

    Returns False when:

    * the word contains more than one letter carrying a diaeresis;
    * the letter before the diaeresis is a stressed vowel;
    * the two letters before the diaeresis form an unstressed diphthong;
    * the letter before it cannot form a digraph with it anyway
      ('η'/'ω' before 'ϊ'/'ΐ', or 'η'/'ι'/'ω' before 'ϋ'/'ΰ').

    Returns True otherwise, including for words with no diaeresis at all.
    A True result only means no rule objected, not that the mark is required.
    """
    iota_forms = 'ϊΐ'
    upsilon_forms = 'ϋΰ'
    stressed_vowels = 'άέήίόύώ'
    plain_diphthongs = {'αι', 'ει', 'οι', 'υι', 'αυ', 'ευ', 'ου'}

    already_seen = False
    for pos, letter in enumerate(word):
        if letter not in iota_forms and letter not in upsilon_forms:
            continue
        if already_seen:
            return False
        already_seen = True

        # A diaeresis on the very first letter has nothing before it to test.
        if pos == 0:
            continue
        before = word[pos - 1]

        # Rule 1: a stressed vowel directly before the marked letter.
        if before in stressed_vowels:
            return False
        # Rule 2: an unstressed diphthong directly before the marked letter.
        if pos >= 2 and word[pos - 2:pos] in plain_diphthongs:
            return False
        # Rule 3: the preceding vowel never combines with this letter.
        non_combining = 'ηω' if letter in iota_forms else 'ηιω'
        if before in non_combining:
            return False

    return True
def remove_greek_accents(text):
    """Strip accent marks from capital Greek letters, keeping diaeresis marks.

    The text is decomposed (NFD), every combining mark other than the
    combining diaeresis (U+0308) that belongs to a base letter in the range
    Α..Ω is dropped, and the result is recomposed (NFC). Lowercase letters
    and non-Greek characters keep all their marks.

    Args:
        text: The string to process.

    Returns:
        The NFC-normalised string with accents removed from capital letters.
    """
    # Imported here because the module has no import block of its own.
    import unicodedata

    diaeresis = '\u0308'
    first_capital, last_capital = '\u0391', '\u03a9'  # Α .. Ω

    kept = []
    # Track the most recent non-combining character so that each mark is
    # judged against its own base letter, even when the same mark occurs
    # several times or several marks are stacked on one letter.
    base_char = ''
    for c in unicodedata.normalize('NFD', text):
        if not unicodedata.combining(c):
            base_char = c
        elif (c != diaeresis and base_char
                and first_capital <= base_char <= last_capital):
            continue  # accent on a capital letter: drop it
        kept.append(c)
    return unicodedata.normalize('NFC', ''.join(kept))
def count_greek_accents(word):
    """Return how many characters of ``word`` are accented Greek vowels.

    Only the precomposed letters listed below are recognised: lowercase and
    uppercase vowels with tonos, plus 'ΐ' and 'ΰ'.
    """
    accented = frozenset('άέήίόύώΐΰΆΈΉΊΌΎΏ')
    return sum(1 for letter in word if letter in accented)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment