planetis-m · September 25, 2024 17:16
diff --git a/count_syllables.py b/count_syllables.py
 def lower_first_if_title(word):
     """Lower-case the first character of a title-cased word.

     A word that is entirely upper case, or that ``str.istitle`` does not
     consider title-cased, is returned unchanged.
     """
     needs_lowering = word.istitle() and not word.isupper()
     if not needs_lowering:
         return word
     head, tail = word[0], word[1:]
     return head.lower() + tail

 def count_greek_syllables(word):
     """Count syllables in a Greek word by scanning its vowel groups.

     The word is walked left to right.  At each position one of the
     following is consumed and counted as a single syllable:

     * a glide ('ι' or 'υ', or the pairs 'οι' / 'ει') that is followed by
       at least one more character, together with a vowel digraph or a
       single qualifying vowel directly after it, if present;
     * a two-letter diphthong;
     * a single vowel.

     Any other character is skipped without being counted.  Only the
     lower-case letters listed in the tables below are recognised, so
     upper-case letters contribute nothing to the result.
     """
     single_vowels = set('αάεέηήιίοόυύωώϊϋΐΰ')
     plain_diphthongs = {
         'αι', 'ει', 'οι', 'υι', 'αυ', 'ευ', 'ου',
         'αί', 'εί', 'οί', 'υί', 'αύ', 'εύ', 'ού',
         'αη', 'αϊ', 'οη', 'όη', 'οϊ', 'άι', 'όι', 'εϊ',
     }
     short_glides = set('ιυ')
     long_glides = {'οι', 'ει'}
     digraphs = {
         'αι', 'ει', 'οι', 'ου', 'υι',
         'αί', 'εί', 'οί', 'ού', 'υί',
     }
     absorbed_vowels = set('αοεάόέ')

     total = 0
     pos = 0
     end = len(word)
     while pos < end:
         letter = word[pos]
         pair = word[pos:pos + 2]

         # A glide only applies when at least one character follows it.
         if letter in short_glides and pos + 1 < end:
             glide_len = 1
         elif pair in long_glides and pos + 2 < end:
             glide_len = 2
         else:
             glide_len = 0

         if glide_len:
             after = pos + glide_len
             # The glide merges with a following digraph or qualifying
             # vowel into one syllable; otherwise it stands on its own.
             if word[after:after + 2] in digraphs:
                 pos = after + 2
             elif word[after] in absorbed_vowels:
                 pos = after + 1
             else:
                 pos = after
             total += 1
         elif pair in plain_diphthongs:
             total += 1
             pos += 2
         else:
             if letter in single_vowels:
                 total += 1
             pos += 1
     return total

 # Ad-hoc check: print the syllable count computed for each sample word.
 # The trailing notes on some rows are the author's own status annotations.
 test_words = [
     'καλημέρα', 'άνθρωπος', 'ευχαριστώ', 'δίφθογγος', 'αίμα', 'είναι', 'ούτε',  # start of special
     'Σάββατο', 'κόκκινος', 'βλέμμα', 'ιππότης', 'κόλλα', 'υπάλληλος', 'θάρρος',
     'σύννεφο', 'Σάββας', 'γράμμα', 'μαλλί', 'παππούς', 'συλλαβή', 'άρρωστος',
     'επίρρημα', 'κόμμα', 'εννοώ', 'λάκκος', 'άμμος', 'βύσσινο', 'γλώσσα', 'θάλασσα',
     'κρεμμύδι', 'κύτταρο', 'εννιά', 'μέλισσα', 'εκκλησία', 'ελάττωμα', 'γενναίος',
     'ήλιος', 'ελιές', 'παλιού', 'ποια', 'ποιοι', 'ποιες', 'ποιους', 'άδειες',
     'άδειοι', 'δίχτυα', 'διχτυού', 'Γιάννα', 'ποιος', 'άδειασε', 'γυάλα', 'αηδόνι',
     'βόηθα', 'αϊτός', 'γάιδαρος', 'κοροϊδεύω', 'ρόιδι',  # up to this point ok
     'αδειάζω', 'αδειανός', 'άδειασμα', 'άδειος', 'μπάνιο',  # ok
     'αγάπη', 'ποίημα', 'δουλειά', 'αδειάζω', 'κάποιος', 'κρυώνω', 'ησυχία',
     'ανθόκηπος', 'άλογο', 'τσουγκράνα', 'αυτοκίνητο', 'Ιωάννα', 'ευτυχία',
     'κόκκινος', 'λαχανόκηπος', 'κινδυνεύω', 'πίεσα', 'βόλεϊ',
     'Αττική', 'Υμηττός', 'Έλληνας', 'Άννα', 'Αλβανία',  # ok
     'κορόιδο', 'κορόιδεμα', 'κοροϊδευτικός', 'κοροϊδεύω', 'κοροϊδία', 'κοροϊδίστικος',  # ok
     'μία', 'δύο', 'βίος', 'ή', 'πού', 'πώς',  # ok
     'µια', 'δυο', 'για', 'γεια', 'πια', 'ποιος', 'γιος', 'νιος', 'πιες', 'πιε',  # one syllable
     'άδεια', 'αδειοδότηση', 'αδειοδοτικός', 'αδειοδοτώ', 'αδειοδωρόσημο', 'αδειούχος',  # all wrong
     'διαβατήριο', 'Εύβοια', 'πιω',  # +3 wrong
 ]

 for word in test_words:
     # A leading capital is lowered first; the word is printed as written.
     syllables = count_greek_syllables(lower_first_if_title(word))
     print(f"{word}: {syllables} syllables")

 # Misc

 def is_diaeresis_correct(word):
     """Check whether the diaeresis in ``word`` passes a set of spelling rules.

     Returns ``False`` when any of the following holds:

     * the word contains more than one diaeresis character;
     * the character right before the diaeresis is an accented vowel;
     * the two characters right before the diaeresis form a diphthong;
     * the preceding vowel never forms a diphthong with the marked letter
       ('η'/'ω' before ι, or 'η'/'ι'/'ω' before υ).

     Returns ``True`` otherwise, including when the word has no diaeresis
     at all.
     """
     marked_letters = 'ϊϋΐΰ'
     stressed_vowels = 'άέήίόύώ'
     two_letter_diphthongs = {'αι', 'ει', 'οι', 'υι', 'αυ', 'ευ', 'ου'}

     positions = [idx for idx, ch in enumerate(word) if ch in marked_letters]
     if not positions:
         return True
     if len(positions) > 1:
         return False

     at = positions[0]
     if at == 0:
         # Nothing precedes the diaeresis, so no rule can reject it.
         return True

     marked = word[at]
     before = word[at - 1]

     # Rule 1: the preceding vowel carries the stress.
     if before in stressed_vowels:
         return False
     # Rule 2: a diphthong precedes the diaeresis.
     # (Accented diphthongs are caught by the previous rule.)
     if at >= 2 and word[at - 2:at] in two_letter_diphthongs:
         return False
     # Rule 3: combinations ending in ι or υ that are never diphthongs
     # (e.g. ηυ, ιυ, ωυ, ηι, ωι).
     if marked in 'ϊΐ':
         return before not in 'ηω'
     return before not in 'ηιω'

 def remove_greek_accents(text):
     """Strip accents from Greek capital letters, keeping diaeresis marks.

     ``text`` is decomposed (NFD) and every combining mark other than the
     combining diaeresis (U+0308) is dropped when the letter it belongs to
     lies in the range 'Α'..'Ω'.  Marks on any other base character are
     kept.  The result is recomposed (NFC) before being returned.

     The base letter of a mark is the closest preceding non-combining
     character, so each mark is judged against its own letter even when the
     same mark occurs earlier in the text or follows another mark.
     """
     # Function-scope import: no module-level import is visible for this name.
     import unicodedata

     diaeresis = '\u0308'
     kept = []
     base_char = None  # most recent non-combining character seen so far
     for c in unicodedata.normalize('NFD', text):
         if not unicodedata.combining(c):
             base_char = c
         elif (c != diaeresis and base_char is not None
                 and 'Α' <= base_char <= 'Ω'):
             continue  # Skip combining marks for capital letters
         kept.append(c)
     # Reconstruct the string
     return unicodedata.normalize('NFC', ''.join(kept))

 def count_greek_accents(word):
     """Return how many characters of ``word`` are accented Greek vowels.

     Both lower-case and upper-case accented vowels are counted, as are the
     lower-case letters that combine a diaeresis with an accent.
     """
     accented = 'άέήίόύώΐΰΆΈΉΊΌΎΏ'
     return sum(1 for letter in word if letter in accented)
	def lower_first_if_title(word):
	    """Lower-case the first character of a title-cased word.

	    A word that is entirely upper case, or that ``str.istitle`` does not
	    consider title-cased, is returned unchanged.
	    """
	    if word.isupper():
	        return word
	    if word.istitle():
	        return word[0].lower() + word[1:]
	    return word

	def count_greek_syllables(word):
	    """Count syllables in a Greek word by scanning its vowel groups.

	    The word is walked left to right.  A glide ('ι', 'υ', 'οι' or 'ει')
	    that is followed by at least one more character is merged with a
	    directly following vowel digraph or qualifying vowel into a single
	    syllable.  Otherwise a two-letter diphthong or a single vowel counts
	    as one syllable, and any other character is skipped.

	    Only the lower-case letters listed in the tables below are
	    recognised, so upper-case letters contribute nothing to the result.
	    """
	    vowels = 'αάεέηήιίοόυύωώϊϋΐΰ'
	    diphthongs = {
	        'αι', 'ει', 'οι', 'υι', 'αυ', 'ευ', 'ου',
	        'αί', 'εί', 'οί', 'υί', 'αύ', 'εύ', 'ού',
	        'αη', 'αϊ', 'οη', 'όη', 'οϊ', 'άι', 'όι', 'εϊ',
	    }
	    spurious_diphthongs = 'ιυ'
	    spurious_diphthongs_long = {'οι', 'ει'}
	    vowel_digraphs = {
	        'αι', 'ει', 'οι', 'ου', 'υι',
	        'αί', 'εί', 'οί', 'ού', 'υί',
	    }
	    qualifying_vowels = 'αοεάόέ'
	    word_len = len(word)
	    syllable_count = 0
	    i = 0

	    while i < word_len:
	        # Handle short spurious diphthongs ('ι', 'υ')
	        if i < word_len - 1 and word[i] in spurious_diphthongs:
	            # Check if the next two characters form a vowel digraph
	            if i < word_len - 2 and word[i+1:i+3] in vowel_digraphs:
	                i += 3
	            # Check if the next character is a qualifying vowel
	            elif word[i+1] in qualifying_vowels:
	                i += 2
	            else:
	                i += 1
	            syllable_count += 1
	        # Handle long spurious diphthongs ('οι', 'ει')
	        elif i < word_len - 2 and word[i:i+2] in spurious_diphthongs_long:
	            if i < word_len - 3 and word[i+2:i+4] in vowel_digraphs:
	                i += 4
	            elif word[i+2] in qualifying_vowels:
	                i += 3
	            else:
	                i += 2
	            syllable_count += 1
	        # Handle standard diphthongs
	        elif i < word_len - 1 and word[i:i+2] in diphthongs:
	            syllable_count += 1
	            i += 2
	        # Handle single vowels
	        elif word[i] in vowels:
	            syllable_count += 1
	            i += 1
	        else:
	            i += 1
	    return syllable_count

	# Test the function: print the syllable count computed for each sample word.
	# The trailing notes on some rows are the author's own status annotations.
	test_words = ['καλημέρα', 'άνθρωπος', 'ευχαριστώ', 'δίφθογγος', 'αίμα', 'είναι', 'ούτε', # start of special
	    'Σάββατο', 'κόκκινος', 'βλέμμα', 'ιππότης', 'κόλλα', 'υπάλληλος', 'θάρρος',
	    'σύννεφο', 'Σάββας', 'γράμμα', 'μαλλί', 'παππούς', 'συλλαβή', 'άρρωστος',
	    'επίρρημα', 'κόμμα', 'εννοώ', 'λάκκος', 'άμμος', 'βύσσινο', 'γλώσσα', 'θάλασσα',
	    'κρεμμύδι', 'κύτταρο', 'εννιά', 'μέλισσα', 'εκκλησία', 'ελάττωμα', 'γενναίος',
	    'ήλιος', 'ελιές', 'παλιού', 'ποια', 'ποιοι', 'ποιες', 'ποιους', 'άδειες',
	    'άδειοι', 'δίχτυα', 'διχτυού', 'Γιάννα', 'ποιος', 'άδειασε', 'γυάλα', 'αηδόνι',
	    'βόηθα', 'αϊτός', 'γάιδαρος', 'κοροϊδεύω', 'ρόιδι', # up to this point ok
	    'αδειάζω', 'αδειανός', 'άδειασμα', 'άδειος', 'μπάνιο', # ok
	    'αγάπη', 'ποίημα', 'δουλειά', 'αδειάζω', 'κάποιος', 'κρυώνω', 'ησυχία',
	    'ανθόκηπος', 'άλογο', 'τσουγκράνα', 'αυτοκίνητο', 'Ιωάννα', 'ευτυχία',
	    'κόκκινος', 'λαχανόκηπος', 'κινδυνεύω', 'πίεσα', 'βόλεϊ',
	    'Αττική', 'Υμηττός', 'Έλληνας', 'Άννα', 'Αλβανία', # ok
	    'κορόιδο', 'κορόιδεμα', 'κοροϊδευτικός', 'κοροϊδεύω', 'κοροϊδία', 'κοροϊδίστικος', # ok
	    'μία', 'δύο', 'βίος', 'ή', 'πού', 'πώς', # ok
	    'µια', 'δυο', 'για', 'γεια', 'πια', 'ποιος', 'γιος', 'νιος', 'πιες', 'πιε', # one syllable
	    'άδεια', 'αδειοδότηση', 'αδειοδοτικός', 'αδειοδοτώ', 'αδειοδωρόσημο', 'αδειούχος', # all wrong
	    'διαβατήριο', 'Εύβοια', 'πιω'] # +3 wrong

	for word in test_words:
	    # A leading capital is lowered first; the word is printed as written.
	    print(f"{word}: {count_greek_syllables(lower_first_if_title(word))} syllables")

	# Misc

	def is_diaeresis_correct(word):
	    """Check whether the diaeresis in ``word`` passes a set of spelling rules.

	    Returns ``False`` when the word contains more than one diaeresis
	    character, or when the single diaeresis breaks one of the numbered
	    rules below.  Returns ``True`` otherwise, including when the word has
	    no diaeresis at all.
	    """
	    diaeresis_chars = 'ϊϋΐΰ'
	    vowels_with_accent = 'άέήίόύώ'
	    diphthongs = {'αι', 'ει', 'οι', 'υι', 'αυ', 'ευ', 'ου'}

	    diaeresis_found = False
	    for i, char in enumerate(word):
	        if char not in diaeresis_chars:
	            continue
	        if diaeresis_found:
	            return False
	        diaeresis_found = True
	        if i == 0:
	            # Nothing precedes the diaeresis, so no rule can reject it.
	            continue
	        prev_char = word[i-1]
	        # Rule 1: Check if the preceding vowel is stressed.
	        if prev_char in vowels_with_accent:
	            return False
	        # Rule 2: Check if there's a diphthong before the diaeresis.
	        # (Accented diphthongs are caught by the previous rule.)
	        if i >= 2 and word[i-2:i] in diphthongs:
	            return False
	        # Rule 3: Check if there's a non-diphthong in the word.
	        # Only consider vowel combinations ending in ι or υ (e.g., ηυ, ιυ, ωυ, ηι, ωι).
	        if ((char in 'ϊΐ' and prev_char in 'ηω') or
	                (char in 'ϋΰ' and prev_char in 'ηιω')):
	            return False
	    # If none of the rules apply, diaeresis might be necessary.
	    return True

	def remove_greek_accents(text):
	    """Strip accents from Greek capital letters, keeping diaeresis marks.

	    ``text`` is decomposed (NFD) and every combining mark other than the
	    combining diaeresis (U+0308) is dropped when the letter it belongs to
	    lies in the range 'Α'..'Ω'.  Marks on any other base character are
	    kept.  The result is recomposed (NFC) before being returned.

	    The base letter of a mark is the closest preceding non-combining
	    character, so each mark is judged against its own letter even when
	    the same mark occurs earlier in the text or follows another mark.
	    """
	    # Function-scope import: no module-level import is visible for this name.
	    import unicodedata

	    # Normalize to decomposed form (NFD)
	    nfd_form = unicodedata.normalize('NFD', text)
	    # Remove combining diacritical marks from capital letters, but keep dieresis marks
	    result = []
	    base_char = None  # most recent non-combining character seen so far
	    for c in nfd_form:
	        if not unicodedata.combining(c):
	            base_char = c
	        elif (c != '\u0308' and base_char is not None
	                and 'Α' <= base_char <= 'Ω'):
	            continue  # Skip combining marks for capital letters
	        result.append(c)
	    # Reconstruct the string
	    return unicodedata.normalize('NFC', ''.join(result))

	def count_greek_accents(word):
	    """Return how many characters of ``word`` are accented Greek vowels.

	    Both lower-case and upper-case accented vowels are counted, as are the
	    lower-case letters that combine a diaeresis with an accent.
	    """
	    accented_vowels = 'άέήίόύώΐΰΆΈΉΊΌΎΏ'
	    accent_count = 0
	    for char in word:
	        if char in accented_vowels:
	            accent_count += 1
	    return accent_count