import gzip
import json
import unicodedata
from collections import Counter
def validate_diaeresis_mark(word):
    diaeresis_chars = 'ϊϋΐΰ'
    vowels_with_accent = 'άέήίόύώ'
    diphthongs = {'αι', 'ει', 'οι', 'υι', 'αυ', 'ευ', 'ου'}
    diaeresis_found = False
    for i, char in enumerate(word):
        if char in diaeresis_chars:
            if diaeresis_found:
                return False
            else:
                diaeresis_found = True
            prev_char = word[i-1]
            # Rule 1: Check if the preceding vowel is stressed.
            if i >= 1 and prev_char in vowels_with_accent:
                return False
            # Rule 2: Check if there's a diphthong before the diaeresis.
            # (Accented diphthongs are caught by the previous rule.)
            if i >= 2 and word[i-2:i] in diphthongs:
                return False
            # Rule 3: Check if there's a non-diphthong in the word.
            # Only consider vowel combinations ending in ι or υ (e.g., ηυ, ιυ, ωυ, ηι, ωι).
            if i >= 1 and ((char in 'ϊΐ' and prev_char in 'ηω') or
                           (char in 'ϋΰ' and prev_char in 'ηιω')):
                return False
    # If none of the rules apply, the diaeresis might be necessary.
    return True
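
# Illustrative sanity checks (hypothetical inputs, not taken from the corpus):
# validate_diaeresis_mark('προϋπόθεση')  -> True   (diaeresis needed to break the 'ου' digraph)
# validate_diaeresis_mark('γάϊδαρος')    -> False  (rule 1: the stressed 'ά' already breaks the pair)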
def lower_first_if_title(word):
    if word.isupper():
        return word
    elif word.istitle():
        return word[0].lower() + word[1:]
    else:
        return word
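
# e.g. lower_first_if_title('Καλημέρα') -> 'καλημέρα', while all-caps words
# such as 'ΕΛΛΑΔΑ' are returned unchanged (illustrative examples).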
def remove_greek_accents_from_upper(text):
    # Normalize to decomposed form (NFD)
    nfd_form = unicodedata.normalize('NFD', text)
    # Remove combining diacritical marks from capital letters,
    # but keep diaeresis marks (U+0308)
    result = []
    for i, c in enumerate(nfd_form):
        if unicodedata.combining(c) and c != '\u0308':
            # Check if the base character is a capital letter.
            # (enumerate gives the current position; str.index would find the
            # first occurrence of the mark in the string, not this one)
            if i > 0 and 'Α' <= nfd_form[i-1] <= 'Ω':
                continue  # Skip combining marks on capital letters
        result.append(c)
    # Reconstruct the string
    return unicodedata.normalize('NFC', ''.join(result))
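
# Example (illustrative): remove_greek_accents_from_upper('Έλα στην Αθήνα')
# -> 'Ελα στην Αθήνα' — the accent is stripped from the capital 'Έ' while
# lowercase accents ('ή') are kept.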
def greek_vowels_iterator(word):
    vowels = 'αάεέηήιίοόυύωώϊϋΐΰ'
    diphthongs = {
        'αι', 'ει', 'οι', 'υι', 'αυ', 'ευ', 'ου',
        'αί', 'εί', 'οί', 'υί', 'αύ', 'εύ', 'ού',
        'αη', 'αϊ', 'οη', 'όη', 'οϊ', 'άι', 'όι', 'εϊ'
    }
    spurious_diphthongs = 'ιυ'
    spurious_diphthongs_long = {'οι', 'ει'}
    vowel_digraphs = {
        'αι', 'ει', 'οι', 'ου', 'υι',
        'αί', 'εί', 'οί', 'ού', 'υί'
    }
    qualifying_vowels = 'αοεάόέ'
    word_len = len(word)
    i = 0
    while i < word_len:
        start = i
        # Handle short spurious diphthongs ('ι', 'υ')
        if i < word_len - 1 and word[i] in spurious_diphthongs:
            # Check if the next two characters form a vowel digraph
            if i < word_len - 2 and word[i+1:i+3] in vowel_digraphs:
                i += 3
            # Check if the next character is a qualifying vowel
            elif word[i+1] in qualifying_vowels:
                i += 2
            else:
                i += 1
            yield word[start:i]
        # Handle long spurious diphthongs ('οι', 'ει')
        elif i < word_len - 2 and word[i:i+2] in spurious_diphthongs_long:
            if i < word_len - 3 and word[i+2:i+4] in vowel_digraphs:
                i += 4
            elif word[i+2] in qualifying_vowels:
                i += 3
            else:
                i += 2
            yield word[start:i]
        else:
            # Handle standard diphthongs
            if i < word_len - 1 and word[i:i+2] in diphthongs:
                i += 2
                yield word[start:i]
            # Handle single vowels
            elif word[i] in vowels:
                i += 1
                yield word[start:i]
            else:
                i += 1
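
# Illustrative splits into vowel nuclei (one item per syllable):
# list(greek_vowels_iterator('αύριο'))    -> ['αύ', 'ιο']
# list(greek_vowels_iterator('άνθρωπος')) -> ['ά', 'ω', 'ο']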
def validate_accented_word(word):
    """Validate if a Greek word is correctly accented"""
    accented_vowels = 'άέήίόύώΐΰ'
    def is_accented(vowel):
        for ch in vowel:
            if ch in accented_vowels:
                return True
        return False
    # Check if the word is a contraction by checking its beginning/ending
    is_contraction_end = word[-1] in "'’"
    is_contraction = is_contraction_end or word[0] in "'’"
    # Handle exceptions for single-syllable words
    exceptions = {'ή', 'πού', 'πώς', 'μού', 'σού', 'τού', 'τήν',
                  'τής', 'τόν', 'τό', 'μάς', 'σάς', 'τούς', 'τά'}
    vowel_components = list(greek_vowels_iterator(word))
    num_syllables = len(vowel_components)
    # If there's only one vowel, handle exceptions for single-syllable words
    if num_syllables == 1 and not is_contraction:
        return is_accented(vowel_components[0]) == (word in exceptions)  # XNOR
    # An incorrect use of the compound vowels 'αυ', 'ευ'
    if word.find('άυ') >= 0 or word.find('έυ') >= 0:
        return False
    # Initialize variables to track accents
    accented_syllable_count = 0
    second_last_accent = False
    # Iterate over the vowels in the word
    for i, vowel in enumerate(vowel_components):
        # Check if the vowel is accented
        if is_accented(vowel):
            accented_syllable_count += 1
            # If an accent is found before the last three syllables, return False
            if i < num_syllables - 3 + int(is_contraction_end):
                return False
            if i == num_syllables - 2 + int(is_contraction_end):
                second_last_accent = True
    # Check the number of accented syllables in the last three syllables
    if accented_syllable_count == 3 - int(is_contraction):
        return False  # Three accented syllables found
    # Two accented syllables found, one on the second-to-last syllable
    elif accented_syllable_count == 2 and second_last_accent:
        return False
    # Check that an accent is found in the last three syllables
    elif not is_contraction and accented_syllable_count == 0:
        return False
    else:
        return True
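
# Illustrative checks (hypothetical inputs):
# validate_accented_word('καλημέρα') -> True   (accent on the second-to-last syllable)
# validate_accented_word('καλημερα') -> False  (polysyllabic word with no accent)
# validate_accented_word('καί')      -> False  (accented monosyllable not in the exceptions)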
def is_greek_letter(char):
    return char == 'Ά' or 'Έ' <= char <= 'ώ'

def is_greek_vowel(char):
    return char in 'αάεέηήιίοόυύωώϊϋΐΰς'  # final sigma 'ς' deliberately included

def is_punctuation(char):
    return char in '!(),.:;·–—…'

def is_quotation_mark(char):
    return char in '«»'

def is_apostrophe(char):
    return char in "'’"

def is_part_of_word(char, next_char, current_word):
    if is_greek_letter(char):
        return True
    if is_apostrophe(char):
        if is_greek_letter(next_char) and not (current_word or
                                               is_greek_vowel(next_char)):
            return True  # Start of a contracted word
        if current_word and is_greek_letter(current_word[-1]) and \
                not is_greek_vowel(current_word[-1]):
            return True
    return False
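
# Illustrative behaviour of the apostrophe handling (hypothetical tokens):
# is_part_of_word("'", 'κ', '')    -> True   (apostrophe opening a contraction, e.g. "'κει")
# is_part_of_word("'", ' ', 'μ')   -> True   (apostrophe closing a contraction, e.g. "μ'")
# is_part_of_word(',', ' ', 'ναι') -> False  (punctuation ends the word)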
class TrieNode:
    def __init__(self):
        self.children = {}
        self.is_end = False
        self.word = None

class Trie:
    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        node = self.root
        for char in word:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.is_end = True
        node.word = word

    def find_longest_prefix(self, string):
        node = self.root
        last_match = None
        for i, char in enumerate(string):
            if char not in node.children:
                break
            node = node.children[char]
            if node.is_end:
                last_match = (i, node.word)
        return last_match
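
# Example (illustrative): the returned tuple holds the index of the last
# matched character, not the match length:
# t = Trie(); t.insert('π.χ.')
# t.find_longest_prefix('π.χ. αύριο') -> (3, 'π.χ.')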
greek_abbreviations = [
    'Απρ.', 'Αύγ.', 'Δεκ.', 'Δευτ.', 'Δρ.', 'Ιαν.', 'Ιούλ.', 'Ιούν.', 'Κυρ.',
    'Μάρτ.', 'Ν.Σ.', 'Πέμ.', 'Παρ.', 'Σάβ.', 'ΣτΕ.', 'ΣτΜ.', 'Τετ.', 'Τρ.',
    'Φ.', 'ά.τ.', 'ά.τ.χ.', 'ά.χ.', 'άγ.', 'άρ.', 'άρθρ.', 'έ.α.', 'έκδ.',
    'ένθ. άν(ωτ.).', 'έτ.', 'α.α.', 'α/α', 'αι.', 'ανάτ.', 'ανών.', 'αρ.',
    'αρ.φ.', 'αριθ.', 'αρχ.', 'αυτ.', 'β/θήκη', 'βιβλ.', 'βιβλγρ.', 'βλ.',
    'βλ. αν.', 'βλ. κατ.', 'δακτ.', 'δηλ.', 'διατρ.', 'εδ.', 'ειδ.', 'εικ.',
    'εικονγρ.', 'εισ.', 'εισαγ.', 'εκ.', 'εκδ.', 'εκκλ.', 'εκκλησ.', 'ελλ.',
    'ελλην.', 'εν.', 'ενικ.', 'εξ.', 'επ.', 'επιμ.', 'επόμ.', 'εφ.', 'εφημ.',
    'θρησκ.', 'κ.ά.', 'κ.α.', 'κ.εξ.', 'κ.επ.', 'κ.λ.π.', 'κ.λπ.', 'κ.ο.κ.',
    'κ.τ.λ.', 'κ.τ.τ.', 'κ.τ.ό.', 'καν.', 'κατάλ.', 'κεφ.', 'κλ.', 'κλπ.', 'κτλ.',
    'λ.', 'λ.χ.', 'λέξ.', 'λατ.', 'λατιν.', 'μ.', 'μ.Χ.', 'μ.ά.', 'μετ.',
    'μετφρ.', 'μτγ.', 'μτγν.', 'μτφ.', 'μτφρ.', 'μτχ.', 'μυθ.', 'νεοελλ.',
    'νεολ.', 'νεολατ.', 'νεολατιν.', 'νεότ.', 'ον.', 'ονομ.', 'π.Χ.', 'π.μ.',
    'π.χ.', 'πίν.', 'παρ.', 'περ.', 'πληθ.', 'πολγρ.', 'πρβ.', 'πρβλ.', 'πργρ.',
    'προφ.', 'πρότ.', 'πτ.', 'ρ.', 'ρήμ.', 'σ.', 'σ.π.', 'σεβ.', 'σελ.', 'σημ.',
    'σπ.', 'σπάν.', 'σσ.', 'στ.', 'στίχ.', 'στατ.', 'στχ.', 'συμπλ.', 'σχ.',
    'τ.', 'τ.μ.', 'τεύχ.', 'τιμ. τόμ.', 'τμ.', 'τυπ.', 'τόμ.', 'υποσ.', 'υποσημ.',
    'υποφ.', 'φ.', 'φάκ.', 'φιλολ.', 'φιλοσ.', 'χ.κ.', 'χ.σ.', 'χ.τ.', 'χ.τ.χ.',
    'χ.χ.', 'χ.ό.', 'χγφ.', 'χργρ.', 'χφ.', 'χφο.', 'χφφ.', 'χχφ.', 'ό,τι',
    'ό.π.', 'όμ.', 'όπ.π.', 'όπ.παρ.'
]
def tokenize_greek_sentence(trie, sentence, match_abbrv=False):
    tokens = []
    current_token = ""
    i = 0
    while i < len(sentence):
        # Check for abbreviations
        matched_abbrv = trie.find_longest_prefix(sentence[i:]) if match_abbrv else None
        if matched_abbrv:
            # if current_token:
            #     tokens.append(current_token)
            #     current_token = ""
            tokens.append(matched_abbrv[1])
            i += matched_abbrv[0] + 1
        else:
            char = sentence[i]
            next_char = sentence[i + 1] if i + 1 < len(sentence) else ""
            if is_part_of_word(char, next_char, current_token):
                current_token += char
            else:
                if current_token:
                    tokens.append(current_token)
                    current_token = ""
                if is_quotation_mark(char) or is_punctuation(char):
                    tokens.append(char)
                # elif not char.isspace():
                #     tokens.append(char)
            i += 1
    if current_token:
        tokens.append(current_token)
    return tokens
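
# Illustrative runs (hypothetical sentences):
# trie = Trie()
# for abbrv in greek_abbreviations:
#     trie.insert(abbrv)
# tokenize_greek_sentence(trie, 'Καλημέρα, κόσμε!')
#   -> ['Καλημέρα', ',', 'κόσμε', '!']
# tokenize_greek_sentence(trie, "π.χ. μ' αρέσει", match_abbrv=True)
#   -> ['π.χ.', "μ'", 'αρέσει']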
def process_file(filename, dictionary_file, min_occurrences=1):
    word_counter = Counter()
    misfits = set()
    not_in_dictionary = set()
    # Build the trie of abbreviations
    trie = Trie()
    for word in greek_abbreviations:
        trie.insert(word)
    # Load the dictionary
    with open(dictionary_file, 'r', encoding='utf-8') as f:
        dictionary = set(f.read().splitlines())
    # Process the gzipped file
    with gzip.open(filename, 'rt', encoding='utf-8') as file:
        for line in file:
            # Apply processing functions
            line = remove_greek_accents_from_upper(line)
            words = tokenize_greek_sentence(trie, line)
            first_word = True
            for word in words:
                word = lower_first_if_title(word)
                # if (not first_word and word.istitle()) or \
                #         not (validate_diaeresis_mark(word.lower()) and \
                #              validate_accented_word(word.lower())):
                #     # print('Validation error:', word)
                #     continue
                # if (len(word) <= 5 or len(list(greek_vowels_iterator(word))) <= 2) and \
                if word not in dictionary:
                    not_in_dictionary.add(word)
                    continue
                word_counter[word] += 1
                if first_word:
                    first_word = False
    # Handle misfits (words below the occurrence threshold)
    for word, count in word_counter.copy().items():
        if count < min_occurrences:
            misfits.add(word)
            del word_counter[word]
    # Write results
    with open('misfits.txt', 'w', encoding='utf-8') as f:
        for word in sorted(misfits):
            f.write(f"{word}\n")
    with open('not_in_dictionary.txt', 'w', encoding='utf-8') as f:
        for word in sorted(not_in_dictionary):
            f.write(f"{word}\n")
    with open('word_frequency.json', 'w', encoding='utf-8') as f:
        json.dump(dict(word_counter), f, ensure_ascii=False, indent=2)
    return word_counter, misfits, not_in_dictionary
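
# Illustrative invocation (both input files must exist; the corpus name below
# is hypothetical):
# counts, misfits, unknown = process_file('corpus.txt.gz', 'el_GR.dic', min_occurrences=50)
# Side effects: writes misfits.txt, not_in_dictionary.txt and word_frequency.json
# into the current directory.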
if __name__ == "__main__":
    input_file = "el.txt.gz"
    dictionary_file = "el_GR.dic"
    min_occurrences = 1
    word_counter, misfits, not_in_dictionary = process_file(input_file, dictionary_file,
                                                            min_occurrences)
    print(f"Processed {len(word_counter)} unique words")
    print(f"Found {len(misfits)} words with fewer than {min_occurrences} occurrences")
    print(f"Found {len(not_in_dictionary)} words not in the dictionary")