# Source: GitHub gist planetis-m/d52aff3c3739757e348f7cb71782fc14
# Author: @planetis-m. Created October 3, 2024 12:59.
import gzip
import json
import unicodedata
from collections import Counter
def validate_diaeresis_mark(word):
    """Return True when the diaeresis usage in ``word`` passes all checks.

    The word is rejected (False) when:

    * it contains more than one diaeresis-bearing vowel;
    * the letter right before the diaeresis is an accented vowel;
    * the two letters right before the diaeresis form a diphthong;
    * the preceding letter could not form a digraph with the marked vowel
      anyway ('η'/'ω' before 'ϊ'/'ΐ', or 'η'/'ι'/'ω' before 'ϋ'/'ΰ').

    A word without any diaeresis, or with one in the first position, is
    accepted.
    """
    marked_vowels = 'ϊϋΐΰ'
    stressed_vowels = 'άέήίόύώ'
    digraphs = {'αι', 'ει', 'οι', 'υι', 'αυ', 'ευ', 'ου'}

    already_seen = False
    for pos, letter in enumerate(word):
        if letter not in marked_vowels:
            continue
        if already_seen:
            return False
        already_seen = True

        # Every remaining rule looks at the preceding letter(s).
        if pos == 0:
            continue
        before = word[pos - 1]

        # Rule 1: the preceding vowel is stressed.
        if before in stressed_vowels:
            return False

        # Rule 2: a diphthong sits right before the diaeresis.
        # (Accented diphthongs are already caught by rule 1.)
        if pos >= 2 and word[pos - 2:pos] in digraphs:
            return False

        # Rule 3: the preceding vowel never combines with ι / υ
        # (e.g. ηυ, ιυ, ωυ, ηι, ωι).
        if letter in 'ϊΐ':
            non_combining = 'ηω'
        else:
            non_combining = 'ηιω'
        if before in non_combining:
            return False

    # No rule fired: the diaeresis (if any) might be necessary.
    return True
def lower_first_if_title(word):
    """Lower-case the first letter of a title-cased word.

    All-uppercase words (including a single capital letter) and words that
    are not title-cased are returned unchanged.
    """
    title_but_not_all_caps = not word.isupper() and word.istitle()
    if title_but_not_all_caps:
        head, tail = word[0], word[1:]
        return head.lower() + tail
    return word
def remove_greek_accents_from_upper(text):
    """Strip accent marks from Greek capital letters, keeping the diaeresis.

    The text is decomposed (NFD), every combining mark other than the
    combining diaeresis (U+0308) that belongs to a capital letter in the
    range Α..Ω is dropped, and the result is recomposed (NFC).  Marks on
    lowercase letters are left untouched.

    The base letter is tracked while iterating.  Looking it up with
    ``str.index`` would find the *first* occurrence of the mark in the whole
    string and therefore inspect the wrong letter whenever the same mark
    appears earlier in the text.  Tracking the base letter also handles a
    tonos stacked after a diaeresis on the same capital.

    Args:
        text: Any string.

    Returns:
        The NFC-normalised string with accents removed from Greek capitals.
    """
    combining_diaeresis = '\u0308'
    capital_alpha = '\u0391'  # GREEK CAPITAL LETTER ALPHA
    capital_omega = '\u03a9'  # GREEK CAPITAL LETTER OMEGA

    kept = []
    base_char = ''  # most recent non-combining character seen so far
    for c in unicodedata.normalize('NFD', text):
        if not unicodedata.combining(c):
            base_char = c
        elif c != combining_diaeresis and capital_alpha <= base_char <= capital_omega:
            continue  # skip combining marks attached to capital letters
        kept.append(c)

    # Reconstruct the string in composed form.
    return unicodedata.normalize('NFC', ''.join(kept))
def greek_vowels_iterator(word):
    """Yield the vowel groups of ``word`` from left to right.

    Each yielded item is a substring of ``word`` made of one or more vowel
    letters that the scanner treats as a single unit; non-vowel characters
    are skipped.  Groups are recognised in this priority order at every
    position:

    1. 'ι' or 'υ' followed by at least one more character: taken together
       with a following vowel digraph, or with a following 'α/ο/ε'
       (plain or accented), otherwise alone.
    2. 'οι' or 'ει' followed by at least one more character: extended the
       same way as in (1).
    3. A standard two-letter diphthong.
    4. A single vowel.
    """
    single_vowels = 'αάεέηήιίοόυύωώϊϋΐΰ'
    standard_diphthongs = {
        'αι', 'ει', 'οι', 'υι', 'αυ', 'ευ', 'ου',
        'αί', 'εί', 'οί', 'υί', 'αύ', 'εύ', 'ού',
        'αη', 'αϊ', 'οη', 'όη', 'οϊ', 'άι', 'όι', 'εϊ'
    }
    short_glides = 'ιυ'
    long_glides = {'οι', 'ει'}
    digraphs = {
        'αι', 'ει', 'οι', 'ου', 'υι',
        'αί', 'εί', 'οί', 'ού', 'υί'
    }
    glide_targets = 'αοεάόέ'

    total = len(word)
    pos = 0
    while pos < total:
        left = total - pos

        # A glide prefix is only considered when something follows it.
        if left >= 2 and word[pos] in short_glides:
            glide_len = 1
        elif left >= 3 and word[pos:pos + 2] in long_glides:
            glide_len = 2
        else:
            glide_len = 0

        if glide_len:
            after = pos + glide_len
            # A slice truncated by the end of the word is shorter than two
            # characters and therefore can never be a member of ``digraphs``.
            if word[after:after + 2] in digraphs:
                size = glide_len + 2
            elif word[after] in glide_targets:
                size = glide_len + 1
            else:
                size = glide_len
        elif word[pos:pos + 2] in standard_diphthongs:
            size = 2
        elif word[pos] in single_vowels:
            size = 1
        else:
            # Not a vowel: skip it without yielding anything.
            pos += 1
            continue

        yield word[pos:pos + size]
        pos += size
def validate_accented_word(word):
    """Check whether the stress accents of a Greek word look valid.

    Vowel groups produced by ``greek_vowels_iterator`` are counted as
    syllables.  The rules applied are:

    * A one-syllable word that is not a contraction must be unaccented,
      unless it is one of the listed exceptions, which must be accented.
    * 'άυ' / 'έυ' are rejected outright.
    * No accent may appear before the last three syllables (the window is
      shifted by one when the word ends with an apostrophe).
    * Too many accents, or two accents with one on the second-to-last
      syllable, are rejected.
    * A non-contracted word with no accent at all is rejected.

    Args:
        word: The word to check (callers are expected to pass lowercase).

    Returns:
        True if the accentuation passes every rule, False otherwise.
        An empty string returns False instead of raising ``IndexError``.
    """
    if not word:
        return False

    accented_vowels = 'άέήίόύώΐΰ'
    apostrophes = "'’"

    def is_accented(vowel):
        return any(ch in accented_vowels for ch in vowel)

    # Check if the word is a contraction by checking its beginning/ending.
    is_contraction_end = word[-1] in apostrophes
    is_contraction = is_contraction_end or word[0] in apostrophes

    # Exceptions for single-syllable words.  The 'mu' entries are spelled
    # with GREEK SMALL LETTER MU (U+03BC); the look-alike MICRO SIGN (U+00B5)
    # spellings are kept as well so previously accepted input still passes.
    exceptions = {'ή', 'πού', 'πώς', '\u03bcού', 'σού', 'τού', 'τήν',
                  'τής', 'τόν', 'τό', '\u03bcάς', 'σάς', 'τούς', 'τά',
                  '\u00b5ού', '\u00b5άς'}

    vowel_components = list(greek_vowels_iterator(word))
    num_syllables = len(vowel_components)

    # Single syllable: accented if and only if the word is an exception.
    if num_syllables == 1 and not is_contraction:
        return is_accented(vowel_components[0]) == (word in exceptions)

    # An incorrect use of the compound vowels 'αυ', 'ευ'.
    if 'άυ' in word or 'έυ' in word:
        return False

    accented_syllable_count = 0
    second_last_accent = False
    shift = int(is_contraction_end)
    for i, vowel in enumerate(vowel_components):
        if not is_accented(vowel):
            continue
        accented_syllable_count += 1
        # An accent before the last three syllables is never valid.
        if i < num_syllables - 3 + shift:
            return False
        if i == num_syllables - 2 + shift:
            second_last_accent = True

    # Three accented syllables found (two are enough for a contraction).
    if accented_syllable_count == 3 - int(is_contraction):
        return False
    # Two accented syllables, one of them on the second-to-last syllable.
    if accented_syllable_count == 2 and second_last_accent:
        return False
    # No accent at all in a word that is not a contraction.
    if not is_contraction and accented_syllable_count == 0:
        return False
    return True
def is_greek_letter(char):
    """Return True if ``char`` is 'Ά' or sorts between 'Έ' and 'ώ' inclusive."""
    if char == 'Ά':
        return True
    return 'Έ' <= char <= 'ώ'
def is_greek_vowel(char):
    """Return True if ``char`` occurs in the lowercase vowel list.

    The final sigma 'ς' is deliberately included in the list, so it is
    treated like a vowel by callers of this predicate.
    """
    vowels_and_final_sigma = 'αάεέηήιίοόυύωώϊϋΐΰς'  # + 'ς'
    return char in vowels_and_final_sigma
def is_punctuation(char):
    """Return True if ``char`` is one of the punctuation marks emitted as tokens."""
    punctuation_marks = '!(),.:;·–—…'
    return char in punctuation_marks
def is_quotation_mark(char):
    """Return True if ``char`` is an opening or closing guillemet."""
    guillemets = '«»'
    return char in guillemets
def is_apostrophe(char):
    """Return True if ``char`` is a straight or a typographic apostrophe."""
    apostrophes = "'’"
    return char in apostrophes
def is_part_of_word(char, next_char, current_word):
    """Decide whether ``char`` belongs to the token being built.

    Args:
        char: The character under inspection.
        next_char: The character after it ("" at the end of the input).
        current_word: The token accumulated so far (may be empty).

    Returns:
        True for any Greek letter.  True for an apostrophe that either
        starts a contracted word (no token in progress and the next
        character is a Greek letter that ``is_greek_vowel`` rejects) or
        follows a token whose last character is a Greek letter that
        ``is_greek_vowel`` rejects.  False for everything else.
    """
    if is_greek_letter(char):
        return True
    if not is_apostrophe(char):
        return False

    # Apostrophe opening a contracted word.
    opens_contraction = (
        is_greek_letter(next_char)
        and not current_word
        and not is_greek_vowel(next_char)
    )
    if opens_contraction:
        return True

    # Apostrophe closing a token.
    if not current_word:
        return False
    last_char = current_word[-1]
    return is_greek_letter(last_char) and not is_greek_vowel(last_char)
class TrieNode:
    """A single node of the prefix tree."""

    def __init__(self):
        # children: maps one character to the next node.
        # is_end:   True when an inserted word ends at this node.
        # word:     the inserted word ending here, or None.
        self.children, self.is_end, self.word = {}, False, None
class Trie:
    """Prefix tree of words supporting longest-prefix lookup."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        """Add ``word`` to the trie, creating missing nodes along its path."""
        current = self.root
        for symbol in word:
            following = current.children.get(symbol)
            if following is None:
                following = TrieNode()
                current.children[symbol] = following
            current = following
        current.word = word
        current.is_end = True

    def find_longest_prefix(self, string):
        """Find the longest inserted word that ``string`` starts with.

        Returns:
            A tuple ``(index, word)`` where ``index`` is the position in
            ``string`` of the matched word's last character, or None when
            no inserted word is a prefix of ``string``.
        """
        best = None
        current = self.root
        for position, symbol in enumerate(string):
            current = current.children.get(symbol)
            if current is None:
                break
            if current.is_end:
                best = (position, current.word)
        return best
# Greek abbreviations that process_file() inserts into a Trie so that
# tokenize_greek_sentence() can optionally match them as single tokens.
# Most entries end with a period; a few contain spaces, a slash or a comma
# (e.g. 'βλ. αν.', 'α/α', 'ό,τι').
greek_abbreviations = [
    'Απρ.', 'Αύγ.', 'Δεκ.', 'Δευτ.', 'Δρ.', 'Ιαν.', 'Ιούλ.', 'Ιούν.', 'Κυρ.',
    'Μάρτ.', 'Ν.Σ.', 'Πέμ.', 'Παρ.', 'Σάβ.', 'ΣτΕ.', 'ΣτΜ.', 'Τετ.', 'Τρ.',
    'Φ.', 'ά.τ.', 'ά.τ.χ.', 'ά.χ.', 'άγ.', 'άρ.', 'άρθρ.', 'έ.α.', 'έκδ.',
    'ένθ. άν(ωτ.).', 'έτ.', 'α.α.', 'α/α', 'αι.', 'ανάτ.', 'ανών.', 'αρ.',
    'αρ.φ.', 'αριθ.', 'αρχ.', 'αυτ.', 'β/θήκη', 'βιβλ.', 'βιβλγρ.', 'βλ.',
    'βλ. αν.', 'βλ. κατ.', 'δακτ.', 'δηλ.', 'διατρ.', 'εδ.', 'ειδ.', 'εικ.',
    'εικονγρ.', 'εισ.', 'εισαγ.', 'εκ.', 'εκδ.', 'εκκλ.', 'εκκλησ.', 'ελλ.',
    'ελλην.', 'εν.', 'ενικ.', 'εξ.', 'επ.', 'επιμ.', 'επόμ.', 'εφ.', 'εφημ.',
    'θρησκ.', 'κ.ά.', 'κ.α.', 'κ.εξ.', 'κ.επ.', 'κ.λ.π.', 'κ.λπ.', 'κ.ο.κ.',
    'κ.τ.λ.', 'κ.τ.τ.', 'κ.τ.ό.', 'καν.', 'κατάλ.', 'κεφ.', 'κλ.', 'κλπ.', 'κτλ.',
    'λ.', 'λ.χ.', 'λέξ.', 'λατ.', 'λατιν.', 'μ.', 'μ.Χ.', 'μ.ά.', 'μετ.',
    'μετφρ.', 'μτγ.', 'μτγν.', 'μτφ.', 'μτφρ.', 'μτχ.', 'μυθ.', 'νεοελλ.',
    'νεολ.', 'νεολατ.', 'νεολατιν.', 'νεότ.', 'ον.', 'ονομ.', 'π.Χ.', 'π.μ.',
    'π.χ.', 'πίν.', 'παρ.', 'περ.', 'πληθ.', 'πολγρ.', 'πρβ.', 'πρβλ.', 'πργρ.',
    'προφ.', 'πρότ.', 'πτ.', 'ρ.', 'ρήμ.', 'σ.', 'σ.π.', 'σεβ.', 'σελ.', 'σημ.',
    'σπ.', 'σπάν.', 'σσ.', 'στ.', 'στίχ.', 'στατ.', 'στχ.', 'συμπλ.', 'σχ.',
    'τ.', 'τ.μ.', 'τεύχ.', 'τιμ. τόμ.', 'τμ.', 'τυπ.', 'τόμ.', 'υποσ.', 'υποσημ.',
    'υποφ.', 'φ.', 'φάκ.', 'φιλολ.', 'φιλοσ.', 'χ.κ.', 'χ.σ.', 'χ.τ.', 'χ.τ.χ.',
    'χ.χ.', 'χ.ό.', 'χγφ.', 'χργρ.', 'χφ.', 'χφο.', 'χφφ.', 'χχφ.', 'ό,τι',
    'ό.π.', 'όμ.', 'όπ.π.', 'όπ.παρ.'
]
def tokenize_greek_sentence(trie, sentence, match_abbrv=False):
    """Split ``sentence`` into word, abbreviation and punctuation tokens.

    Characters accepted by ``is_part_of_word`` are accumulated into word
    tokens.  Quotation marks and punctuation become one-character tokens.
    Every other character (whitespace, digits, Latin letters, ...) only
    terminates the current word and is dropped.

    Abbreviations are looked up only at a token boundary, i.e. when no word
    is being accumulated.  Matching them in the middle of a word would emit
    the abbreviation before the pending word fragment and glue that fragment
    to whatever letters follow the abbreviation.

    Args:
        trie: Object exposing ``find_longest_prefix(string)`` that returns
            ``(index_of_last_matched_char, word)`` or None.  Only used when
            ``match_abbrv`` is true.
        sentence: The text to tokenize.
        match_abbrv: When true, abbreviations stored in ``trie`` are emitted
            as single tokens.

    Returns:
        The list of tokens in order of appearance.
    """
    tokens = []
    current_token = ""
    length = len(sentence)
    i = 0
    while i < length:
        # Check for abbreviations, but only where a new token may start.
        if match_abbrv and not current_token:
            matched_abbrv = trie.find_longest_prefix(sentence[i:])
            if matched_abbrv:
                last_index, abbreviation = matched_abbrv
                tokens.append(abbreviation)
                i += last_index + 1
                continue

        char = sentence[i]
        next_char = sentence[i + 1] if i + 1 < length else ""
        if is_part_of_word(char, next_char, current_token):
            current_token += char
        else:
            # A non-word character closes the token in progress.
            if current_token:
                tokens.append(current_token)
                current_token = ""
            if is_quotation_mark(char) or is_punctuation(char):
                tokens.append(char)
        i += 1

    if current_token:
        tokens.append(current_token)
    return tokens
def process_file(filename, dictionary_file, min_occurrences=1):
    """Count dictionary words in a gzipped text file and write three reports.

    Each line of the corpus has accents removed from capital letters, is
    tokenized (abbreviation matching is left off) and every token has a
    title-cased first letter lowered.  Tokens found verbatim among the lines
    of ``dictionary_file`` are counted; all others are collected separately.
    The diaeresis/accent validation step is currently disabled.

    NOTE(review): dictionary lines are used exactly as read.  If the file is
    a Hunspell-style ``.dic`` (a count line plus ``word/FLAGS`` entries),
    flagged entries will never match a token - confirm the expected format.

    Args:
        filename: Path to the gzip-compressed UTF-8 corpus.
        dictionary_file: Path to a UTF-8 file with one entry per line.
        min_occurrences: Words counted fewer times than this are moved out
            of the frequency table into the misfits set.

    Returns:
        ``(word_counter, misfits, not_in_dictionary)``.

    Side effects:
        Writes ``misfits.txt``, ``not_in_dictionary.txt`` and
        ``word_frequency.json`` into the current working directory.
    """
    # Build the abbreviation trie.
    abbreviation_trie = Trie()
    for abbreviation in greek_abbreviations:
        abbreviation_trie.insert(abbreviation)

    # Load the dictionary.
    with open(dictionary_file, 'r', encoding='utf-8') as dict_handle:
        dictionary = set(dict_handle.read().splitlines())

    # Process the gzipped corpus line by line.
    word_counter = Counter()
    not_in_dictionary = set()
    with gzip.open(filename, 'rt', encoding='utf-8') as corpus:
        for raw_line in corpus:
            cleaned_line = remove_greek_accents_from_upper(raw_line)
            for token in tokenize_greek_sentence(abbreviation_trie, cleaned_line):
                token = lower_first_if_title(token)
                if token in dictionary:
                    word_counter[token] += 1
                else:
                    not_in_dictionary.add(token)

    # Move words below the threshold out of the frequency table.
    misfits = {
        token for token, seen in word_counter.items() if seen < min_occurrences
    }
    for token in misfits:
        del word_counter[token]

    # Write results.
    with open('misfits.txt', 'w', encoding='utf-8') as out:
        out.writelines(f"{token}\n" for token in sorted(misfits))
    with open('not_in_dictionary.txt', 'w', encoding='utf-8') as out:
        out.writelines(f"{token}\n" for token in sorted(not_in_dictionary))
    with open('word_frequency.json', 'w', encoding='utf-8') as out:
        json.dump(dict(word_counter), out, ensure_ascii=False, indent=2)

    return word_counter, misfits, not_in_dictionary
# Script entry point: process the default corpus and print a short summary.
if __name__ == "__main__":
    input_file = "el.txt.gz"
    dictionary_file = "el_GR.dic"
    # Keep the threshold in one place so the summary line below always
    # reports the value that was actually used (1 is process_file's default).
    min_occurrences = 1
    word_counter, misfits, not_in_dictionary = process_file(
        input_file, dictionary_file, min_occurrences=min_occurrences)
    print(f"Processed {len(word_counter)} unique words")
    print(f"Found {len(misfits)} words with fewer than {min_occurrences} occurrences")
    print(f"Found {len(not_in_dictionary)} words not in the dictionary")
# (End of gist. The GitHub page footer that followed is not part of the script.)