import gzip
import json
import unicodedata
from collections import Counter
def validate_diaeresis_mark(word):
    diaeresis_chars = 'ϊϋΐΰ'
    vowels_with_accent = 'άέήίόύώ'
    diphthongs = {'αι', 'ει', 'οι', 'υι', 'αυ', 'ευ', 'ου'}
    diaeresis_found = False
    for i, char in enumerate(word):
        if char in diaeresis_chars:
            if diaeresis_found:
                return False
            else:
                diaeresis_found = True
            prev_char = word[i-1]
            # Rule 1: Check if the preceding vowel is stressed.
            if i >= 1 and prev_char in vowels_with_accent:
                return False
            # Rule 2: Check if there's a diphthong before the diaeresis.
            # (Accented diphthongs are caught by the previous rule.)
            if i >= 2 and word[i-2:i] in diphthongs:
                return False
            # Rule 3: Check if there's a non-diphthong in the word.
            # Only consider vowel combinations ending in ι or υ (e.g., ηυ, ιυ, ωυ, ηι, ωι).
            if i >= 1 and ((char in 'ϊΐ' and prev_char in 'ηω') or
                           (char in 'ϋΰ' and prev_char in 'ηιω')):
                return False
    # If none of the rules apply, the diaeresis might be necessary.
    return True
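
# Illustrative sanity checks (hypothetical inputs, not taken from the corpus):
# validate_diaeresis_mark('προϋπόθεση')  -> True   (diaeresis needed to break the 'ου' digraph)
# validate_diaeresis_mark('γάϊδαρος')    -> False  (rule 1: the stressed 'ά' already breaks the pair)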
def lower_first_if_title(word):
    if word.isupper():
        return word
    elif word.istitle():
        return word[0].lower() + word[1:]
    else:
        return word
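
# e.g. lower_first_if_title('Καλημέρα') -> 'καλημέρα', while all-caps words
# such as 'ΕΛΛΑΔΑ' are returned unchanged (illustrative examples).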
def remove_greek_accents_from_upper(text):
    # Normalize to decomposed form (NFD)
    nfd_form = unicodedata.normalize('NFD', text)
    # Remove combining diacritical marks from capital letters,
    # but keep diaeresis marks (U+0308)
    result = []
    for i, c in enumerate(nfd_form):
        if unicodedata.combining(c) and c != '\u0308':
            # Check if the base character is a capital letter.
            # (enumerate gives the current position; str.index would find the
            # first occurrence of the mark in the string, not this one)
            if i > 0 and 'Α' <= nfd_form[i-1] <= 'Ω':
                continue  # Skip combining marks on capital letters
        result.append(c)
    # Reconstruct the string
    return unicodedata.normalize('NFC', ''.join(result))
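
# Example (illustrative): remove_greek_accents_from_upper('Έλα στην Αθήνα')
# -> 'Ελα στην Αθήνα' — the accent is stripped from the capital 'Έ' while
# lowercase accents ('ή') are kept.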
def greek_vowels_iterator(word):
    vowels = 'αάεέηήιίοόυύωώϊϋΐΰ'
    diphthongs = {
        'αι', 'ει', 'οι', 'υι', 'αυ', 'ευ', 'ου',
        'αί', 'εί', 'οί', 'υί', 'αύ', 'εύ', 'ού',
        'αη', 'αϊ', 'οη', 'όη', 'οϊ', 'άι', 'όι', 'εϊ'
    }
    spurious_diphthongs = 'ιυ'
    spurious_diphthongs_long = {'οι', 'ει'}
    vowel_digraphs = {
        'αι', 'ει', 'οι', 'ου', 'υι',
        'αί', 'εί', 'οί', 'ού', 'υί'
    }
    qualifying_vowels = 'αοεάόέ'
    word_len = len(word)
    i = 0
    while i < word_len:
        start = i
        # Handle short spurious diphthongs ('ι', 'υ')
        if i < word_len - 1 and word[i] in spurious_diphthongs:
            # Check if the next two characters form a vowel digraph
            if i < word_len - 2 and word[i+1:i+3] in vowel_digraphs:
                i += 3
            # Check if the next character is a qualifying vowel
            elif word[i+1] in qualifying_vowels:
                i += 2
            else:
                i += 1
            yield word[start:i]
        # Handle long spurious diphthongs ('οι', 'ει')
        elif i < word_len - 2 and word[i:i+2] in spurious_diphthongs_long:
            if i < word_len - 3 and word[i+2:i+4] in vowel_digraphs:
                i += 4
            elif word[i+2] in qualifying_vowels:
                i += 3
            else:
                i += 2
            yield word[start:i]
        else:
            # Handle standard diphthongs
            if i < word_len - 1 and word[i:i+2] in diphthongs:
                i += 2
                yield word[start:i]
            # Handle single vowels
            elif word[i] in vowels:
                i += 1
                yield word[start:i]
            else:
                i += 1
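
# Illustrative splits into vowel nuclei (one item per syllable):
# list(greek_vowels_iterator('αύριο'))    -> ['αύ', 'ιο']
# list(greek_vowels_iterator('άνθρωπος')) -> ['ά', 'ω', 'ο']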
def validate_accented_word(word):
    """Validate if a Greek word is correctly accented"""
    accented_vowels = 'άέήίόύώΐΰ'
    def is_accented(vowel):
        for ch in vowel:
            if ch in accented_vowels:
                return True
        return False
    # Check if the word is a contraction by checking its beginning/ending
    is_contraction_end = word[-1] in "'’"
    is_contraction = is_contraction_end or word[0] in "'’"
    # Handle exceptions for single-syllable words
    exceptions = {'ή', 'πού', 'πώς', 'μού', 'σού', 'τού', 'τήν',
                  'τής', 'τόν', 'τό', 'μάς', 'σάς', 'τούς', 'τά'}
    vowel_components = list(greek_vowels_iterator(word))
    num_syllables = len(vowel_components)
    # If there's only one vowel, handle exceptions for single-syllable words
    if num_syllables == 1 and not is_contraction:
        return is_accented(vowel_components[0]) == (word in exceptions)  # XNOR
    # An incorrect use of the compound vowels 'αυ', 'ευ'
    if word.find('άυ') >= 0 or word.find('έυ') >= 0:
        return False
    # Initialize variables to track accents
    accented_syllable_count = 0
    second_last_accent = False
    # Iterate over the vowels in the word
    for i, vowel in enumerate(vowel_components):
        # Check if the vowel is accented
        if is_accented(vowel):
            accented_syllable_count += 1
            # If an accent is found before the last three syllables, return False
            if i < num_syllables - 3 + int(is_contraction_end):
                return False
            if i == num_syllables - 2 + int(is_contraction_end):
                second_last_accent = True
    # Check the number of accented syllables in the last three syllables
    if accented_syllable_count == 3 - int(is_contraction):
        return False  # Three accented syllables found
    # Two accented syllables found, one on the second-to-last syllable
    elif accented_syllable_count == 2 and second_last_accent:
        return False
    # Check that an accent is found in the last three syllables
    elif not is_contraction and accented_syllable_count == 0:
        return False
    else:
        return True
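
# Illustrative checks (hypothetical inputs):
# validate_accented_word('καλημέρα') -> True   (accent on the second-to-last syllable)
# validate_accented_word('καλημερα') -> False  (polysyllabic word with no accent)
# validate_accented_word('καί')      -> False  (accented monosyllable not in the exceptions)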
def is_greek_letter(char):
    return char == 'Ά' or 'Έ' <= char <= 'ώ'

def is_greek_vowel(char):
    return char in 'αάεέηήιίοόυύωώϊϋΐΰς'  # final sigma 'ς' deliberately included

def is_punctuation(char):
    return char in '!(),.:;·–—…'

def is_quotation_mark(char):
    return char in '«»'

def is_apostrophe(char):
    return char in "'’"

def is_part_of_word(char, next_char, current_word):
    if is_greek_letter(char):
        return True
    if is_apostrophe(char):
        if is_greek_letter(next_char) and not (current_word or
                                               is_greek_vowel(next_char)):
            return True  # Start of a contracted word
        if current_word and is_greek_letter(current_word[-1]) and \
                not is_greek_vowel(current_word[-1]):
            return True
    return False
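
# Illustrative behaviour of the apostrophe handling (hypothetical tokens):
# is_part_of_word("'", 'κ', '')    -> True   (apostrophe opening a contraction, e.g. "'κει")
# is_part_of_word("'", ' ', 'μ')   -> True   (apostrophe closing a contraction, e.g. "μ'")
# is_part_of_word(',', ' ', 'ναι') -> False  (punctuation ends the word)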
class TrieNode:
    def __init__(self):
        self.children = {}
        self.is_end = False
        self.word = None

class Trie:
    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        node = self.root
        for char in word:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.is_end = True
        node.word = word

    def find_longest_prefix(self, string):
        node = self.root
        last_match = None
        for i, char in enumerate(string):
            if char not in node.children:
                break
            node = node.children[char]
            if node.is_end:
                last_match = (i, node.word)
        return last_match
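
# Example (illustrative): the returned tuple holds the index of the last
# matched character, not the match length:
# t = Trie(); t.insert('π.χ.')
# t.find_longest_prefix('π.χ. αύριο') -> (3, 'π.χ.')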
greek_abbreviations = [
    'Απρ.', 'Αύγ.', 'Δεκ.', 'Δευτ.', 'Δρ.', 'Ιαν.', 'Ιούλ.', 'Ιούν.', 'Κυρ.',
    'Μάρτ.', 'Ν.Σ.', 'Πέμ.', 'Παρ.', 'Σάβ.', 'ΣτΕ.', 'ΣτΜ.', 'Τετ.', 'Τρ.',
    'Φ.', 'ά.τ.', 'ά.τ.χ.', 'ά.χ.', 'άγ.', 'άρ.', 'άρθρ.', 'έ.α.', 'έκδ.',
    'ένθ. άν(ωτ.).', 'έτ.', 'α.α.', 'α/α', 'αι.', 'ανάτ.', 'ανών.', 'αρ.',
    'αρ.φ.', 'αριθ.', 'αρχ.', 'αυτ.', 'β/θήκη', 'βιβλ.', 'βιβλγρ.', 'βλ.',
    'βλ. αν.', 'βλ. κατ.', 'δακτ.', 'δηλ.', 'διατρ.', 'εδ.', 'ειδ.', 'εικ.',
    'εικονγρ.', 'εισ.', 'εισαγ.', 'εκ.', 'εκδ.', 'εκκλ.', 'εκκλησ.', 'ελλ.',
    'ελλην.', 'εν.', 'ενικ.', 'εξ.', 'επ.', 'επιμ.', 'επόμ.', 'εφ.', 'εφημ.',
    'θρησκ.', 'κ.ά.', 'κ.α.', 'κ.εξ.', 'κ.επ.', 'κ.λ.π.', 'κ.λπ.', 'κ.ο.κ.',
    'κ.τ.λ.', 'κ.τ.τ.', 'κ.τ.ό.', 'καν.', 'κατάλ.', 'κεφ.', 'κλ.', 'κλπ.', 'κτλ.',
    'λ.', 'λ.χ.', 'λέξ.', 'λατ.', 'λατιν.', 'μ.', 'μ.Χ.', 'μ.ά.', 'μετ.',
    'μετφρ.', 'μτγ.', 'μτγν.', 'μτφ.', 'μτφρ.', 'μτχ.', 'μυθ.', 'νεοελλ.',
    'νεολ.', 'νεολατ.', 'νεολατιν.', 'νεότ.', 'ον.', 'ονομ.', 'π.Χ.', 'π.μ.',
    'π.χ.', 'πίν.', 'παρ.', 'περ.', 'πληθ.', 'πολγρ.', 'πρβ.', 'πρβλ.', 'πργρ.',
    'προφ.', 'πρότ.', 'πτ.', 'ρ.', 'ρήμ.', 'σ.', 'σ.π.', 'σεβ.', 'σελ.', 'σημ.',
    'σπ.', 'σπάν.', 'σσ.', 'στ.', 'στίχ.', 'στατ.', 'στχ.', 'συμπλ.', 'σχ.',
    'τ.', 'τ.μ.', 'τεύχ.', 'τιμ. τόμ.', 'τμ.', 'τυπ.', 'τόμ.', 'υποσ.', 'υποσημ.',
    'υποφ.', 'φ.', 'φάκ.', 'φιλολ.', 'φιλοσ.', 'χ.κ.', 'χ.σ.', 'χ.τ.', 'χ.τ.χ.',
    'χ.χ.', 'χ.ό.', 'χγφ.', 'χργρ.', 'χφ.', 'χφο.', 'χφφ.', 'χχφ.', 'ό,τι',
    'ό.π.', 'όμ.', 'όπ.π.', 'όπ.παρ.'
]
def tokenize_greek_sentence(trie, sentence, match_abbrv=False):
    tokens = []
    current_token = ""
    i = 0
    while i < len(sentence):
        # Check for abbreviations
        matched_abbrv = trie.find_longest_prefix(sentence[i:]) if match_abbrv else None
        if matched_abbrv:
            # if current_token:
            #     tokens.append(current_token)
            #     current_token = ""
            tokens.append(matched_abbrv[1])
            i += matched_abbrv[0] + 1
        else:
            char = sentence[i]
            next_char = sentence[i + 1] if i + 1 < len(sentence) else ""
            if is_part_of_word(char, next_char, current_token):
                current_token += char
            else:
                if current_token:
                    tokens.append(current_token)
                    current_token = ""
                if is_quotation_mark(char) or is_punctuation(char):
                    tokens.append(char)
                # elif not char.isspace():
                #     tokens.append(char)
            i += 1
    if current_token:
        tokens.append(current_token)
    return tokens
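
# Illustrative runs (hypothetical sentences):
# trie = Trie()
# for abbrv in greek_abbreviations:
#     trie.insert(abbrv)
# tokenize_greek_sentence(trie, 'Καλημέρα, κόσμε!')
#   -> ['Καλημέρα', ',', 'κόσμε', '!']
# tokenize_greek_sentence(trie, "π.χ. μ' αρέσει", match_abbrv=True)
#   -> ['π.χ.', "μ'", 'αρέσει']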
def process_file(filename, dictionary_file, min_occurrences=1):
    word_counter = Counter()
    misfits = set()
    not_in_dictionary = set()
    # Build the trie of abbreviations
    trie = Trie()
    for word in greek_abbreviations:
        trie.insert(word)
    # Load the dictionary
    with open(dictionary_file, 'r', encoding='utf-8') as f:
        dictionary = set(f.read().splitlines())
    # Process the gzipped file
    with gzip.open(filename, 'rt', encoding='utf-8') as file:
        for line in file:
            # Apply processing functions
            line = remove_greek_accents_from_upper(line)
            words = tokenize_greek_sentence(trie, line)
            first_word = True
            for word in words:
                word = lower_first_if_title(word)
                # if (not first_word and word.istitle()) or \
                #         not (validate_diaeresis_mark(word.lower()) and \
                #              validate_accented_word(word.lower())):
                #     # print('Validation error:', word)
                #     continue
                # if (len(word) <= 5 or len(list(greek_vowels_iterator(word))) <= 2) and \
                if word not in dictionary:
                    not_in_dictionary.add(word)
                    continue
                word_counter[word] += 1
                if first_word:
                    first_word = False
    # Handle misfits (words below the occurrence threshold)
    for word, count in word_counter.copy().items():
        if count < min_occurrences:
            misfits.add(word)
            del word_counter[word]
    # Write results
    with open('misfits.txt', 'w', encoding='utf-8') as f:
        for word in sorted(misfits):
            f.write(f"{word}\n")
    with open('not_in_dictionary.txt', 'w', encoding='utf-8') as f:
        for word in sorted(not_in_dictionary):
            f.write(f"{word}\n")
    with open('word_frequency.json', 'w', encoding='utf-8') as f:
        json.dump(dict(word_counter), f, ensure_ascii=False, indent=2)
    return word_counter, misfits, not_in_dictionary
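
# Illustrative invocation (both input files must exist; the corpus name below
# is hypothetical):
# counts, misfits, unknown = process_file('corpus.txt.gz', 'el_GR.dic', min_occurrences=50)
# Side effects: writes misfits.txt, not_in_dictionary.txt and word_frequency.json
# into the current directory.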
if __name__ == "__main__":
    input_file = "el.txt.gz"
    dictionary_file = "el_GR.dic"
    min_occurrences = 1
    word_counter, misfits, not_in_dictionary = process_file(input_file, dictionary_file,
                                                            min_occurrences)
    print(f"Processed {len(word_counter)} unique words")
    print(f"Found {len(misfits)} words with fewer than {min_occurrences} occurrences")
    print(f"Found {len(not_in_dictionary)} words not in the dictionary")