Last active
March 24, 2025 01:09
-
-
Save thewh1teagle/1bc1734c5edad1f99f34729d78d9e90b to your computer and use it in GitHub Desktop.
Hebrew encoder/decoder for a diacritics model
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Hebrew diacritics encoding and decoding | |
""" | |
import unicodedata | |
import re | |
# Collapse phonetically-equivalent diacritics onto a single representative
NIQQUD_DEDUPLICATE = {
    "\u05b1": "\u05b5",  # Hataf segol -> Tsere
    "\u05b6": "\u05b5",  # Segol -> Tsere
    "\u05b2": "\u05b7",  # Hataf patah -> Patah
    "\u05b3": "\u05b7",  # Hataf qamats -> Patah
    "\u05b8": "\u05b7",  # Qamats -> Patah
    "\u05c7": "\u05b9",  # Qamats qatan -> Holam
}
# Niqqud character -> small integer class id (0 means "no niqqud")
NIQQUD_MAP = {
    None: 0,
    "\u05b0": 1,  # Shva
    "\u05b4": 2,  # Hirik
    "\u05b5": 3,  # Tsere
    "\u05b7": 4,  # Patah
    "\u05bb": 5,  # Qubuts
    "\u05b9": 6,  # Holam
    "\u05ba": 7,  # Holam haser for vav
}
# Unicode character names of niqqud (as returned by unicodedata.name)
NIQQUD_NAME_MAP = {
    "HEBREW POINT SHEVA": "\u05b0",
    "HEBREW POINT HIRIQ": "\u05b4",
    "HEBREW POINT TSERE": "\u05b5",
    "HEBREW POINT PATAH": "\u05b7",
    "HEBREW POINT QUBUTS": "\u05bb",
    "HEBREW POINT HOLAM": "\u05b9",
    "HEBREW POINT HOLAM HASER FOR VAV": "\u05ba",
}
# Tokenization
# Shin/Sin dot flags
SHIN_FLAG_OFF = 0
SHIN_FLAG_RIGHT = 1
SHIN_FLAG_LEFT = 2
# Dagesh flags
DAGESH_FLAG_OFF = 0
DAGESH_FLAG_ON = 1
# Special tokens
SPACE_TOKEN = -1
UNKNOWN_TOKEN = -2
PRESERVE_UKNOWN_TOKEN = -3  # Non-Hebrew but valid characters (name keeps original spelling for compatibility)
# Hebrew letters and diacritics IDs
HEBREW_LETTERS = "אבגדהוזחטיכלמנסעפצקרשתךםןףץ"
LETTER_TO_ID = {char: idx for idx, char in enumerate(HEBREW_LETTERS)}
ID_TO_LETTER = {v: k for k, v in LETTER_TO_ID.items()}
ID_TO_NIQQUD = {v: k for k, v in NIQQUD_MAP.items()}
# BUG FIX: was `{k: v for v, k in NIQQUD_MAP.items()}`, which (because items()
# yields (key, value)) actually built the id -> niqqud mapping despite the
# name. NIQQUD_TO_ID must mirror NIQQUD_MAP (niqqud -> id).
NIQQUD_TO_ID = dict(NIQQUD_MAP)
def remove_niqqud(text):
    """
    Returns *text* with every Hebrew diacritic (U+05B0..U+05C7) stripped out.
    """
    return "".join(ch for ch in text if not ("\u05b0" <= ch <= "\u05c7"))
def sort_dagesh(text):
    """
    Drops a dagesh (U+05BC) that immediately follows any letter other than
    bet/kaf/final-kaf/pe/final-pe/vav, where the mark cannot occur.
    """
    letter_then_dagesh = re.compile(r"([^בכךפףו])(\u05bc)")
    return letter_then_dagesh.sub(lambda m: m.group(1), text)
def normalize(text):
    """
    Normalizes diacritized Hebrew text into the canonical form the encoder
    expects: NFD-decomposed, with impossible dagesh marks removed and
    phonetically-equivalent niqqud collapsed to a single representative.
    """
    decomposed = unicodedata.normalize("NFD", text)
    decomposed = remove_unnecessary_dagesh(decomposed)
    decomposed = sort_dagesh(decomposed)
    # Collapse phonetic duplicates (e.g. segol -> tsere) one character at a time
    deduped = "".join(NIQQUD_DEDUPLICATE.get(ch, ch) for ch in decomposed)
    # Re-normalize so diacritics stay decomposed for downstream parsing
    return unicodedata.normalize("NFD", deduped)
def remove_unnecessary_dagesh(text):
    """
    Strips the dagesh mark (U+05BC) from letters that cannot carry one.
    Operates on NFD-decomposed text so base letters and marks are separate.
    """
    decomposed = unicodedata.normalize("NFD", text)

    def strip_dagesh(match):
        letter = match.group(0)[0]
        marks = match.group(0)[1:]
        if can_dagesh(letter):
            return letter + marks
        return letter + marks.replace("\u05bc", "")

    letter_with_marks = re.compile(r"[א-ת][\u05b0-\u05c7]+")
    return unicodedata.normalize("NFD", letter_with_marks.sub(strip_dagesh, decomposed))
def can_dagesh(letter):
    """True if *letter* is one of the letters allowed to carry a dagesh here."""
    dagesh_letters = "בכפךף"
    return letter in dagesh_letters
def can_shin_sin(letter):
    """True if *letter* is shin, the only letter that takes a shin/sin dot."""
    shin = "ש"
    return letter in shin
def can_vav_holam_haser(letter):
    """True if *letter* is vav, the only letter that takes holam haser for vav."""
    vav = "ו"
    return letter in vav
def encode_hebrew_char(char, niqqud=None, dagesh_flag=DAGESH_FLAG_OFF, shin_dot_flag=SHIN_FLAG_OFF):
    """
    Encodes a single Hebrew character with diacritics into a numeric feature vector.

    Returns [char_id, niqqud_id, dagesh_flag, shin_flag]; char_id is -1 for
    letters not found in HEBREW_LETTERS.
    """
    char_id = LETTER_TO_ID.get(char, -1)  # Handle unknowns
    niqqud_id = NIQQUD_MAP.get(niqqud, 0)
    # BUG FIX: encode_sentence passes None flags for absent marks; coerce them
    # to the numeric OFF values so the vector is always purely numeric.
    dagesh_flag = int(dagesh_flag or 0) if can_dagesh(char) else DAGESH_FLAG_OFF
    shin_dot_flag = int(shin_dot_flag) if shin_dot_flag else SHIN_FLAG_OFF
    return [char_id, niqqud_id, dagesh_flag, shin_dot_flag]
def encode_sentence(text):
    """
    Encodes a fully diacritized Hebrew sentence into feature vectors.
    Non-Hebrew or irrelevant characters (punctuation, digits, symbols) are replaced with UNKNOWN_TOKEN.
    Matrix look like this:
    [
        [char_id, niqqud_id, dagesh_flag, shin_flag],
        [char_id, niqqud_id, dagesh_flag, shin_flag],
        [PRESERVE_UKNOWN_TOKEN, ord(char), 0, 0],
        [-1, 0, 0, 0], # Space
        [-2, 0, 0, 0], # Unknown
        ...
    ]
    """
    # Canonicalize first so every letter is followed by decomposed marks
    text = normalize(text)
    encoded = []
    i = 0
    while i < len(text):
        char = text[i]
        # Handle space
        if char == " ":
            encoded.append([SPACE_TOKEN, 0, 0, 0])  # SPACE_TOKEN
            i += 1
            continue
        # Skip punctuation, digits, Latin, etc.
        # NOTE(review): 'Lo' matches any "other letter" script, not only
        # Hebrew; a non-Hebrew 'Lo' letter falls through to the Hebrew path
        # and gets char_id -1 from encode_hebrew_char, colliding with
        # SPACE_TOKEN on decode — confirm inputs are Hebrew/Latin only.
        if unicodedata.category(char) != 'Lo':  # Not a Hebrew letter
            encoded.append([PRESERVE_UKNOWN_TOKEN, ord(char), 0, 0])  # Save original char's Unicode
            i += 1
            continue
        # A base letter: consume the run of combining marks that follows it
        base = char
        niqqud = None
        dagesh = False
        shin_dot = None
        i += 1
        while i < len(text) and unicodedata.category(text[i]) in ('Mn', 'Sk'):
            mark = text[i]
            # Empty-string fallback keeps unnamed code points from raising
            name = unicodedata.name(mark, "")
            if "DAGESH" in name:
                dagesh = True
            niqqud_candidate = NIQQUD_NAME_MAP.get(name, None)
            if niqqud_candidate:
                niqqud = niqqud_candidate
            elif "POINT SIN" in name:
                shin_dot = SHIN_FLAG_LEFT
            elif "POINT SHIN" in name:
                shin_dot = SHIN_FLAG_RIGHT
            elif "POINT HOLAM HASER FOR VAV" in name:
                # Unreachable in practice: this name is already present in
                # NIQQUD_NAME_MAP, so the first branch handles it.
                niqqud = "\u05ba"
            i += 1
        encoded.append(encode_hebrew_char(base, niqqud, dagesh, shin_dot))
    return encoded
def decode_sentence(encoded):
    """
    Decodes feature vectors produced by encode_sentence back into a string.

    Each row is [char_id, niqqud_id, dagesh_flag, shin_flag]; special char_id
    values are the module-level token constants. Unknown ids decode to "❓".
    """
    result = []
    for char_id, niqqud_id, dagesh_flag, shin_flag in encoded:
        # Consistency fix: use the named token constants instead of the raw
        # magic numbers -1/-2/-3 duplicated from the module top.
        if char_id == SPACE_TOKEN:
            result.append(" ")
            continue
        if char_id == UNKNOWN_TOKEN:
            result.append("❓")
            continue
        if char_id == PRESERVE_UKNOWN_TOKEN:
            # For preserved characters the niqqud slot holds the code point
            result.append(chr(niqqud_id))
            continue
        base_char = ID_TO_LETTER.get(char_id, "❓")
        marks = []
        if dagesh_flag:
            marks.append("\u05bc")  # dagesh goes right after the letter
        niqqud = ID_TO_NIQQUD.get(niqqud_id, None)
        if niqqud:
            marks.append(niqqud)
        if can_shin_sin(base_char) and shin_flag:
            marks.append("\u05c1" if shin_flag == SHIN_FLAG_RIGHT else "\u05c2")
        result.append(base_char + ''.join(marks))
    return ''.join(result)
# Round-trip sanity check: each sample must survive encode -> decode intact.
SAMPLE_SENTENCES = [
    "בְּרֵאשִׁית בָּרָא אֱלֹהִים אֵת הַשָּׁמַיִים וְאֵת הָאָרֶץ",
    # with english
    "בְּרֵאשִׁית בָּרָא אֱלֹהִים אֵת הַשָּׁמַיִים וְאֵת הָאָרֶץ Genesis 1:1",
    # with numbers
    "בְּרֵאשִׁית בָּרָא אֱלֹהִים אֵת הַשָּׁמַיִים וְאֵת הָאָרֶץ 1:1",
    # with punctuation
    "בְּרֵאשִׁית, בָּרָא אֱלֹהִים אֵת הַשָּׁמַיִים וְאֵת הָאָרֶץ",
    # with symbols
    "בְּרֵאשִׁית, בָּרָא אֱלֹהִים אֵת הַשָּמַיִים וְאֵת הָאָרֶץ 🌍",
    # with mixed languages
    "בְּרֵאשִׁית, בָּרָא אֱלֹהִים אֵת הַשָּמַיִים וְאֵת הָאָרֶץ 🌍 Genesis 1:1",
    # with unknown characters
    "בְּרֵאשִׁית, בָּרָא אֱלֹהִים אֵת הַשָּמַיִים וְאֵת הָאָרֶץ 🌍 Genesis 1:1",
    # Sofiyot
    "מְהַמֵּם כֻּלָּם",
]
for sample in SAMPLE_SENTENCES:
    # Compare in normalized form, since encode_sentence normalizes internally
    sample = normalize(sample)
    roundtripped = decode_sentence(encode_sentence(sample))
    assert sample == roundtripped, f"Decoding failed: {sample} != {roundtripped}"
    print(f'Original: "{sample}"')
    print(f'Decoded: "{roundtripped}"')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment