Created
November 26, 2024 03:44
-
-
Save ssskip/06c5ea1f4017d540c000303d07b15a2b to your computer and use it in GitHub Desktop.
Normalize text by replacing visually similar characters (look-alike glyphs) with their standard form.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

# Maps each canonical ASCII letter to a regex character class listing the
# characters (digits, symbols, accented and other Unicode letters) that are
# folded into that letter.
#
# Code points above U+FFFF are written as \UXXXXXXXX (eight hex digits): the
# `re` module reads \u as exactly four hex digits, so a five-digit form such
# as \u1D41A means U+1D41 followed by a literal "A" and never matches the
# intended Mathematical Alphanumeric letter.
#
# Key order matters to any caller that applies the patterns one after another
# (note that 'm' precedes 'l'), so it is kept as is.
#
# NOTE(review): the four-digit \uXXXX entries are carried over unchanged.
# Several of them do not look like variants of their key (e.g. U+1D48
# MODIFIER LETTER SMALL D is listed under 'c'), and many classes overlap
# ('i' and 'l' contain each other, 'g' and 'q' contain each other, 'm'
# contains 'n') -- confirm these are intended.
CHAR_MAPPINGS = {
    # Map variations of 'a' to 'a'
    'a': r"[a4@àáâäãåαаǎăąāạⱥₐᵃᴀꙺꬱ\u0251\U0001D41A\U0001D44E\U0001D482\U0001D4B6\U0001D4EA\U0001D51E\U0001D552\U0001D586\U0001D5BA\U0001D5EE\U0001D622\U0001D656\U0001D68A]",
    # Map variations of 'b' to 'b'
    'b': r"[b6ƀɓḃḅḇƃȸ\u1D41\u1D45\u1D79\u1D7D\U0001D41B\U0001D44F\U0001D483\U0001D4B7\U0001D4EB\U0001D51F\U0001D553\U0001D587\U0001D5BB]",
    # Map variations of 'c' to 'c'
    'c': r"[c\(\)çćĉċč\u1D04\u1D9C\u1D40\u1D74\u1D48\u1D4C]",
    # Map variations of 'd' to 'd'
    'd': r"[dðďḋḍḏḑḓɗ\u1D48\u1D5A\u1D6D\u1D6E\u1D6F\u1D70]",
    # Map variations of 'e' to 'e'
    'e': r"[e3€èéêëēĕęėẹẻẽəɛ\u1D49\u1D92\u1D07\u1D41\U0001D452\U0001D486\U0001D4BA\U0001D4EE\U0001D522\U0001D556\U0001D58A\U0001D5BE]",
    # Map variations of 'f' to 'f'
    'f': r"[fƒḟ\u1D46\u1D5B\u1D6A\u1D6B\u1D6C\u1D6D]",
    # Map variations of 'g' to 'g'
    'g': r"[g9qöġģǵğǧĝᵍ\u1D33\u1D4D\u1D83\u1D9E]",
    # Map variations of 'h' to 'h'
    'h': r"[hħḣḥḧḩḫẖɦ\u02B0\u1D34\u1D5D\u1D6E\u1D6F\u1D70]",
    # Map variations of 'i' to 'i'
    'i': r"[i1l|!îíìɩɨıĩïỉįīɪ\u0456\u03B9\u1D62\u1D7B\u1D96\u1D4E\u1DA4\U0001D5C2\U0001D62A\U0001D692]",
    # Map variations of 'j' to 'j'
    'j': r"[jĵǰĵȷϳ\u1D0A\u1D36\u1D6A\u1D6B\u1D6C\u1D6D]",
    # Map variations of 'k' to 'k'
    'k': r"[kƙķʞκḱḳḵǩқҝҟ\u1D4F\U0001D424\U0001D458\U0001D48C\U0001D4C0\U0001D4F4\U0001D528\U0001D55C\U0001D590\U0001D5C4\U0001D5F8\U0001D62C\U0001D660]",
    # Map variations of 'm' to 'm'
    'm': r"[mnmṁṃṁḿṁミ\u1D0D\u1D36\u1D5E\u1D7F\u1D6F]",
    # Map variations of 'l' to 'l'
    'l': r"[l1i|ĺļľłɭḷḹḻḽƚⱡ\u02E1\u1D26\U0001D5C5\U0001D5F9\U0001D62D\U0001D661\U0001D695\u2113\u2110\u2111\u2160\u2170]",
    # Map variations of 'n' to 'n'
    'n': r"[nñńņňṅṇṉṋ\u1D0E\u1D5F\u1D70]",
    # Map variations of 'o' to 'o'
    'o': r"[o0qöòóôöõøǒőŏơọỏốồổỗớờởỡợɵ\u1D52\u1D11\u1D45\u1D4F\U0001D52C\U0001D560\U0001D594\U0001D5C8]",
    # Map variations of 'p' to 'p'
    'p': r"[pρṕṗƥ\u1D18\u1D5C\u1D6D\u1D6E\u1D6F\u1D70\u1D71]",
    # Map variations of 'q' to 'q'
    'q': r"[qg9öգզզ\u1D2A\u1D60\u1D61\u1D62]",
    # Map variations of 'r' to 'r'
    'r': r"[rŕŗřṙṛṝṟ\u1D1D\u1D5A\u1D6A\u1D6B\u1D6C\u1D6D\u1D6E]",
    # Map variations of 's' to 's'
    's': r"[s5$śŝşšṡṣ\u1D19\u1D5B\u1D6E\u1D6F\u1D70]",
    # Map variations of 't' to 't'
    't': r"[t7+ţťŧƫṭṯṱẗⲧ†\u22A3\u1D57\U0001D42D\U0001D461\U0001D495\U0001D4C9\U0001D4FD\U0001D531\U0001D565\U0001D599\U0001D5CD\U0001D601\U0001D635\U0001D669]",
    # Map variations of 'u' to 'u'
    'u': r"[uûùúüũūŭůűųưụủứừửữự\u1D1C\u1D1F\u1D4F\u1D5C\u1D6E\u1D6F\u1D70]",
    # Map variations of 'v' to 'v'
    'v': r"[vṽṿ\u1D20\u1D5D\u1D6E\u1D6F\u1D70]",
    # Map variations of 'w' to 'w'
    'w': r"[wẁẃẅẇẉŵ\u02B7\u1D21\u1D5E\u1D6F]",
    # Map variations of 'x' to 'x'
    'x': r"[x×ẋẍ\u1D22\u1D5F\u1D70]",
    # Map variations of 'y' to 'y'
    'y': r"[yýÿŷỳỵỷỹ\u1D23\u1D60\u1D6E\u1D6F\u1D70]",
    # Map variations of 'z' to 'z'
    'z': r"[zźżž\u1D22\u1D5B\u1D6A\u1D6B\u1D6C\u1D6D]",
}

# Compile every character class once at import time; keys and order mirror
# CHAR_MAPPINGS.
PATTERNS = {char: re.compile(pattern, re.UNICODE) for char, pattern in CHAR_MAPPINGS.items()}
def normalize_text(text: str) -> str:
    """Lowercase *text*, drop non-printable characters, and fold look-alikes.

    The steps run in this order:

    1. ``str.lower()`` is applied to the whole input.
    2. Every character for which ``str.isprintable()`` is false is removed.
       That covers zero-width and other format characters, but also control
       characters such as newline and tab and every separator other than the
       ASCII space, so words on adjacent lines end up joined together.
    3. Each compiled pattern in ``PATTERNS`` is replaced by its key, in the
       mapping's iteration order. The passes are sequential, so a character
       rewritten by one pass can be rewritten again by a later pass whose
       class also contains it.

    Args:
        text: Input text to normalize.

    Returns:
        The normalized text.
    """
    # https://docs.python.org/3/library/stdtypes.html#str.isprintable
    # Non-printable characters are those in the Unicode "Other" or "Separator"
    # categories, except the ASCII space (0x20).
    normalized = "".join(char for char in text.lower() if char.isprintable())
    for standard_char, pattern in PATTERNS.items():
        normalized = pattern.sub(standard_char, normalized)
    return normalized
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment