Skip to content

Instantly share code, notes, and snippets.

@ssskip
Created November 26, 2024 03:44
Show Gist options
  • Save ssskip/06c5ea1f4017d540c000303d07b15a2b to your computer and use it in GitHub Desktop.
Save ssskip/06c5ea1f4017d540c000303d07b15a2b to your computer and use it in GitHub Desktop.
Normalize text by replacing similar characters (visually similar look-alikes) with their standard form.
import re
# Dictionary mapping each standard letter to a regex character class listing
# the look-alike characters (ASCII substitutes, accented letters, and other
# Unicode variants) that should be folded to it.
#
# The patterns are raw strings, so the escapes are interpreted by ``re``, not
# by the Python parser. ``re`` reads exactly four hex digits after ``\u``;
# code points above U+FFFF (the Mathematical Alphanumeric Symbols block) must
# therefore be written as ``\UXXXXXXXX``. Writing them as five-digit ``\u``
# escapes would silently add the trailing hex digit (e.g. "2", "6", "7") to
# the class as a literal character.
#
# Insertion order matters: the patterns are applied one after another, so a
# character listed in several classes is claimed by the first class that
# contains it, and a later class may rewrite an earlier class's output
# (for example 'l' is listed under 'i', and 'i' is listed under 'l').
# NOTE(review): some entries overlap between classes (e.g. 'n' inside the 'm'
# class, \u1D48 in both 'c' and 'd'); left as-is, confirm whether intended.
CHAR_MAPPINGS = {
    # Map variations of 'a' to 'a'
    'a': r"[a4@àáâäãåαаǎăąāạⱥₐᵃᴀꙺꬱ\u0251\U0001D41A\U0001D44E\U0001D482\U0001D4B6\U0001D4EA\U0001D51E\U0001D552\U0001D586\U0001D5BA\U0001D5EE\U0001D622\U0001D656\U0001D68A]",
    # Map variations of 'b' to 'b'
    'b': r"[b6ƀɓḃḅḇƃȸ\u1D41\u1D45\u1D79\u1D7D\U0001D41B\U0001D44F\U0001D483\U0001D4B7\U0001D4EB\U0001D51F\U0001D553\U0001D587\U0001D5BB]",
    # Map variations of 'c' to 'c'
    'c': r"[c\(\)çćĉċč\u1D04\u1D9C\u1D40\u1D74\u1D48\u1D4C]",
    # Map variations of 'd' to 'd'
    'd': r"[dðďḋḍḏḑḓɗ\u1D48\u1D5A\u1D6D\u1D6E\u1D6F\u1D70]",
    # Map variations of 'e' to 'e'
    # (script small e lives at U+212F; U+1D4BA is unassigned)
    'e': r"[e3€èéêëēĕęėẹẻẽəɛ\u1D49\u1D92\u1D07\u1D41\U0001D452\U0001D486\u212F\U0001D4EE\U0001D522\U0001D556\U0001D58A\U0001D5BE]",
    # Map variations of 'f' to 'f'
    'f': r"[fƒḟ\u1D46\u1D5B\u1D6A\u1D6B\u1D6C\u1D6D]",
    # Map variations of 'g' to 'g'
    'g': r"[g9qöġģǵğǧĝᵍ\u1D33\u1D4D\u1D83\u1D9E]",
    # Map variations of 'h' to 'h'
    'h': r"[hħḣḥḧḩḫẖɦ\u02B0\u1D34\u1D5D\u1D6E\u1D6F\u1D70]",
    # Map variations of 'i' to 'i'
    'i': r"[i1l|!îíìɩɨıĩïỉįīɪ\u0456\u03B9\u1D62\u1D7B\u1D96\u1D4E\u1DA4\U0001D5C2\U0001D62A\U0001D692]",
    # Map variations of 'j' to 'j'
    'j': r"[jĵǰĵȷϳ\u1D0A\u1D36\u1D6A\u1D6B\u1D6C\u1D6D]",
    # Map variations of 'k' to 'k'
    'k': r"[kƙķʞκḱḳḵǩқҝҟ\u1D4F\U0001D424\U0001D458\U0001D48C\U0001D4C0\U0001D4F4\U0001D528\U0001D55C\U0001D590\U0001D5C4\U0001D5F8\U0001D62C\U0001D660]",
    # Map variations of 'm' to 'm'
    'm': r"[mnmṁṃṁḿṁミ\u1D0D\u1D36\u1D5E\u1D7F\u1D6F]",
    # Map variations of 'l' to 'l'
    'l': r"[l1i|ĺļľłɭḷḹḻḽƚⱡ\u02E1\u1D26\U0001D5C5\U0001D5F9\U0001D62D\U0001D661\U0001D695\u2113\u2110\u2111\u2160\u2170]",
    # Map variations of 'n' to 'n'
    'n': r"[nñńņňṅṇṉṋ\u1D0E\u1D5F\u1D70]",
    # Map variations of 'o' to 'o'
    'o': r"[o0qöòóôöõøǒőŏơọỏốồổỗớờởỡợɵ\u1D52\u1D11\u1D45\u1D4F\U0001D52C\U0001D560\U0001D594\U0001D5C8]",
    # Map variations of 'p' to 'p'
    'p': r"[pρṕṗƥ\u1D18\u1D5C\u1D6D\u1D6E\u1D6F\u1D70\u1D71]",
    # Map variations of 'q' to 'q'
    'q': r"[qg9öգզզ\u1D2A\u1D60\u1D61\u1D62]",
    # Map variations of 'r' to 'r'
    'r': r"[rŕŗřṙṛṝṟ\u1D1D\u1D5A\u1D6A\u1D6B\u1D6C\u1D6D\u1D6E]",
    # Map variations of 's' to 's'
    's': r"[s5$śŝşšṡṣ\u1D19\u1D5B\u1D6E\u1D6F\u1D70]",
    # Map variations of 't' to 't'
    't': r"[t7+ţťŧƫṭṯṱẗⲧ†\u22A3\u1D57\U0001D42D\U0001D461\U0001D495\U0001D4C9\U0001D4FD\U0001D531\U0001D565\U0001D599\U0001D5CD\U0001D601\U0001D635\U0001D669]",
    # Map variations of 'u' to 'u'
    'u': r"[uûùúüũūŭůűųưụủứừửữự\u1D1C\u1D1F\u1D4F\u1D5C\u1D6E\u1D6F\u1D70]",
    # Map variations of 'v' to 'v'
    'v': r"[vṽṿ\u1D20\u1D5D\u1D6E\u1D6F\u1D70]",
    # Map variations of 'w' to 'w'
    'w': r"[wẁẃẅẇẉŵ\u02B7\u1D21\u1D5E\u1D6F]",
    # Map variations of 'x' to 'x'
    'x': r"[x×ẋẍ\u1D22\u1D5F\u1D70]",
    # Map variations of 'y' to 'y'
    'y': r"[yýÿŷỳỵỷỹ\u1D23\u1D60\u1D6E\u1D6F\u1D70]",
    # Map variations of 'z' to 'z'
    'z': r"[zźżž\u1D22\u1D5B\u1D6A\u1D6B\u1D6C\u1D6D]",
}
# Compile all patterns once at import time; keys keep CHAR_MAPPINGS' order.
PATTERNS = {char: re.compile(pattern, re.UNICODE) for char, pattern in CHAR_MAPPINGS.items()}
def normalize_text(text):
    """
    Fold look-alike characters in ``text`` to their standard letters.

    The input is lowercased, every character for which
    ``str.isprintable()`` is false is dropped, and then each compiled
    pattern in ``PATTERNS`` is applied in dictionary order, replacing its
    matches with the pattern's key.

    Args:
        text (str): Input text to normalize

    Returns:
        str: Normalized text
    """
    # Non-printable characters are those in the Unicode "Other" or
    # "Separator" categories, except the ASCII space (0x20). Dropping them
    # removes zero-width characters, and also tabs and newlines.
    # https://docs.python.org/3/library/stdtypes.html#str.isprintable
    result = "".join(filter(str.isprintable, text.lower()))
    # Substitutions run one pattern after another, so each pattern sees the
    # output of the ones before it.
    for canonical, regex in PATTERNS.items():
        result = regex.sub(canonical, result)
    return result
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment