Created
March 2, 2022 14:12
-
-
Save mbollmann/0de1af38fce9f62c514cfe62c3f4cbdd to your computer and use it in GitHub Desktop.
Access Unicode Script property in Python & find out which script(s) a string contains
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Unicode characters are neatly categorized into different "scripts", as seen on | |
# the character code chart <http://www.unicode.org/charts/#scripts> and defined | |
# in Annex #24 <https://www.unicode.org/reports/tr24/>. | |
# | |
# Unfortunately, Python's unicodedata module doesn't provide access to this | |
# information. However, the fontTools library does include this. | |
# <https://github.com/fonttools/fonttools> | |
# | |
# Here's an example of how to use it.
# | |
# Requires: | |
# - fonttools (tested with version 4.29.1) | |
# - rich (optional; just comment out the import) | |
from collections import Counter | |
from fontTools.unicodedata import script, script_name | |
from rich import print | |
def detect_scripts(text):
    """Count how many characters of `text` belong to each Unicode script.

    Every distinct character is classified with fontTools' ``script()`` and
    its number of occurrences is added to the tally for the returned script
    code.

    Returns:
        A ``collections.Counter`` keyed by script code (for example
        ``'Cyrl'``, ``'Latn'`` or ``'Zyyy'``) whose values are character
        counts.
    """
    per_script = Counter()
    # Tally distinct characters first so script() is called once per distinct
    # character instead of once per occurrence.
    for character, occurrences in Counter(text).items():
        code = script(character)
        per_script[code] = per_script[code] + occurrences
    return per_script
def detect_script_ratios(text):
    """Return the fraction of `text` that each script accounts for.

    Script codes from ``detect_scripts()`` are converted to names with
    ``script_name(..., default="Unknown")``.

    Returns:
        A dict mapping script name to ``count / total``, where ``total`` is
        the sum of all per-script counts. An empty `text` gives an empty
        dict, because there is nothing to iterate over.
    """
    counts = detect_scripts(text)
    n_chars = sum(counts.values())
    ratios = {}
    for code, count in counts.items():
        ratios[script_name(code, default="Unknown")] = count / n_chars
    return ratios
def get_majority_script(text):
    """Return the name of the most frequent script in `text`, ignoring "Common".

    Only the three most frequent script codes are considered. Codes for
    which ``script_name(..., default=None)`` yields ``None`` are skipped, as
    is the "Common" script.

    Returns:
        The name of the first qualifying script in descending order of
        frequency, or ``None`` if none of the top three qualifies.
    """
    top_three = detect_scripts(text).most_common(3)
    # Generator, so script_name() is only called until a match is found.
    names = (script_name(code, default=None) for code, _ in top_three)
    return next(
        (name for name in names if name is not None and name != "Common"),
        None,
    )
if __name__ == "__main__":
    # Demo: run the three helpers on a mixed Cyrillic/Latin sample.
    # From uk.wikipedia.org
    #
    # NOTE: the leading spaces on the continuation lines are part of the
    # string and are counted as "Common" characters in the totals below.
    sample = """21 квітня 2021 р. Pfizer заявляє, що їй відомо про підроблені версіях своєї
                вакцини COVID-19, які виробник ліків розробив за допомогою BioNTech, оскільки
                злочинці прагнуть нажитися на світовому попиті на вакцини, який продовжує
                випереджати пропозицію."""

    script_counts = detect_scripts(sample)
    print(script_counts)
    # Counter({'Zyyy': 97, 'Cyrl': 183, 'Latn': 19})

    script_ratios = detect_script_ratios(sample)
    print(script_ratios)
    # {'Common': 0.32441471571906355, 'Cyrillic': 0.6120401337792643, 'Latin': 0.06354515050167224}

    majority = get_majority_script(sample)
    print(majority)
    # Cyrillic
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment