Skip to content

Instantly share code, notes, and snippets.

@mbollmann
Created March 2, 2022 14:12
Show Gist options
  • Save mbollmann/0de1af38fce9f62c514cfe62c3f4cbdd to your computer and use it in GitHub Desktop.
Save mbollmann/0de1af38fce9f62c514cfe62c3f4cbdd to your computer and use it in GitHub Desktop.
Access Unicode Script property in Python & find out which script(s) a string contains
#!/usr/bin/env python3
# Unicode characters are neatly categorized into different "scripts", as seen on
# the character code chart <http://www.unicode.org/charts/#scripts> and defined
# in Annex #24 <https://www.unicode.org/reports/tr24/>.
#
# Unfortunately, Python's unicodedata module doesn't provide access to this
# information. However, the fontTools library does include this.
# <https://github.com/fonttools/fonttools>
#
# Here's an example of how to use it.
#
# Requires:
# - fonttools (tested with version 4.29.1)
# - rich (optional; just comment out the import)
from collections import Counter
from fontTools.unicodedata import script, script_name
from rich import print
def detect_scripts(text):
    """Count how many characters of `text` belong to each Unicode script.

    Returns a Counter keyed by the script code that `script()` reports for
    a character (e.g. 'Cyrl', 'Latn', 'Zyyy'); each value is the number of
    characters in `text` that carry that code.
    """
    per_script = Counter()
    # Look every distinct character up only once and weight it by how often
    # it occurs, instead of querying the script for each position in `text`.
    for symbol, occurrences in Counter(text).items():
        per_script[script(symbol)] += occurrences
    return per_script
def detect_script_ratios(text):
    """Return the fraction of `text` that each script accounts for.

    The result is a dict mapping script names (as resolved by
    `script_name`, falling back to "Unknown") to the share of characters
    in `text` that belong to that script. An empty `text` yields an empty
    dict.
    """
    counts = detect_scripts(text)
    n_chars = sum(counts.values())
    ratios = {}
    for code, count in counts.items():
        ratios[script_name(code, default="Unknown")] = count / n_chars
    return ratios
def get_majority_script(text):
    """Return the name of the dominant script in `text`, ignoring "Common".

    Only the three most frequent scripts are inspected, from most to least
    frequent. The first one whose name resolves and is not "Common" is
    returned; if none qualifies, the result is None.
    """
    ranked = detect_scripts(text).most_common(3)
    for code, _count in ranked:
        name = script_name(code, default=None)
        # Skip codes without a name as well as the catch-all "Common" bucket.
        if name is None or name == "Common":
            continue
        return name
    return None
if __name__ == "__main__":
    # Demo text taken from uk.wikipedia.org
    sample = """21 квітня 2021 р. Pfizer заявляє, що їй відомо про підроблені версіях своєї
вакцини COVID-19, які виробник ліків розробив за допомогою BioNTech, оскільки
злочинці прагнуть нажитися на світовому попиті на вакцини, який продовжує
випереджати пропозицію."""

    # Run each helper on the sample in turn and print its result.
    # Expected output, in order:
    #   Counter({'Zyyy': 97, 'Cyrl': 183, 'Latn': 19})
    #   {'Common': 0.32441471571906355, 'Cyrillic': 0.6120401337792643, 'Latin': 0.06354515050167224}
    #   Cyrillic
    for report in (detect_scripts, detect_script_ratios, get_majority_script):
        print(report(sample))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment