Based on python code posted on LinkedIn by Alekya D.
Using a tweaked version of Alice in Wonderland and the Dinka Padang translation of the UDHR
Refer to gists on graphemes and isalpha
import collections
import regex as re
def isalpha_(text):
    """Unicode-aware alphabetic test for a grapheme cluster.

    A single character qualifies if it is Alphabetic, a combining mark
    (Mn/Mc), or the middle dot; a longer cluster must begin with an
    Alphabetic character followed only by such characters.
    """
    if len(text) == 1:
        pattern = r'[\p{Alphabetic}\p{Mn}\p{Mc}·]'
    else:
        pattern = r'^\p{Alphabetic}[\p{Alphabetic}\p{Mn}\p{Mc}·]*$'
    return re.match(pattern, text) is not None
# Tokenise into grapheme clusters (\X) rather than code points so that
# combining sequences count as single letters. `with` closes the file
# (the original leaked the handle) and UTF-8 is made explicit rather
# than relying on the platform default encoding.
with open("alice_in_wonderland.txt", "r", encoding="utf-8") as f:
    chars = re.findall(r'\X', f.read().lower())
# Keep only alphabetic graphemes and tally their frequencies.
charFreq = dict(collections.Counter(filter(isalpha_, chars)))
print("Most frequent letter: ", max(charFreq, key=charFreq.get))
print("Most infrequent letter: ", min(charFreq, key=charFreq.get))
print("\nThe frequency of English letters:", charFreq)
# print("\nAlphabet: ", {key:charFreq[key] for key in sorted(charFreq.keys())})
The Python str.lower()
casing operation is language-invariant: it does not implement language-specific tailorings to casing.
In order to improve lowercasing we can use PyICU, create a Locale object and lowercase text using the desired locale:
import collections
import regex as re  # third-party "regex": supports \p{...} properties and \X grapheme matching
from icu import Locale, UnicodeString  # PyICU bindings for locale-aware case mapping
# Locale driving ICU's tailored lowercasing; change the locale ID as needed.
DEFAULT_LOCALE = Locale("en_AU")
# ROOT_LOCALE = Locale.getRoot()
def isalpha_(text):
    """Unicode-aware alphabetic test for a grapheme cluster.

    A single character qualifies if it is Alphabetic, a combining mark
    (Mn/Mc), or the middle dot; a longer cluster must begin with an
    Alphabetic character followed only by such characters.
    """
    if len(text) == 1:
        pattern = r'[\p{Alphabetic}\p{Mn}\p{Mc}·]'
    else:
        pattern = r'^\p{Alphabetic}[\p{Alphabetic}\p{Mn}\p{Mc}·]*$'
    return re.match(pattern, text) is not None
# Read the raw text; lowercasing is deferred to ICU below so the
# locale's casing tailorings apply. `with` closes the file (the
# original leaked the handle) and UTF-8 is made explicit.
with open("alice_in_wonderland.txt", "r", encoding="utf-8") as f:
    chars = re.findall(r'\X', f.read())
# Locale-aware lowercasing of each grapheme cluster via ICU.
chars = [str(UnicodeString(x).toLower(DEFAULT_LOCALE)) for x in chars]
charFreq = dict(collections.Counter(filter(isalpha_, chars)))
print("Most frequent letter: ", max(charFreq, key=charFreq.get))
print("Most infrequent letter: ", min(charFreq, key=charFreq.get))
print("\nThe frequency of English letters:", charFreq)
# print("\nAlphabet: ", {key:charFreq[key] for key in sorted(charFreq.keys())})
A further enhancement would be an option to choose whether you count graphemes or characters:
import collections
import regex as re  # third-party "regex": supports \p{...} properties and \X grapheme matching
from icu import Locale, UnicodeString  # PyICU bindings for locale-aware case mapping
# Locale driving ICU's tailored lowercasing; change the locale ID as needed.
DEFAULT_LOCALE = Locale("en_AU")
# ROOT_LOCALE = Locale.getRoot()
def isalpha_(text):
    """Unicode-aware alphabetic test for a grapheme cluster.

    A single character qualifies if it is Alphabetic, a combining mark
    (Mn/Mc), or the middle dot; a longer cluster must begin with an
    Alphabetic character followed only by such characters.
    """
    if len(text) == 1:
        pattern = r'[\p{Alphabetic}\p{Mn}\p{Mc}·]'
    else:
        pattern = r'^\p{Alphabetic}[\p{Alphabetic}\p{Mn}\p{Mc}·]*$'
    return re.match(pattern, text) is not None
def letter_frequency(doc, grapheme=True):
    r"""Return a dict mapping each letter in *doc* to its frequency.

    doc: the text to analyse.
    grapheme: when True, tokenise into grapheme clusters (\X);
        otherwise count individual code points.
    """
    # list(doc) replaces the redundant [char for char in doc] copy.
    chars = re.findall(r'\X', doc) if grapheme else list(doc)
    # Locale-aware lowercasing via ICU before counting.
    chars = [str(UnicodeString(x).toLower(DEFAULT_LOCALE)) for x in chars]
    # Pass isalpha_ directly instead of wrapping it in a lambda.
    return dict(collections.Counter(filter(isalpha_, chars)))
# Explicit UTF-8 avoids depending on the platform's default encoding.
with open('alice_in_wonderland.txt', encoding="utf-8") as f:
    doc = f.read()
# grapheme=False here counts individual code points, not clusters.
charFreq = letter_frequency(doc, grapheme=False)
print("Most frequent letter: ", max(charFreq, key=charFreq.get))
print("Most infrequent letter: ", min(charFreq, key=charFreq.get))
print("\nThe frequency of English letters:", charFreq)
# print("\nAlphabet: ", {key:charFreq[key] for key in sorted(charFreq.keys())})
An attempt to handle languages that have digraphs or trigraphs as alphabetic characters:
from collections import OrderedDict, Counter
import regex as re  # third-party "regex": \p{...} properties and \X graphemes
from icu import Locale, UnicodeString, Collator  # PyICU bindings

#LOCALE_ID = "en_AU"
LOCALE_ID = "din_SS"

# Fall back to the root locale when ICU has no data for the requested ID.
# (`in mapping` replaces the redundant `in mapping.keys()` membership test.)
if LOCALE_ID in Locale.getAvailableLocales():
    DEFAULT_LOCALE = Locale(LOCALE_ID)
else:
    DEFAULT_LOCALE = Locale.getRoot()

# Collator used later to sort letters in the locale's alphabetic order;
# again the root collator is the fallback.
if LOCALE_ID in Collator.getAvailableLocales():
    collator = Collator.createInstance(Locale(LOCALE_ID))
else:
    collator = Collator.createInstance(Locale.getRoot())
# Per-language mapping of digraphs to single-grapheme IPA-style stand-ins,
# so that multigraph letters (e.g. Dinka doubled vowels, "dh", "ny") are
# counted as one letter after \X tokenisation. Each IPA value must render
# as a single grapheme cluster. Add further languages as needed.
# NOTE(review): insertion order matters for replacement — doubled forms
# precede related shorter keys; verify when extending the map.
ngraph_ipa_map = {
    "din": {
        "ngraphs": {
            "ää": 'ā̤',
            "aa": "ā",
            "ëë": "ē̤",
            "ee": "ē",
            "ɛ̈ɛ̈": "ɛ̤̄",
            "ɛɛ": "ɛ̄",
            "ïï": "ī̤",
            "ii": "ī",
            "öö": "ō̤",
            "oo": "ō",
            "ɔ̈ɔ̈": "ɔ̤̄",
            "ɔɔ": "ɔ̄",
            "uu": "ū",
            "dh": "d̪",
            "nh": "n̪",
            "ny": "ɲ",
            "th": "t̪"
        },
        # Placeholder for regex-based mappings (unused here).
        "regex": {}
    }
}
def apply_map(s, map, reverse=False):
    """Substitute each multigraph in *s* with its IPA stand-in.

    map: key into ngraph_ipa_map selecting the language table.
    reverse: when True, run the substitution the other way, restoring
        the original multigraphs from the IPA forms.
    """
    # Plain dicts preserve insertion order, so iterating the table
    # directly keeps the replacement sequence stable.
    for ngraph, ipa in ngraph_ipa_map[map]["ngraphs"].items():
        if reverse:
            s = s.replace(ipa, ngraph)
        else:
            s = s.replace(ngraph, ipa)
    return s
def isalpha_(text):
    """Unicode-aware alphabetic test for a grapheme cluster.

    A single character qualifies if it is Alphabetic, a combining mark
    (Mn/Mc), or the middle dot; a longer cluster must begin with an
    Alphabetic character followed only by such characters.
    """
    if len(text) == 1:
        pattern = r'[\p{Alphabetic}\p{Mn}\p{Mc}·]'
    else:
        pattern = r'^\p{Alphabetic}[\p{Alphabetic}\p{Mn}\p{Mc}·]*$'
    return re.match(pattern, text) is not None
def letter_frequency(doc, grapheme=True, map=None):
    r"""Return a dict mapping each letter in *doc* to its frequency.

    doc: the text to analyse.
    grapheme: when True, tokenise into grapheme clusters (\X);
        otherwise count individual code points.
    map: key into ngraph_ipa_map. When given, multigraphs are folded to
        single graphemes before tokenising and restored afterwards, so
        digraphs/trigraphs count as one letter. None disables mapping.
        (The name shadows the builtin `map`; kept for interface
        compatibility with existing callers.)
    """
    if map:
        doc = apply_map(doc, map)
    # list(doc) replaces the redundant [char for char in doc] copy.
    chars = re.findall(r'\X', doc) if grapheme else list(doc)
    if map:
        # Lowercase with ICU, then map the IPA stand-ins back to the
        # original orthography for reporting.
        chars = [apply_map(str(UnicodeString(x).toLower(DEFAULT_LOCALE)), map, reverse=True)
                 for x in chars]
    else:
        chars = [str(UnicodeString(x).toLower(DEFAULT_LOCALE)) for x in chars]
    # Pass isalpha_ directly instead of wrapping it in a lambda.
    return dict(Counter(filter(isalpha_, chars)))
# text_file = "alice_in_wonderland.txt"
# ngraph_map = None
text_file = 'udhr_dip.txt'
# Renamed from `map` so the builtin is not shadowed at module level;
# the keyword argument below still targets letter_frequency's `map`.
ngraph_map = "din"
# Explicit UTF-8 avoids depending on the platform's default encoding.
with open(text_file, encoding="utf-8") as f:
    doc = f.read()
charFreq = letter_frequency(doc, grapheme=True, map=ngraph_map)
# Report every letter tied at the extremes, not just an arbitrary one.
max_value = max(charFreq.values())
print("Most frequent letter(s): ", [k for k, v in charFreq.items() if v == max_value])
min_value = min(charFreq.values())
print("Most infrequent letter(s): ", [k for k, v in charFreq.items() if v == min_value])
print("\nThe frequency of the letters:", str({k: v for k, v in sorted(charFreq.items(), key=lambda item: item[1], reverse=True)}))
def add_counts(s, l):
    """Return the combined frequency of the letters *s* and *l*.

    Looks both keys up in the module-level charFreq dict; a missing
    key contributes 0, so the result is 0 when neither occurs.
    (Replaces the original four-branch `in charFreq.keys()` ladder.)
    """
    return charFreq.get(s, 0) + charFreq.get(l, 0)
# Optional report collapsing vowel length: each entry sums the short and
# doubled (long) spellings of a Dinka vowel via add_counts().
# din_vowels = {
# "a": add_counts("a", "aa"),
# "ä": add_counts("ä", "ää"),
# "e": add_counts("e", "ee"),
# "ë": add_counts("ë", "ëë"),
# "ɛ": add_counts("ɛ", "ɛɛ"),
# "ɛ̈": add_counts("ɛ̈", "ɛ̈ɛ̈"),
# "i": add_counts("i", "ii"),
# "ï": add_counts("ï", "ïï"),
# "o": add_counts("o", "oo"),
# "ö": add_counts("ö", "öö"),
# "ɔ": add_counts("ɔ", "ɔɔ"),
# "ɔ̈": add_counts("ɔ̈", "ɔ̈ɔ̈"),
# "u": add_counts("u", "uu")
# }
# print("\nVowel frequency (length collapsed): ", din_vowels)
# Sort keys with the ICU collator so output follows the locale's alphabet.
print("\nFrequency of letters in alphabetic order: ", {key:charFreq[key] for key in sorted(charFreq.keys(), key=collator.getSortKey)})
ngraph_ipa_map
provides a mapping between digraphs and their IPA value. The IPA value needs to equate to a single grapheme. It is necessary to add new languages as required. If a map isn't specified, or is set to None,
then the standard grapheme or character tokenisation is used.
Very useful! Thank you @andjc