Based on python code posted on LinkedIn by Alekya D.
Using a tweaked version of Alice in Wonderland and the Dinka Padang translation of the UDHR
Refer to gists on graphemes and isalpha
import collections
import regex as re
def isalpha_(text):
    """Unicode-aware alphabetic test for a grapheme cluster.

    A single character qualifies if it is Alphabetic, a combining mark
    (Mn/Mc), or the middle dot; a longer cluster must begin with an
    Alphabetic character followed only by such characters.
    """
    if len(text) == 1:
        pattern = r'[\p{Alphabetic}\p{Mn}\p{Mc}·]'
    else:
        pattern = r'^\p{Alphabetic}[\p{Alphabetic}\p{Mn}\p{Mc}·]*$'
    return re.match(pattern, text) is not None
# Tokenise into grapheme clusters (\X) rather than code points so that
# combining sequences count as single letters. `with` closes the file
# (the original leaked the handle) and UTF-8 is made explicit rather
# than relying on the platform default encoding.
with open("alice_in_wonderland.txt", "r", encoding="utf-8") as f:
    chars = re.findall(r'\X', f.read().lower())
# Keep only alphabetic graphemes and tally their frequencies.
charFreq = dict(collections.Counter(filter(isalpha_, chars)))
print("Most frequent letter: ", max(charFreq, key=charFreq.get))
print("Most infrequent letter: ", min(charFreq, key=charFreq.get))
print("\nThe frequency of English letters:", charFreq)
# print("\nAlphabet: ", {key:charFreq[key] for key in sorted(charFreq.keys())})
The Python str.lower()
casing operation is language-invariant: it does not implement language-specific tailorings to casing.
In order to improve lowercasing we can use PyICU, create a Locale object and lowercase text using the desired locale:
import collections
import regex as re  # third-party "regex": supports \p{...} properties and \X grapheme matching
from icu import Locale, UnicodeString  # PyICU bindings for locale-aware case mapping
# Locale driving ICU's tailored lowercasing; change the locale ID as needed.
DEFAULT_LOCALE = Locale("en_AU")
# ROOT_LOCALE = Locale.getRoot()
def isalpha_(text):
    """Unicode-aware alphabetic test for a grapheme cluster.

    A single character qualifies if it is Alphabetic, a combining mark
    (Mn/Mc), or the middle dot; a longer cluster must begin with an
    Alphabetic character followed only by such characters.
    """
    if len(text) == 1:
        pattern = r'[\p{Alphabetic}\p{Mn}\p{Mc}·]'
    else:
        pattern = r'^\p{Alphabetic}[\p{Alphabetic}\p{Mn}\p{Mc}·]*$'
    return re.match(pattern, text) is not None
# Read the raw text; lowercasing is deferred to ICU below so the
# locale's casing tailorings apply. `with` closes the file (the
# original leaked the handle) and UTF-8 is made explicit.
with open("alice_in_wonderland.txt", "r", encoding="utf-8") as f:
    chars = re.findall(r'\X', f.read())
# Locale-aware lowercasing of each grapheme cluster via ICU.
chars = [str(UnicodeString(x).toLower(DEFAULT_LOCALE)) for x in chars]
charFreq = dict(collections.Counter(filter(isalpha_, chars)))
print("Most frequent letter: ", max(charFreq, key=charFreq.get))
print("Most infrequent letter: ", min(charFreq, key=charFreq.get))
print("\nThe frequency of English letters:", charFreq)
# print("\nAlphabet: ", {key:charFreq[key] for key in sorted(charFreq.keys())})
A further enhancement would be an option to choose whether you count graphemes or characters:
import collections
import regex as re  # third-party "regex": supports \p{...} properties and \X grapheme matching
from icu import Locale, UnicodeString  # PyICU bindings for locale-aware case mapping
# Locale driving ICU's tailored lowercasing; change the locale ID as needed.
DEFAULT_LOCALE = Locale("en_AU")
# ROOT_LOCALE = Locale.getRoot()
def isalpha_(text):
    """Unicode-aware alphabetic test for a grapheme cluster.

    A single character qualifies if it is Alphabetic, a combining mark
    (Mn/Mc), or the middle dot; a longer cluster must begin with an
    Alphabetic character followed only by such characters.
    """
    if len(text) == 1:
        pattern = r'[\p{Alphabetic}\p{Mn}\p{Mc}·]'
    else:
        pattern = r'^\p{Alphabetic}[\p{Alphabetic}\p{Mn}\p{Mc}·]*$'
    return re.match(pattern, text) is not None
def letter_frequency(doc, grapheme=True):
    r"""Return a dict mapping each letter in *doc* to its frequency.

    doc: the text to analyse.
    grapheme: when True, tokenise into grapheme clusters (\X);
        otherwise count individual code points.
    """
    # list(doc) replaces the redundant [char for char in doc] copy.
    chars = re.findall(r'\X', doc) if grapheme else list(doc)
    # Locale-aware lowercasing via ICU before counting.
    chars = [str(UnicodeString(x).toLower(DEFAULT_LOCALE)) for x in chars]
    # Pass isalpha_ directly instead of wrapping it in a lambda.
    return dict(collections.Counter(filter(isalpha_, chars)))
# Explicit UTF-8 avoids depending on the platform's default encoding.
with open('alice_in_wonderland.txt', encoding="utf-8") as f:
    doc = f.read()
# grapheme=False here counts individual code points, not clusters.
charFreq = letter_frequency(doc, grapheme=False)
print("Most frequent letter: ", max(charFreq, key=charFreq.get))
print("Most infrequent letter: ", min(charFreq, key=charFreq.get))
print("\nThe frequency of English letters:", charFreq)
# print("\nAlphabet: ", {key:charFreq[key] for key in sorted(charFreq.keys())})
An attempt to handle languages that have digraphs or trigraphs as alphabetic characters:
from collections import OrderedDict, Counter
import regex as re  # third-party "regex": \p{...} properties and \X graphemes
from icu import Locale, UnicodeString, Collator  # PyICU bindings

#LOCALE_ID = "en_AU"
LOCALE_ID = "din_SS"

# Fall back to the root locale when ICU has no data for the requested ID.
# (`in mapping` replaces the redundant `in mapping.keys()` membership test.)
if LOCALE_ID in Locale.getAvailableLocales():
    DEFAULT_LOCALE = Locale(LOCALE_ID)
else:
    DEFAULT_LOCALE = Locale.getRoot()

# Collator used later to sort letters in the locale's alphabetic order;
# again the root collator is the fallback.
if LOCALE_ID in Collator.getAvailableLocales():
    collator = Collator.createInstance(Locale(LOCALE_ID))
else:
    collator = Collator.createInstance(Locale.getRoot())
# Per-language mapping of digraphs to single-grapheme IPA-style stand-ins,
# so that multigraph letters (e.g. Dinka doubled vowels, "dh", "ny") are
# counted as one letter after \X tokenisation. Each IPA value must render
# as a single grapheme cluster. Add further languages as needed.
# NOTE(review): insertion order matters for replacement — doubled forms
# precede related shorter keys; verify when extending the map.
ngraph_ipa_map = {
    "din": {
        "ngraphs": {
            "ää": 'ā̤',
            "aa": "ā",
            "ëë": "ē̤",
            "ee": "ē",
            "ɛ̈ɛ̈": "ɛ̤̄",
            "ɛɛ": "ɛ̄",
            "ïï": "ī̤",
            "ii": "ī",
            "öö": "ō̤",
            "oo": "ō",
            "ɔ̈ɔ̈": "ɔ̤̄",
            "ɔɔ": "ɔ̄",
            "uu": "ū",
            "dh": "d̪",
            "nh": "n̪",
            "ny": "ɲ",
            "th": "t̪"
        },
        # Placeholder for regex-based mappings (unused here).
        "regex": {}
    }
}
def apply_map(s, map, reverse=False):
    """Substitute each multigraph in *s* with its IPA stand-in.

    map: key into ngraph_ipa_map selecting the language table.
    reverse: when True, run the substitution the other way, restoring
        the original multigraphs from the IPA forms.
    """
    # Plain dicts preserve insertion order, so iterating the table
    # directly keeps the replacement sequence stable.
    for ngraph, ipa in ngraph_ipa_map[map]["ngraphs"].items():
        if reverse:
            s = s.replace(ipa, ngraph)
        else:
            s = s.replace(ngraph, ipa)
    return s
def isalpha_(text):
    """Unicode-aware alphabetic test for a grapheme cluster.

    A single character qualifies if it is Alphabetic, a combining mark
    (Mn/Mc), or the middle dot; a longer cluster must begin with an
    Alphabetic character followed only by such characters.
    """
    if len(text) == 1:
        pattern = r'[\p{Alphabetic}\p{Mn}\p{Mc}·]'
    else:
        pattern = r'^\p{Alphabetic}[\p{Alphabetic}\p{Mn}\p{Mc}·]*$'
    return re.match(pattern, text) is not None
def letter_frequency(doc, grapheme=True, map=None):
    r"""Return a dict mapping each letter in *doc* to its frequency.

    doc: the text to analyse.
    grapheme: when True, tokenise into grapheme clusters (\X);
        otherwise count individual code points.
    map: key into ngraph_ipa_map. When given, multigraphs are folded to
        single graphemes before tokenising and restored afterwards, so
        digraphs/trigraphs count as one letter. None disables mapping.
        (The name shadows the builtin `map`; kept for interface
        compatibility with existing callers.)
    """
    if map:
        doc = apply_map(doc, map)
    # list(doc) replaces the redundant [char for char in doc] copy.
    chars = re.findall(r'\X', doc) if grapheme else list(doc)
    if map:
        # Lowercase with ICU, then map the IPA stand-ins back to the
        # original orthography for reporting.
        chars = [apply_map(str(UnicodeString(x).toLower(DEFAULT_LOCALE)), map, reverse=True)
                 for x in chars]
    else:
        chars = [str(UnicodeString(x).toLower(DEFAULT_LOCALE)) for x in chars]
    # Pass isalpha_ directly instead of wrapping it in a lambda.
    return dict(Counter(filter(isalpha_, chars)))
# text_file = "alice_in_wonderland.txt"
# ngraph_map = None
text_file = 'udhr_dip.txt'
# Renamed from `map` so the builtin is not shadowed at module level;
# the keyword argument below still targets letter_frequency's `map`.
ngraph_map = "din"
# Explicit UTF-8 avoids depending on the platform's default encoding.
with open(text_file, encoding="utf-8") as f:
    doc = f.read()
charFreq = letter_frequency(doc, grapheme=True, map=ngraph_map)
# Report every letter tied at the extremes, not just an arbitrary one.
max_value = max(charFreq.values())
print("Most frequent letter(s): ", [k for k, v in charFreq.items() if v == max_value])
min_value = min(charFreq.values())
print("Most infrequent letter(s): ", [k for k, v in charFreq.items() if v == min_value])
print("\nThe frequency of the letters:", str({k: v for k, v in sorted(charFreq.items(), key=lambda item: item[1], reverse=True)}))
def add_counts(s, l):
    """Return the combined frequency of the letters *s* and *l*.

    Looks both keys up in the module-level charFreq dict; a missing
    key contributes 0, so the result is 0 when neither occurs.
    (Replaces the original four-branch `in charFreq.keys()` ladder.)
    """
    return charFreq.get(s, 0) + charFreq.get(l, 0)
# Optional report collapsing vowel length: each entry sums the short and
# doubled (long) spellings of a Dinka vowel via add_counts().
# din_vowels = {
# "a": add_counts("a", "aa"),
# "ä": add_counts("ä", "ää"),
# "e": add_counts("e", "ee"),
# "ë": add_counts("ë", "ëë"),
# "ɛ": add_counts("ɛ", "ɛɛ"),
# "ɛ̈": add_counts("ɛ̈", "ɛ̈ɛ̈"),
# "i": add_counts("i", "ii"),
# "ï": add_counts("ï", "ïï"),
# "o": add_counts("o", "oo"),
# "ö": add_counts("ö", "öö"),
# "ɔ": add_counts("ɔ", "ɔɔ"),
# "ɔ̈": add_counts("ɔ̈", "ɔ̈ɔ̈"),
# "u": add_counts("u", "uu")
# }
# print("\nVowel frequency (length collapsed): ", din_vowels)
# Sort keys with the ICU collator so output follows the locale's alphabet.
print("\nFrequency of letters in alphabetic order: ", {key:charFreq[key] for key in sorted(charFreq.keys(), key=collator.getSortKey)})
ngraph_ipa_map
provides a mapping between digraphs and their IPA value. The IPA value needs to equate to a single grapheme. It is necessary to add new languages as required. If a map isn't specified, or is set to None,
then the standard grapheme or character tokenisation is used.
Very useful! Thank you @andjc