Last active
August 8, 2023 07:06
-
-
Save andjc/760e2e711716076a7799794b40fce33c to your computer and use it in GitHub Desktop.
For a given text (string), identify and count occurrences of character-based or grapheme-based ngraphs in the text
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter | |
import regex | |
class ngraphs:
    """Count the ngraphs (character or grapheme n-grams) occurring in a string.

    Attributes
    ----------
    text: str
        The plain text string to be analysed. Read-only; create a new
        instance to analyse a different string.
    size: int
        Size of ngraph. 2 = digraph, 3 = trigraph, etc. Defaults to 2.
        Must be at least 1.
    filter: bool
        Drop every ngraph that contains a punctuation or separator character
        (Unicode general categories P* and Z*). Defaults to False.
    count: int
        Number of ngraphs returned by most_common(). Defaults to 10.
    graphemes: bool
        Whether ngraphs are built from graphemes (as matched by the regex
        ``\\X`` pattern) rather than from individual characters.
        Defaults to False.
    data: dict
        Mapping of ngraph -> number of occurrences, most frequent first.
        Recomputed from the current settings on every access, so changing
        size, filter or graphemes is reflected immediately.

    Methods
    -------
    most_common(value=None)
        Dictionary containing the _count_ most frequent ngraphs and their
        number of occurrences.
    to_list()
        List of the distinct ngraphs generated from _text_.
    to_tuples()
        List of (ngraph, occurrences) tuples.
    ngraph_length()
        Number of distinct ngraphs.
    text_length()
        Number of characters in _text_.
    total()
        Total number of ngraph occurrences counted.
    """

    def __init__(self, text, size=2, filter=False, count=10, graphemes=False):
        self._text = text
        self.size = size
        self.filter = filter
        self.count = count
        self.graphemes = graphemes
        # Populate the cached copy of the frequency table once up front.
        self._data = self._frequency()

    @property
    def data(self):
        """Frequency table for the current settings (recomputed on access)."""
        self._data = self._frequency()
        return self._data

    @property
    def text(self):
        """The analysed text (read-only)."""
        return self._text

    @text.setter
    def text(self, value):
        raise AttributeError("Cannot set text. Require new instance of ngraphs.")

    @property
    def size(self):
        """Number of characters (or graphemes) per ngraph."""
        return self._size

    @size.setter
    def size(self, value):
        if value < 1:
            raise ValueError("size must be at least 1")
        self._size = value

    @property
    def filter(self):
        """Whether ngraphs containing punctuation or separators are dropped."""
        return self._filter

    @filter.setter
    def filter(self, value):
        self._filter = value

    @property
    def grapheme(self):
        """Read-only alias of the ``graphemes`` flag."""
        return self.graphemes

    @grapheme.setter
    def grapheme(self, value):
        raise AttributeError("Cannot set grapheme. Require new instance of ngraphs.")

    @property
    def count(self):
        """Number of entries returned by most_common()."""
        return self._count

    @count.setter
    def count(self, value):
        self._count = value

    def __str__(self):
        return f"size: {self.size} , filter: {self.filter} , count: {self.count}"

    def _frequency(self):
        """Return {ngraph: occurrences}, ordered by descending occurrences.

        Ngraphs with equal counts keep the order of their first appearance in
        the text (the sort is stable).
        """
        size = self.size
        if self.graphemes:
            clusters = regex.findall(r'\X', self.text)
            # Slide a window of `size` graphemes; only full windows are counted.
            windows = (
                "".join(clusters[idx:idx + size])
                for idx in range(len(clusters) - size + 1)
            )
        else:
            text = self.text
            # Slide a window of `size` characters; only full windows are counted.
            windows = (
                text[idx:idx + size]
                for idx in range(len(text) - size + 1)
            )
        tally = Counter(windows)

        if self.filter:
            # Reject an ngraph if *any* of its code points is punctuation or a
            # separator. Searching the whole ngraph (rather than matching a
            # fixed-length prefix) also handles graphemes made of several
            # code points.
            has_excluded = regex.compile(r'[\p{P}\p{Z}]').search
            entries = [
                (ngraph, n) for ngraph, n in tally.items()
                if not has_excluded(ngraph)
            ]
        else:
            entries = tally.items()
        return dict(sorted(entries, key=lambda entry: entry[1], reverse=True))

    def most_common(self, value=None):
        """Return the most frequent ngraphs as {ngraph: occurrences}.

        If _value_ is given (and truthy) it replaces the instance's ``count``
        before the result is computed, so it also applies to later calls.
        """
        if value and value != self.count:
            self.count = value
        return dict(list(self.data.items())[:self.count])

    def to_list(self):
        """Return the distinct ngraphs as a list, most frequent first."""
        return list(self.data)

    def to_tuples(self):
        """Return the frequency table as a list of (ngraph, occurrences)."""
        return list(self.data.items())

    def ngraph_length(self):
        """Return the number of distinct ngraphs in the data."""
        return len(self.data)

    def text_length(self):
        """Return the length (number of characters) of the text."""
        return len(self.text)

    def total(self):
        """Return the total number of ngraph occurrences counted."""
        return sum(self.data.values())
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sample text (Amharic) used for the digraph/trigraph examples below.
s = "አንቀጽ፡፩፤ የሰው፡ልጅ፡ሁሉ፡ሲወለድ፡ነጻና፡በክብርና፡በመብትም፡እኩልነት፡ያለው፡ነው።፡የተፈጥሮ፡ማስተዋልና፡ሕሊና፡ስላለው፡አንዱ፡ሌላውን፡በወንድማማችነት፡መንፈስ፡መመልከት፡ይገባዋል። አንቀጽ፡፪፤ እያንዳንዱ፡ሰው፡የዘር፡የቀለም፡የጾታ፡የቋንቋ፡የሃይማኖት፡የፖለቲካ፡ወይም፡የሌላ፡ዓይነት፡አስተሳሰብ፡የብሔራዊ፡ወይም፡የኀብረተሰብ፡ታሪክ፡የሀብት፡የትውልድ፡ወይም፡የሌላ፡ደረጃ፡ልዩነት፡ሳይኖሩ፡በዚሁ፡ውሳኔ፡የተዘረዘሩት፡መብቶችንና፡ነጻነቶች፡ሁሉ፡እንዲከበሩለት፡ይገባል። ከዚህም፡በተቀረ፡አንድ፡ሰው፡ከሚኖርበት፡አገር፡ወይም፡ግዛት፡የፖለቲካ፡የአገዛዝ፡ወይም፡የኢንተርናሽናል፡አቋም፡የተነሳ፡አገሩ፡ነጻም፡ሆነ፡በሞግዚትነት፡አስተዳደር፡ወይም፡እራሱን፡ችሎ፡የማይተዳደር፡አገር፡ተወላጅ፡ቢሆንም፡በማንኛውም፡ዓይነት፡ገደብ፡ያለው፡አገዛዝ፡ሥር፡ቢሆንም፡ልዩነት፡አይፈጸምበትም።"
# size is not given, so digraphs (size 2) are counted; filter=True drops
# digraphs containing punctuation or spaces.
z = ngraphs(s, filter=True)
# 20 most frequent digraphs
common_digraphs = z.most_common(20)
print(common_digraphs)
# {'ነት': 7, 'ወይ': 6, 'ይም': 6, 'አገ': 5, 'አን': 4, 'ሰው': 3, 'ነጻ': 3, 'ለው': 3, 'የተ': 3, 'ስተ': 3, 'ሌላ': 3, 'ንቀ': 2, 'ቀጽ': 2, 'ሁሉ': 2, 'ርና': 2, 'መብ': 2, 'ብት': 2, 'ትም': 2, 'ያለ': 2, 'ዋል': 2} | |
# | |
# Trigraphs including trigraphs with whitespace or punctuation. | |
# | |
# Reconfigure the same instance: trigraphs, punctuation/whitespace kept,
# and 15 results from most_common().
z.size = 3
z.filter = False
z.count = 15
# 15 most frequent trigraphs
common_trigraphs = z.most_common()
print(common_trigraphs)
# {'ነት፡': 7, 'ም፡የ': 6, '፡ወይ': 6, 'ወይም': 6, 'ይም፡': 6, 'ት፡አ': 4, '፡አገ': 4, 'ሰው፡': 3, '፡ነጻ': 3, 'ለው፡': 3, '፡የተ': 3, 'ት፡የ': 3, 'አንቀ': 2, 'ንቀጽ': 2, 'ቀጽ፡': 2} | |
# | |
# Trigraphs excluding trigraphs with whitespace or punctuation | |
# | |
# Turn filtering back on; size (3) and count (15) are unchanged.
z.filter = True
# 15 most frequent trigraphs
common_trigraphs = z.most_common()
print(common_trigraphs)
# {'ወይም': 6, 'አንቀ': 2, 'ንቀጽ': 2, 'ያለው': 2, 'ይገባ': 2, 'የፖለ': 2, 'ፖለቲ': 2, 'ለቲካ': 2, 'የሌላ': 2, 'ዓይነ': 2, 'ይነት': 2, 'አስተ': 2, 'ልዩነ': 2, 'ዩነት': 2, 'አገር': 2} | |
# Full frequency table for the current settings (filtered trigraphs).
print(z.data)
# {'ወይም': 6, 'አንቀ': 2, 'ንቀጽ': 2, 'ያለው': 2, 'ይገባ': 2, 'የፖለ': 2, 'ፖለቲ': 2, 'ለቲካ': 2, 'የሌላ': 2, 'ዓይነ': 2, 'ይነት': 2, 'አስተ': 2, 'ልዩነ': 2, 'ዩነት': 2, 'አገር': 2, 'አገዛ': 2, 'ገዛዝ': 2, 'ተዳደ': 2, 'ዳደር': 2, 'ቢሆን': 2, 'ሆንም': 2, 'የሰው': 1, 'ሲወለ': 1, 'ወለድ': 1, 'ነጻና': 1, 'በክብ': 1, 'ክብር': 1, 'ብርና': 1, 'በመብ': 1, 'መብት': 1, 'ብትም': 1, 'እኩል': 1, 'ኩልነ': 1, 'ልነት': 1, 'የተፈ': 1, 'ተፈጥ': 1, 'ፈጥሮ': 1, 'ማስተ': 1, 'ስተዋ': 1, 'ተዋል': 1, 'ዋልና': 1, 'ሕሊና': 1, 'ስላለ': 1, 'ላለው': 1, 'አንዱ': 1, 'ሌላው': 1, 'ላውን': 1, 'በወን': 1, 'ወንድ': 1, 'ንድማ': 1, 'ድማማ': 1, 'ማማች': 1, 'ማችነ': 1, 'ችነት': 1, 'መንፈ': 1, 'ንፈስ': 1, 'መመል': 1, 'መልከ': 1, 'ልከት': 1, 'ገባዋ': 1, 'ባዋል': 1, 'እያን': 1, 'ያንዳ': 1, 'ንዳን': 1, 'ዳንዱ': 1, 'የዘር': 1, 'የቀለ': 1, 'ቀለም': 1, 'የጾታ': 1, 'የቋን': 1, 'ቋንቋ': 1, 'የሃይ': 1, 'ሃይማ': 1, 'ይማኖ': 1, 'ማኖት': 1, 'ስተሳ': 1, 'ተሳሰ': 1, 'ሳሰብ': 1, 'የብሔ': 1, 'ብሔራ': 1, 'ሔራዊ': 1, 'የኀብ': 1, 'ኀብረ': 1, 'ብረተ': 1, 'ረተሰ': 1, 'ተሰብ': 1, 'ታሪክ': 1, 'የሀብ': 1, 'ሀብት': 1, 'የትው': 1, 'ትውል': 1, 'ውልድ': 1, 'ደረጃ': 1, 'ሳይኖ': 1, 'ይኖሩ': 1, 'በዚሁ': 1, 'ውሳኔ': 1, 'የተዘ': 1, 'ተዘረ': 1, 'ዘረዘ': 1, 'ረዘሩ': 1, 'ዘሩት': 1, 'መብቶ': 1, 'ብቶች': 1, 'ቶችን': 1, 'ችንና': 1, 'ነጻነ': 1, 'ጻነቶ': 1, 'ነቶች': 1, 'እንዲ': 1, 'ንዲከ': 1, 'ዲከበ': 1, 'ከበሩ': 1, 'በሩለ': 1, 'ሩለት': 1, 'ገባል': 1, 'ከዚህ': 1, 'ዚህም': 1, 'በተቀ': 1, 'ተቀረ': 1, 'አንድ': 1, 'ከሚኖ': 1, 'ሚኖር': 1, 'ኖርበ': 1, 'ርበት': 1, 'ግዛት': 1, 'የአገ': 1, 'የኢን': 1, 'ኢንተ': 1, 'ንተር': 1, 'ተርና': 1, 'ርናሽ': 1, 'ናሽና': 1, 'ሽናል': 1, 'አቋም': 1, 'የተነ': 1, 'ተነሳ': 1, 'አገሩ': 1, 'ነጻም': 1, 'በሞግ': 1, 'ሞግዚ': 1, 'ግዚት': 1, 'ዚትነ': 1, 'ትነት': 1, 'ስተዳ': 1, 'እራሱ': 1, 'ራሱን': 1, 'የማይ': 1, 'ማይተ': 1, 'ይተዳ': 1, 'ተወላ': 1, 'ወላጅ': 1, 'በማን': 1, 'ማንኛ': 1, 'ንኛው': 1, 'ኛውም': 1, 'ገደብ': 1, 'አይፈ': 1, 'ይፈጸ': 1, 'ፈጸም': 1, 'ጸምበ': 1, 'ምበት': 1, 'በትም': 1} | |
# List of the distinct trigraphs, most frequent first. As a bare expression
# the value is only displayed in an interactive session or notebook.
z.to_list()
# ['ወይም', 'አንቀ', 'ንቀጽ', 'ያለው', 'ይገባ', 'የፖለ', 'ፖለቲ', 'ለቲካ', 'የሌላ', 'ዓይነ', 'ይነት', 'አስተ', 'ልዩነ', 'ዩነት', 'አገር', 'አገዛ', 'ገዛዝ', 'ተዳደ', 'ዳደር', 'ቢሆን', 'ሆንም', 'የሰው', 'ሲወለ', 'ወለድ', 'ነጻና', 'በክብ', 'ክብር', 'ብርና', 'በመብ', 'መብት', 'ብትም', 'እኩል', 'ኩልነ', 'ልነት', 'የተፈ', 'ተፈጥ', 'ፈጥሮ', 'ማስተ', 'ስተዋ', 'ተዋል', 'ዋልና', 'ሕሊና', 'ስላለ', 'ላለው', 'አንዱ', 'ሌላው', 'ላውን', 'በወን', 'ወንድ', 'ንድማ', 'ድማማ', 'ማማች', 'ማችነ', 'ችነት', 'መንፈ', 'ንፈስ', 'መመል', 'መልከ', 'ልከት', 'ገባዋ', 'ባዋል', 'እያን', 'ያንዳ', 'ንዳን', 'ዳንዱ', 'የዘር', 'የቀለ', 'ቀለም', 'የጾታ', 'የቋን', 'ቋንቋ', 'የሃይ', 'ሃይማ', 'ይማኖ', 'ማኖት', 'ስተሳ', 'ተሳሰ', 'ሳሰብ', 'የብሔ', 'ብሔራ', 'ሔራዊ', 'የኀብ', 'ኀብረ', 'ብረተ', 'ረተሰ', 'ተሰብ', 'ታሪክ', 'የሀብ', 'ሀብት', 'የትው', 'ትውል', 'ውልድ', 'ደረጃ', 'ሳይኖ', 'ይኖሩ', 'በዚሁ', 'ውሳኔ', 'የተዘ', 'ተዘረ', 'ዘረዘ', 'ረዘሩ', 'ዘሩት', 'መብቶ', 'ብቶች', 'ቶችን', 'ችንና', 'ነጻነ', 'ጻነቶ', 'ነቶች', 'እንዲ', 'ንዲከ', 'ዲከበ', 'ከበሩ', 'በሩለ', 'ሩለት', 'ገባል', 'ከዚህ', 'ዚህም', 'በተቀ', 'ተቀረ', 'አንድ', 'ከሚኖ', 'ሚኖር', 'ኖርበ', 'ርበት', 'ግዛት', 'የአገ', 'የኢን', 'ኢንተ', 'ንተር', 'ተርና', 'ርናሽ', 'ናሽና', 'ሽናል', 'አቋም', 'የተነ', 'ተነሳ', 'አገሩ', 'ነጻም', 'በሞግ', 'ሞግዚ', 'ግዚት', 'ዚትነ', 'ትነት', 'ስተዳ', 'እራሱ', 'ራሱን', 'የማይ', 'ማይተ', 'ይተዳ', 'ተወላ', 'ወላጅ', 'በማን', 'ማንኛ', 'ንኛው', 'ኛውም', 'ገደብ', 'አይፈ', 'ይፈጸ', 'ፈጸም', 'ጸምበ', 'ምበት', 'በትም'] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Text in African languages may contain a mixture of decomposed and
# precomposed characters, so character-based ngraphs may contain unpaired
# combining diacritics.
t = 'dɛ̈tëicëkäŋ akɔ̈ɔ̈n'
tc = ngraphs(t)
print(tc.data)
# {'ɔ̈': 2, 'dɛ': 1, 'ɛ̈': 1, '̈t': 1, 'të': 1, 'ëi': 1, 'ic': 1, 'cë': 1, 'ëk': 1, 'kä': 1, | |
# 'äŋ': 1, 'ŋ ': 1, ' a': 1, 'ak': 1, 'kɔ': 1, '̈ɔ': 1, '̈n': 1} | |
# If we change from character-based ngraphs to grapheme-based ngraphs, we
# sidestep the issue with unpaired diacritics.
tg = ngraphs(t, graphemes=True)
print(tg.data)
# {'dɛ̈': 1, 'ɛ̈t': 1, 'të': 1, 'ëi': 1, 'ic': 1, 'cë': 1, 'ëk': 1, 'kä': 1, 'äŋ': 1, 'ŋ ': 1, | |
# ' a': 1, 'ak': 1, 'kɔ̈': 1, 'ɔ̈ɔ̈': 1, 'ɔ̈n': 1} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Grapheme support and complete documentation to be added.