Created
April 16, 2024 03:44
-
-
Save ar-pa/3222e1f2679ec540a8070f972ed96e90 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from collections import Counter | |
from pyarabic import araby | |
from pyarabic.araby import strip_diacritics | |
# Load the Quran data set from quran.json in the current working directory.
# The file holds Arabic text, so decode it explicitly as UTF-8 instead of
# relying on the platform's default encoding, and use a context manager so the
# file handle is closed as soon as parsing finishes.
with open('quran.json', encoding='utf-8') as quran_file:
    quran = json.load(quran_file)
def normalize_arabic(text):
    """Return *text* with diacritics removed and letter variants folded.

    The text is first passed through pyarabic's ``strip_tashkeel``,
    ``strip_tatweel`` and ``strip_diacritics`` (in that order). Afterwards the
    Alif variants are reduced to a bare Alif, Alif Maqsura becomes Ya,
    Ta Marbuta becomes Ha, and the hamza-carrying Ya / Waw become plain
    Ya / Waw.
    """
    # Run the pyarabic clean-up passes one after another.
    for strip_pass in (
        araby.strip_tashkeel,    # diacritics
        araby.strip_tatweel,     # tatweel (kashida)
        araby.strip_diacritics,
    ):
        text = strip_pass(text)

    # Every key is a single character and no replacement is itself a key, so
    # one translate() pass gives the same result as replacing them in turn.
    folding_table = str.maketrans({
        'آ': 'ا',
        'أ': 'ا',
        'إ': 'ا',
        'ٱ': 'ا',
        'ى': 'ي',
        'ة': 'ه',
        'ئ': 'ي',
        'ؤ': 'و',
    })
    return text.translate(folding_table)
# For every chapter whose normalized first verse is at most 8 characters long,
# print the chapter name with that verse, then the five most frequent
# characters of the whole normalized chapter.
# NOTE(review): the length filter presumably selects chapters that open with
# disconnected letters -- confirm against the data.
for surah in quran:
    opening_verse = surah['verses'][0]['text']
    if len(normalize_arabic(opening_verse)) <= 8:
        print(surah['name'], opening_verse)
        chapter_text = ''.join(ayah['text'] for ayah in surah['verses'])
        char_counts = Counter(normalize_arabic(chapter_text))
        # Spaces are not of interest; Counter tolerates deleting a missing key.
        del char_counts[' ']
        # most_common(5) yields the same top five, in the same order, as a
        # stable descending sort by count followed by a [:5] slice.
        for symbol, occurrences in char_counts.most_common(5):
            print(f'Character: {symbol}, ord: {ord(symbol)}, Count: {occurrences}')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Download the Quran JSON file from: https://cdn.jsdelivr.net/npm/quran-json/dist/quran.json
This was an attempt to check whether رشاد خلیفه (Rashad Khalifa) was right.