Last active
September 14, 2024 13:54
-
-
Save prio101/a2c631f80d56571f854a11e48d7d7b46 to your computer and use it in GitHub Desktop.
Splits mixed text into Bangla (bn) and English (en) parts only.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from langdetect import detect_langs | |
from collections import defaultdict | |
def split_english_bangla(mixed_string: str) -> dict:
    """Split a mixed English/Bangla string into per-language text.

    The input is split on whitespace and every token is classified on its
    own with ``detect_langs``. A token whose top-ranked language is ``'bn'``
    is filed under ``'bn'``; a token detected as any other language is filed
    under ``'en'``; a token for which detection raises is filed under
    ``'unknown'``.

    Args:
        mixed_string: Text that may contain English and Bangla words.

    Returns:
        A dict mapping each label that actually occurred (``'bn'``, ``'en'``,
        ``'unknown'``) to its tokens joined by single spaces, in their
        original order. Keys appear in order of first occurrence. Empty or
        whitespace-only input yields an empty dict.
    """
    # Collect tokens per label in lists and join once at the end instead of
    # growing strings with repeated concatenation.
    words_by_label = defaultdict(list)

    for word in mixed_string.split():
        try:
            # The first entry returned by detect_langs is the most confident.
            top_lang = detect_langs(word)[0].lang
        except Exception:
            # Best-effort fallback: a token that cannot be classified is kept
            # under 'unknown' rather than aborting the whole split. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            words_by_label['unknown'].append(word)
            continue

        # Anything that is not Bangla is treated as English.
        label = 'bn' if top_lang == 'bn' else 'en'
        words_by_label[label].append(word)

    return {label: " ".join(words) for label, words in words_by_label.items()}
# Demo input that interleaves English and Bangla sentences
mixed_string = "Hello world! এই পৃথিবী সুন্দর। How are you? তুমি কেমন আছো?"

# Bucket the words by detected language and show the resulting mapping
result = split_english_bangla(mixed_string)
print(result)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment