KLPT test
Gist mhmd-azeez/21770e507931bf3eb965ade9d5eebecf, last active January 6, 2022.
Output of the script below (token -> stem):

```
▁ئێمە▁ش -> ▁ئێمە▁
دەچینەوە -> چ
▁بۆ▁ -> ▁بۆ▁
ولاتی -> ولات
▁خۆم▁ان -> ▁خۆم▁
لێشیانخۆشبووین -> شیانخۆشبوو
لێشتانخۆشبووم -> شتانخۆشبوو
```
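The stems above carry `▁` characters (U+2581, LOWER ONE EIGHTH BLOCK, commonly used as a subword/morpheme boundary marker). If you only want the bare stem, a minimal stdlib-only helper can strip them; `strip_markers` is a hypothetical name, not part of the KLPT API:

```python
def strip_markers(stem: str) -> str:
    """Remove U+2581 boundary markers from a stemmer output string."""
    return stem.replace("\u2581", "")

# Example: "▁ئێمە▁" becomes the bare stem "ئێمە"
print(strip_markers("▁ئێمە▁"))
```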
```python
from klpt.stem import Stem
from klpt.tokenize import Tokenize
from klpt.preprocess import Preprocess

text = """
ئێمەش دەچینەوە بۆ ولاتی خۆمان
لێشیانخۆشبووین
لێشتانخۆشبووم
"""

# Normalize, standardize, and unify numerals for Sorani Kurdish in Arabic script
preprocessor = Preprocess("Sorani", "Arabic", numeral="Latin")
text = preprocessor.normalize(text)
text = preprocessor.standardize(text)
text = preprocessor.unify_numerals(text)

# Tokenize the preprocessed text and print each token with its first stem candidate
stemmer = Stem("Sorani", "Arabic")
tokenizer = Tokenize("Sorani", "Arabic")
for token in tokenizer.word_tokenize(text):
    print(token, '->', stemmer.stem(token)[0])
```