Last active
August 29, 2015 14:04
-
-
Save joom/1780cf61e7786ac7cf64 to your computer and use it in GitHub Desktop.
Turkish NLP: Split a word into its syllables
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Data.Maybe (isNothing, fromJust) | |
import Data.Char (isAlpha, toLower) | |
import Data.List (findIndex) | |
-- | Safe indexing into a string.
--
-- @charAt xs i@ is @Just c@ when @c@ is the character at zero-based index
-- @i@ of @xs@, and @Nothing@ when @i@ is out of bounds (negative, or not
-- smaller than the length of @xs@).
charAt :: String -> Int -> Maybe Char
charAt xs i
  -- '!!' throws on a negative index, so reject it up front.
  | i < 0     = Nothing
  | otherwise =
      -- 'drop' only walks the first @i@ cells, unlike 'length'.
      case drop i xs of
        (c : _) -> Just c
        []      -> Nothing
-- | Test whether a character is one of the lower-case vowels recognised by
-- the syllabifier (the Turkish vowels plus circumflexed variants).
--
-- Upper-case letters are not matched.
isVowel :: Char -> Bool
isVowel = (`elem` vowels)
  where
    vowels = "aeiıoöuüâêîû"
-- | Slice of a string from index @x@ (inclusive) up to index @y@
-- (exclusive), in the spirit of Java's @substring@.
--
-- Out-of-range indices are clamped rather than raising an error, and the
-- result is empty when @y <= x@.
substring :: Int -> Int -> String -> String
substring x y str = drop x prefix
  where
    prefix = take y str
-- | Split a Turkish word into its syllables.
--
-- The word is lower-cased and stripped of non-alphabetic characters before
-- it is split. An apostrophe separates independent parts (for example a
-- proper noun and its suffix); each non-empty part is syllabified on its
-- own.
--
-- The end of the first syllable is decided by the first vowel and the (up
-- to three) letters that follow it; the remainder of the word is then
-- processed recursively. A word without a vowel, or with too few letters
-- after its first vowel to decide, is returned as a single syllable.
syllablize :: String -> [String]
syllablize s
  -- Split on the apostrophe itself; recursing on a piece that still starts
  -- with the apostrophe would never terminate.
  | '\'' `elem` s = concatMap syllablize (filter (not . null) (splitOnApostrophe s))
  | otherwise     = case afterVowel of
      []                          -> [xs]        -- no vowel, or vowel is last
      (a : _)         | isVowel a -> cutAfter 1  -- V|V
      [_]                         -> [xs]
      (_ : b : _)     | isVowel b -> cutAfter 1  -- V|CV
      [_, _]                      -> [xs]
      (_ : _ : c : _) | isVowel c -> cutAfter 2  -- VC|CV
      -- Three consonants follow the vowel: normally two stay with this
      -- syllable, but these clusters keep only the first one.
      _ | take 3 afterVowel `elem` splitClusters -> cutAfter 2
        | otherwise                              -> cutAfter 3
  where
    xs = filter isAlpha (map toLower s)
    -- Everything before the first vowel, and the rest starting at it.
    (onset, vowelOnwards) = break isVowel xs
    afterVowel = drop 1 vowelOnwards
    splitClusters = ["str", "ktr", "ntr", "nsp"]
    -- End the first syllable k letters past the start of the first vowel.
    cutAfter k = syllable : syllablize rest
      where
        (syllable, rest) = splitAt (length onset + k) xs
    splitOnApostrophe str = case break (== '\'') str of
      (piece, [])       -> [piece]
      (piece, _ : more) -> piece : splitOnApostrophe more
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Enhanced version: https://github.com/joom/Guguk/blob/master/src/Guguk/Syllabification.hs
The tests for it (they all pass): https://github.com/joom/Guguk/blob/master/tests/Syllabification.hs