Created
June 13, 2018 13:44
-
-
Save lovasoa/cfa308bbd5799930f8450d571d22edc1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Data.Maybe | |
import Data.List | |
import Data.Ord | |
data DicTrie = DicTrie [(Char, DicTrie)] deriving Show | |
emptyTrie = DicTrie [] | |
mainDict = DicTrie [ | |
('e', DicTrie[ | |
('n', DicTrie[ | |
('t', emptyTrie) | |
]) | |
]), | |
('t', DicTrie[ | |
('r', DicTrie[ | |
('e', emptyTrie) | |
]) | |
]) | |
] | |
lookupTrie :: Char -> DicTrie -> Maybe DicTrie | |
lookupTrie chr (DicTrie dct) = lookup chr dct | |
type Tokens = [String] | |
pathCost :: Tokens -> (Int, Int) | |
pathCost xs = (length xs, -foldl max 0 (map length xs)) | |
tokenize :: DicTrie -> String -> Tokens | |
tokenize _ "" = [] | |
tokenize dict str = minimumBy (comparing pathCost) (map (\(prefix,rest) -> prefix : tokenize dict rest) $ prefixes dict str) | |
prefixes :: DicTrie -> String -> [(String,String)] | |
prefixes dct str = | |
let | |
(pfx, sstr) = splitPrefixAlways dct str | |
pfxs = drop 1 $ zip (inits pfx) (tails pfx) | |
in map (\(init,tail) -> (init, tail ++ sstr)) pfxs | |
splitPrefixAlways :: DicTrie -> String -> (String, String) | |
splitPrefixAlways dct str = | |
let (pfx,sstr) = splitPrefix dct str | |
in if null pfx then splitAt 1 str else (pfx,sstr) | |
splitPrefix :: DicTrie -> String -> (String, String) | |
splitPrefix dict "" = ("", "") | |
splitPrefix dict (chr:str) = | |
case lookupTrie chr dict of | |
Just subdict -> | |
let (nextPfx, sstr) = splitPrefix subdict str | |
in (chr:nextPfx, sstr) | |
Nothing -> ("", chr:str) | |
main = print $ tokenize mainDict "entraineraient tres tres bien entre les trognons qui rentrent" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment