Created
June 10, 2013 07:55
-
-
Save lattejed/5747158 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{- | |
Step 2 | |
(m>0) ATIONAL -> ATE | |
... | |
(m>0) BILITI -> BLE | |
-} | |
step2 :: String -> String | |
step2 str = | |
if (not $ null sfxs') && (measure $ snd $ wordDesc $ stm (fst sfx)) > 0 | |
then swapsfx str (fst sfx) (snd sfx) | |
else str | |
where f = \ss -> not $ fst ss `isSuffixOf` toLowers str | |
stm = getstem str | |
sfx = head $ sfxs' | |
sfxs' = dropWhile f sfxs | |
sfxs = [ | |
("ational", "ate" ), | |
("tional" , "tion"), | |
("enci" , "ence"), | |
("anci" , "ance"), | |
("izer" , "ize" ), | |
("abli" , "able"), | |
("alli" , "al" ), | |
("entli" , "ent" ), | |
("eli" , "e" ), | |
("ousli" , "ous" ), | |
("ization", "ize" ), | |
("ation" , "ate" ), | |
("ator" , "ate" ), | |
("alism" , "al" ), | |
("iveness", "ive" ), | |
("fulness", "ful" ), | |
("ousness", "ous" ), | |
("aliti" , "al" ), | |
("iviti" , "ive" ), | |
("biliti" , "ble" )] | |
{- | |
Step 3 | |
(m>0) ICATE -> IC | |
... | |
(m>0) NESS -> | |
-} | |
step3 :: String -> String | |
step3 str = | |
if (not $ null sfxs') && (measure $ snd $ wordDesc $ stm (fst sfx)) > 0 | |
then swapsfx str (fst sfx) (snd sfx) | |
else str | |
where f = \ss -> not $ fst ss `isSuffixOf` toLowers str | |
stm = getstem str | |
sfx = head $ sfxs' | |
sfxs' = dropWhile f sfxs | |
sfxs = [ | |
("icate", "ic"), | |
("ative", "" ), | |
("alize", "al"), | |
("iciti", "ic"), | |
("ical" , "ic"), | |
("ful" , "" ), | |
("ness" , "" )] | |
{- | |
Step 4 | |
(m>1) AL -> | |
... | |
(m>1 and (*S or *T)) ION -> | |
... | |
(m>1) IZE -> | |
-} | |
step4 :: String -> String | |
step4 str = | |
if (not $ null sfxs') && (measure $ snd $ wordDesc $ stm (fst sfx)) > 1 | |
then swapsfx str (fst sfx) (snd sfx) | |
else str | |
where f = \ss -> not $ fst ss `isSuffixOf` toLowers str | |
stm = getstem str | |
sfx = head $ sfxs' | |
sfxs' = dropWhile f sfxs | |
sfxs = [ | |
("al" , "" ), | |
("ance" , "" ), | |
("ence" , "" ), | |
("er" , "" ), | |
("ic" , "" ), | |
("able" , "" ), | |
("ible" , "" ), | |
("ant" , "" ), | |
("ement", "" ), | |
("ment" , "" ), | |
("ent" , "" ), | |
("sion" , "s"), | |
("tion" , "t"), | |
("ou" , "" ), | |
("ism" , "" ), | |
("ate" , "" ), | |
("iti" , "" ), | |
("ous" , "" ), | |
("ive" , "" ), | |
("ize" , "" )] | |
{- | |
Step 5a | |
(m>1) E -> | |
(m=1 and not *o) E -> | |
-} | |
step5a :: String -> String | |
step5a str | |
| sfx "e" && (measure $ snd $ wordDesc $ stm "e") > 1 = swp "e" "" | |
| sfx "e" && (measure $ snd $ wordDesc $ stm "e") == 1 && noto = swp "e" "" | |
| otherwise = str | |
where noto = not $ endscvc $ wordDesc $ stm "e" | |
stm = getstem str | |
sfx = (`isSuffixOf` toLowers str) | |
swp = swapsfx str | |
{- | |
Step 5b | |
(m > 1 and *d and *L) -> | |
-} | |
step5b :: String -> String | |
step5b str = | |
if (measure $ snd $ wordDesc str) > 1 && dblc && ends 'l' | |
then init str | |
else str | |
where dblc = endsdblc $ wordDesc str | |
ends = endswith str | |
{- | |
Our public stemming function | |
-} | |
stem :: String -> String | |
stem str = | |
steps str | |
where steps = step5 . step4 . step3 . step2 . step1 | |
step1 = step1c . step1b . step1a | |
step5 = step5b . step5a |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment