Skip to content

Instantly share code, notes, and snippets.

@lattejed
Last active December 18, 2015 07:29
Show Gist options
  • Save lattejed/5747233 to your computer and use it in GitHub Desktop.
Save lattejed/5747233 to your computer and use it in GitHub Desktop.
{-
The MIT License (MIT)
Copyright (c) 2103 Matthew Smith [email protected]
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
-}
module PorterStemmer
( main
, stem
) where
import Data.Char
import Data.List
import System.Environment (getArgs)
rotateR :: [a] -> [a]
rotateR [x] = [x]
rotateR xs = last xs : init xs
toLowers :: String -> String
toLowers str = map toLower str
-- Make a pattern to match the consonants and vowels in a word where a cons is
-- not "aeiou" or y preceded by a cons e.g., "happy" -> ("happy", "cvccv")
wordDesc :: String -> (String, String)
wordDesc str =
(str, [corv (l a, l b,i) | (a,b,i) <- zip3 str (rotateR str) [0..len]])
where l = toLower
len = length str - 1
corv (a,b,i)
| a == 'y' && i /= 0 && b `notElem` vs = 'v'
| a `elem` vs = 'v'
| otherwise = 'c'
where vs = "aeiou"
-- Measure the number of consonant sequences in the word in the form
-- [c]vcvc[v] == 2 where the inner 'vc' sequences are counted.
measure :: String -> Int
measure ds =
length $ filter (=='c') ds'
where ds' = dropWhile (=='c') [head a | a <- group ds]
-- Tests if our word or stem ends with a given character
endswith :: String -> Char -> Bool
endswith str c = c == (toLower $ last str)
-- Tests if our word or stem contains a vowel
-- Notation: *v*
hasvowel :: String -> Bool
hasvowel ds = 'v' `elem` ds
-- Tests if our word or stem ends with a double consonant
-- Notation: *d
endsdblc :: (String, String) -> Bool
endsdblc (str,ds) =
last ds == 'c' && (last $ init str) == last str
-- Tests if our word or stem ends with the pattern 'cvc' and does
-- not end with the characters 'x', 'w' or 'y'
-- Notation: *o
endscvc :: (String, String) -> Bool
endscvc (str,ds) =
drop (length ds - 3) ds == "cvc" && (toLower $ last str) `notElem` "xwy"
-- Get the stem of the word (the word minus a given suffix)
getstem :: String -> String -> String
getstem str sfx = take (length str - length sfx) str
-- Replace a given suffix with another suffix
swapsfx :: String -> String -> String -> String
swapsfx str sfx [] = take (length str - length sfx) str
swapsfx str sfx sfx' = take (length str - length sfx) str ++ sfx'
{-
Step 1a
SSES -> SS
IES -> I
SS -> SS
S ->
-}
step1a :: String -> String
step1a str =
if not $ null sfxs'
then swapsfx str (fst sfx) (snd sfx)
else str
where f = \ss -> not $ fst ss `isSuffixOf` toLowers str
sfx = head $ sfxs'
sfxs' = dropWhile f sfxs
sfxs = [
("sses", "ss"),
("ies" , "i" ),
("ss" , "ss"),
("s" , "" )]
{-
Step 1b
(m>0) EED -> EE
(*v*) ED ->
(*v*) ING ->
-}
step1b :: String -> String
step1b str
| sfx "eed" && (measure $ snd $ wordDesc $ stm "eed") > 0 = swp "eed" "ee"
| sfx "ed" && (hasvowel $ snd $ wordDesc $ stm "ed") = b' $ swp "ed" ""
| sfx "ing" && (hasvowel $ snd $ wordDesc $ stm "ing") = b' $ swp "ing" ""
| otherwise = str
where stm = getstem str
sfx = (`isSuffixOf` toLowers str)
swp = swapsfx str
b' = step1b' . wordDesc
{-
Step 1b (part two)
AT -> ATE
BL -> BLE
IZ -> IZE
(*d and not (*L or *S or *Z))
(m=1 and *o) -> E
-}
step1b' :: (String, String) -> String
step1b' (str,ds)
| or $ map sfx ["at", "bl", "iz"] = e
| dbl && (not . or $ map end "lsz") = init str
| measure ds == 1 && endscvc (str,ds) = e
| otherwise = str
where sfx = (`isSuffixOf` toLowers str)
dbl = endsdblc (str,ds)
end = endswith str
e = str ++ "e"
{-
Step 1c
(*v*) Y -> I
-}
step1c :: String -> String
step1c str
| (hasvowel $ snd $ wordDesc $ stm "y") && end 'y' = init str ++ "i"
| otherwise = str
where stm = getstem str
end = endswith str
{-
Step 2
(m>0) ATIONAL -> ATE
...
(m>0) BILITI -> BLE
-}
step2 :: String -> String
step2 str =
if (not $ null sfxs') && (measure $ snd $ wordDesc $ stm (fst sfx)) > 0
then swapsfx str (fst sfx) (snd sfx)
else str
where f = \ss -> not $ fst ss `isSuffixOf` toLowers str
stm = getstem str
sfx = head $ sfxs'
sfxs' = dropWhile f sfxs
sfxs = [
("ational", "ate" ),
("tional" , "tion"),
("enci" , "ence"),
("anci" , "ance"),
("izer" , "ize" ),
("abli" , "able"),
("alli" , "al" ),
("entli" , "ent" ),
("eli" , "e" ),
("ousli" , "ous" ),
("ization", "ize" ),
("ation" , "ate" ),
("ator" , "ate" ),
("alism" , "al" ),
("iveness", "ive" ),
("fulness", "ful" ),
("ousness", "ous" ),
("aliti" , "al" ),
("iviti" , "ive" ),
("biliti" , "ble" )]
{-
Step 3
(m>0) ICATE -> IC
...
(m>0) NESS ->
-}
step3 :: String -> String
step3 str =
if (not $ null sfxs') && (measure $ snd $ wordDesc $ stm (fst sfx)) > 0
then swapsfx str (fst sfx) (snd sfx)
else str
where f = \ss -> not $ fst ss `isSuffixOf` toLowers str
stm = getstem str
sfx = head $ sfxs'
sfxs' = dropWhile f sfxs
sfxs = [
("icate", "ic"),
("ative", "" ),
("alize", "al"),
("iciti", "ic"),
("ical" , "ic"),
("ful" , "" ),
("ness" , "" )]
{-
Step 4
(m>1) AL ->
...
(m>1 and (*S or *T)) ION ->
...
(m>1) IZE ->
-}
step4 :: String -> String
step4 str =
if (not $ null sfxs') && (measure $ snd $ wordDesc $ stm (fst sfx)) > 1
then swapsfx str (fst sfx) (snd sfx)
else str
where f = \ss -> not $ fst ss `isSuffixOf` toLowers str
stm = getstem str
sfx = head $ sfxs'
sfxs' = dropWhile f sfxs
sfxs = [
("al" , "" ),
("ance" , "" ),
("ence" , "" ),
("er" , "" ),
("ic" , "" ),
("able" , "" ),
("ible" , "" ),
("ant" , "" ),
("ement", "" ),
("ment" , "" ),
("ent" , "" ),
("sion" , "s"),
("tion" , "t"),
("ou" , "" ),
("ism" , "" ),
("ate" , "" ),
("iti" , "" ),
("ous" , "" ),
("ive" , "" ),
("ize" , "" )]
{-
Step 5a
(m>1) E ->
(m=1 and not *o) E ->
-}
step5a :: String -> String
step5a str
| sfx "e" && (measure $ snd $ wordDesc $ stm "e") > 1 = swp "e" ""
| sfx "e" && (measure $ snd $ wordDesc $ stm "e") == 1 && noto = swp "e" ""
| otherwise = str
where noto = not $ endscvc $ wordDesc $ stm "e"
stm = getstem str
sfx = (`isSuffixOf` toLowers str)
swp = swapsfx str
{-
Step 5b
(m > 1 and *d and *L) ->
-}
step5b :: String -> String
step5b str =
if (measure $ snd $ wordDesc str) > 1 && dblc && ends 'l'
then init str
else str
where dblc = endsdblc $ wordDesc str
ends = endswith str
{-
Our public stemming function
-}
stem :: String -> String
stem str =
steps str
where steps = step5 . step4 . step3 . step2 . step1
step1 = step1c . step1b . step1a
step5 = step5b . step5a
{-
Command line usage:
porterstemmer.hs infile.txt outfile.txt
-}
main :: IO ()
main = do
args <- getArgs
content <- readFile $ head args
writeFile (head $ tail args) $ unlines $ map stem $ lines content
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment