Last active
January 7, 2024 18:23
-
-
Save sgronblo/e3d73a61c5dd968b7d29 to your computer and use it in GitHub Desktop.
Roman numeral parser in Haskell using Parsec
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env stack | |
-- stack --resolver lts-7.9 --install-ghc script --package QuickCheck --package either --package parsec | |
module Roman where | |
import Control.Applicative hiding ((<|>)) | |
import Data.Either.Combinators | |
import Text.Parsec.Prim | |
import Text.Parsec.Combinator | |
import Text.Parsec.Error | |
import Text.Parsec.Char | |
import Text.Parsec.String | |
import Test.QuickCheck | |
-- Parses a roman numeral based on base low mid top, | |
-- eg genericRomanize 809 100 'C' 'D', 'M' -> "DCCC" | |
genericRomanize :: Int -> Int -> Char -> Char -> Char -> String | |
genericRomanize n base low mid top | |
| n >= 9 * base = [low, top] | |
| n >= 5 * base = mid : replicate ((n - 5 * base) `div` base) low | |
| n >= 4 * base = [low, mid] | |
| otherwise = replicate (n `div` base) low | |
romanize :: Int -> String | |
romanize n | |
| n >= 1000 = replicate (n `div` 1000) 'M' ++ romanize (n `mod` 1000) | |
| n >= 100 = genericRomanize n 100 'C' 'D' 'M' ++ romanize (n `mod` 100) | |
| n >= 10 = genericRomanize n 10 'X' 'L' 'C' ++ romanize (n `mod` 10) | |
| n >= 1 = genericRomanize n 1 'I' 'V' 'X' | |
| n == 0 = "" | |
| otherwise = error $ "invalid input to romanize " ++ show n ++ " must be non-negative" | |
type IntegerParser = Parser Int | |
-- The full parser for parsing an arabic numeral from a roman | |
parseRoman :: IntegerParser | |
parseRoman = do | |
thousand <- manyUpToN (char 'M') 3 >>= \s -> return $ length s * 1000 | |
hundred <- genericRomanParse 100 'C' 'D' 'M' | |
ten <- genericRomanParse 10 'X' 'L' 'C' | |
one <- genericRomanParse 1 'I' 'V' 'X' | |
eof | |
return $ thousand + hundred + ten + one | |
-- Combinator that matches {0,n} times in regexp terminology | |
manyUpToN :: ParsecT s u m a -> Int -> ParsecT s u m [a] | |
manyUpToN p n | |
| n <= 0 = pure [] | |
| otherwise = liftA2 (:) p (manyUpToN p (n - 1)) <|> pure [] | |
-- Parses a roman part from an arabic numeral | |
-- eg. parsing "DCCCIX" with genericRomanParse 100 'C' 'D' 'M' would parse | |
-- out 800 and leave "IX" unparsed | |
genericRomanParse :: Int -> Char -> Char -> Char -> IntegerParser | |
genericRomanParse base low mid top = try oneUnderTop <|> try overMid <|> try oneUnderMid <|> try overLow | |
where oneUnderTop = string [low, top] >> return (9 * base) | |
overMid = char mid >> manyUpToN (char low) 3 >>= (\s -> return $ (length s + 5) * base) | |
oneUnderMid = string [low, mid] >> return (4 * base) | |
overLow = manyUpToN (char low) 3 >>= (\s -> return $ length s * base) | |
arabize :: String -> Either ParseError Int | |
arabize = parse parseRoman "" | |
-- Property that checks that an arabic numeral turned into a roman numeral | |
-- turned back into an arabic numeral is the same as the original numeral. | |
-- I don't know if the Romans never needed to count higher than 3999... | |
romanizeInversal :: Property | |
romanizeInversal = forAll (choose (1, 3999)) compareRomanToArabic | |
where compareRomanToArabic n = printTestCase ceString (n == twiceConverted) | |
where romanNumeral = romanize n | |
twiceConverted = fromRight 0 (arabize romanNumeral) | |
ceString = show n ++ " -> " ++ romanNumeral ++ " -> " ++ show twiceConverted | |
main :: IO () | |
main = quickCheck romanizeInversal |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment