Skip to content

Instantly share code, notes, and snippets.

@fizbin
Last active January 19, 2023 22:50
Show Gist options
  • Save fizbin/8267ff9dbc15c6564214aa4cea6d0cb0 to your computer and use it in GitHub Desktop.
Save fizbin/8267ff9dbc15c6564214aa4cea6d0cb0 to your computer and use it in GitHub Desktop.
Code to determine whether two regular expressions are equivalent, and if not to find a distinguishing string
{-# OPTIONS_GHC -Wall #-}
module Main (main) where
-- Takes two arguments in a limited regex-like language, and tells if
-- they are equivalent. This equivalence is shown by either saying
-- "there is no string which matches one regex but not the other" or
-- by giving a string which matches one but not the other.
-- (i.e. if the distinguishing string is Just "blah", then the regexes
-- aren't equivalent and onw matches "blah" and the other doesn't. If the
-- distinguishing string is Nothing, the regexes are equivalent)
-- Prints out:
-- First arg. parsed
-- Second arg. parsed
-- Result of trying to find distinguishing string assuming only "abcde"
-- is the regex alphabet
-- Result of trying to find distinguishing string assuming all of Char
-- is the regex alphabet
-- The language accepted is basically standard egrep regex syntax if the only
-- metacharacters were . * | ( ) \
-- plus then \N means Null (never match), \E means "everything" (aka .*)
-- Also, add & as a metacharacter to mean "And"
-- add ! as a postfix operator to mean "Not"
-- Note: no '[abc]' support. Use "(a|b|c)"; for [^abc] use "(.&(a|b|c)!)"
-- This is based on the concept of a "regular expression derivative"; the
-- derivative of a regular expression "r" with respect to a string "s" is
-- a regular expression "u" such that any string "t" matches "u" if and only if
-- the string "s+t" matches "r". For example, the derivative of (cat(|erpillar))
-- with respect to the string "ca" is the regex (t(|erpillar)) and the derivative
-- of the same regex with respect to "cate" is the regex (rpillar). The
-- derivative of a regex with respect to a given character is the derivative with
-- respect to the length-one string of that character.
import Control.Applicative (Alternative ((<|>)))
import Control.Monad (join)
import Data.Bifunctor (Bifunctor (bimap))
import Data.Foldable (find)
import qualified Data.Map.Merge.Strict as M
import qualified Data.Map.Strict as M
import qualified Data.Set as S
import System.Environment (getArgs)
-- import Debug.Trace
data Regex alphabet
= Null -- Never matches
| Everything -- Always matches (aka .*)
| Epsilon -- Matches empty string only
| Any -- Any single char; used for "."
| C !alphabet -- One specific char
| Star !(Regex alphabet)
| Not !(Regex alphabet)
| Seq !(Regex alphabet) !(Regex alphabet)
| Or !(Regex alphabet) !(Regex alphabet)
| And !(Regex alphabet) !(Regex alphabet)
deriving (Eq, Ord, Show, Read)
-- | Does the regex match the empty string?
nullable :: Regex a -> Bool
nullable Null = False
nullable Everything = True
nullable Epsilon = True
nullable Any = False
nullable (C _) = False
nullable (Star _) = True
nullable (Not x) = not (nullable x)
nullable (Seq a b) = nullable a && nullable b
nullable (Or a b) = nullable a || nullable b
nullable (And a b) = nullable a && nullable b
-- These mk* functions are "smart constructors"; they exist to
-- apply necessary canonicalization rules to guarantee that a
-- given regex has only a finite number of derivatives.
-- For the three constructors that take two regexes (Seq, Or, And), we make
-- sure that when there are three or more expressions joined with the same
-- constructor that the tree is always strictly right-heavy; i.e. that it's
-- like (Seq a (Seq b (Seq c (Seq d (Seq e))))), where "a", "b", "c", and "d"
-- don't start with "Seq". For "Or" and "And", we also sort the arguments.
mkNot :: Regex a -> Regex a
mkNot (Not x) = x
mkNot Null = Everything
mkNot Everything = Null
mkNot x = Not x
mkStar :: Regex a -> Regex a
mkStar Null = Epsilon
mkStar Everything = Everything
mkStar Any = Everything
mkStar Epsilon = Epsilon
mkStar (Or Epsilon x) = mkStar x
mkStar (Or Any _) = Everything
mkStar y@(Star _) = y
mkStar x = Star x
mkSeq :: Regex a -> Regex a -> Regex a
mkSeq Null _ = Null
mkSeq _ Null = Null
mkSeq Epsilon x = x
mkSeq x Epsilon = x
mkSeq (Seq x y) z = mkSeq x (mkSeq y z)
mkSeq x y = Seq x y
mkOr :: (Ord a) => Regex a -> Regex a -> Regex a
mkOr Null x = x
mkOr Everything _ = Everything
mkOr (Or x y) z = mkOr x (mkOr y z)
mkOr x y'@(Or y z) = case compare x y of
GT -> mkOr y (mkOr x z)
EQ -> y'
LT -> Or x y'
mkOr x y = case compare x y of
GT -> mkOr y x
EQ -> x
LT -> Or x y
mkAnd :: (Ord a) => Regex a -> Regex a -> Regex a
mkAnd Null _ = Null
mkAnd Everything x = x
mkAnd (And x y) z = mkAnd x (mkAnd y z)
mkAnd x y'@(And y z) = case compare x y of
GT -> mkAnd y (mkAnd x z)
EQ -> y'
LT -> And x y'
mkAnd x y = case compare x y of
GT -> mkAnd y x
EQ -> x
LT -> And x y
simplify :: Ord a => Regex a -> Regex a
simplify (Or x y) = mkOr (simplify x) (simplify y)
simplify (And x y) = mkAnd (simplify x) (simplify y)
simplify (Seq x y) = mkSeq (simplify x) (simplify y)
simplify (Star x) = mkStar (simplify x)
simplify (Not x) = mkNot (simplify x)
simplify x = x
-- Union of two maps with a merging function, using default1 when there's
-- no value in map1 and default2 when there's no value in map2
mergeWithDefault ::
Ord k =>
(a -> b -> c) ->
a ->
b ->
M.Map k a ->
M.Map k b ->
M.Map k c
mergeWithDefault mergeFn default1 default2 =
M.merge
(M.mapMissing $ const (`mergeFn` default2))
(M.mapMissing $ const (default1 `mergeFn`))
(M.zipWithMatched $ const mergeFn)
-- two function combiners that are useful:
(***) :: (a -> b) -> (c -> d) -> (a, c) -> (b, d)
(***) = bimap
(****) :: (a -> b1 -> b2) -> (c -> d1 -> d2) -> (a, c) -> (b1, d1) -> (b2, d2)
f **** g = \(a1, a2) (b1, b2) -> (f a1 b1, g a2 b2)
infixl 8 ***
infixl 8 ****
-- | Returns a derivative for "some other letter not in map",
-- and a map of letter to derivative rel. to that letter
derivatives :: (Ord a) => Regex a -> (Regex a, M.Map a (Regex a))
derivatives Any = (Epsilon, M.empty)
derivatives Epsilon = (Null, M.empty)
derivatives Null = (Null, M.empty)
derivatives Everything = (Everything, M.empty)
derivatives (C a) = (Null, M.singleton a Epsilon)
derivatives (Seq x y) =
let dxsy = (`mkSeq` y) *** M.map (`mkSeq` y) $ derivatives x
dy = derivatives y
in if nullable x
then (mkOr **** mergeWithDefault mkOr (fst dxsy) (fst dy)) dxsy dy
else dxsy
derivatives (Star x) =
(`mkSeq` Star x) *** M.map (`mkSeq` Star x) $ derivatives x
derivatives (Or x y) =
let dx = derivatives x
dy = derivatives y
in (mkOr **** mergeWithDefault mkOr (fst dx) (fst dy)) dx dy
derivatives (And x y) =
let dx = derivatives x
dy = derivatives y
in (mkAnd **** mergeWithDefault mkAnd (fst dx) (fst dy)) dx dy
derivatives (Not x) = mkNot *** M.map mkNot $ derivatives x
-- borrowed from the 'extra' package
zipWithLongest :: (Maybe a -> Maybe b -> c) -> [a] -> [b] -> [c]
zipWithLongest f l [] = (`f` Nothing) . Just <$> l
zipWithLongest f [] l = (Nothing `f`) . Just <$> l
zipWithLongest f (a : as) (b : bs) = f (Just a) (Just b) : zipWithLongest f as bs
findDistinguishingString ::
(Ord a, Show a) =>
-- | Function that finds a new character not in the given list
([a] -> Maybe a) ->
-- | Regex one
Regex a ->
-- | Regex two
Regex a ->
Maybe [a]
findDistinguishingString findNewChar reg1 reg2 =
let topWork = go S.empty reg1 reg2
in foldr (<|>) Nothing topWork
where
go seen r1 r2 | (r1, r2) `S.member` seen = []
go _ r1 r2 | r1 == r2 = []
go _ r1 r2 | nullable r1 /= nullable r2 = [Just []]
go seen r1 r2 =
let (dr1def, dr1map) = derivatives r1
(dr2def, dr2map) = derivatives r2
findrest res1 res2 =
Nothing : go ((r1, r2) `S.insert` seen) res1 res2
charxmp k res1 res2 = fmap (k :) <$> findrest res1 res2
defxmp = case findNewChar (M.keys dr1map ++ M.keys dr2map) of
Nothing -> []
Just xmpfst -> charxmp xmpfst dr1def dr2def
in foldr
(zipWithLongest (\a b -> join a <|> join b))
defxmp
( M.merge
(M.mapMissing $ \k x -> charxmp k x dr2def)
(M.mapMissing $ \k y -> charxmp k dr1def y)
(M.zipWithMatched charxmp)
dr1map
dr2map
)
findMyNewChar :: [Char] -> Maybe Char
findMyNewChar xs
| 'a' `notElem` xs = Just 'a'
| 'b' `notElem` xs = Just 'b'
| 'c' `notElem` xs = Just 'c'
| 'd' `notElem` xs = Just 'd'
| 'e' `notElem` xs = Just 'e'
| otherwise = Nothing
findMyNewChar' :: [Char] -> Maybe Char
findMyNewChar' [] = Just 'A'
findMyNewChar' (x : xs) =
let xmax = foldr max x xs
xset = S.fromList (x : xs)
in if xmax == maxBound
then find (`S.notMember` xset) [minBound .. maxBound]
else Just (succ xmax)
parse :: String -> Regex Char
parse str' = go [] [Epsilon] str'
where
unwind [] vals = vals
unwind ('&' : ops) (val1 : val2 : vals) = unwind ops (mkAnd val2 val1 : vals)
unwind ('|' : ops) (val1 : val2 : vals) = unwind ops (mkOr val2 val1 : vals)
unwind ('s' : ops) (val1 : val2 : vals) = unwind ops (mkSeq val2 val1 : vals)
unwind (c : _) _ = error ("Unknown op character " ++ [c])
go opstack valstack "" =
-- trace ("go: has " ++ show opstack ++ " " ++ show valstack) $
if 1 + length opstack /= length valstack
then error ("Error parsing " ++ str')
else head $ unwind opstack valstack
go opstack valstack ('\\' : 'N' : xs) =
go ('s' : opstack) (Null : valstack) xs
go opstack valstack ('\\' : 'E' : xs) =
go ('s' : opstack) (Everything : valstack) xs
go opstack valstack ('\\' : x : xs) =
go ('s' : opstack) (C x : valstack) xs
go opstack valstack ('(' : xs) =
go ('(' : opstack) (Epsilon : valstack) xs
go opstack valstack ('|' : xs) =
let (skipops, restops) = span (`elem` "&s") opstack
in go ('|' : restops) (Epsilon : unwind skipops valstack) xs
go opstack valstack ('&' : xs) =
let (skipops, restops) = span (`elem` "s") opstack
in go ('&' : restops) (Epsilon : unwind skipops valstack) xs
go opstack valstack (')' : xs) =
-- trace ("): has " ++ show opstack ++ " " ++ show valstack) $
let (myops, othops) = span (/= '(') opstack
in case othops of
[] -> error ("Extra closing paren in " ++ str')
(_ : rest) -> go ('s' : rest) (unwind myops valstack) xs
go opstack (v : vals) ('*' : xs) =
go opstack (mkStar v : vals) xs
go opstack (v : vals) ('!' : xs) =
go opstack (mkNot v : vals) xs
go opstack valstack ('.' : xs) =
go ('s' : opstack) (Any : valstack) xs
go opstack valstack (x : xs) =
go ('s' : opstack) (C x : valstack) xs
main :: IO ()
main = do
args' <- getArgs
let args = args' ++ [".", ".."]
args1 = head args
args2 = head $ tail args
re1 = simplify $ parse args1
re2 = simplify $ parse args2
putStrLn $ "First regex parsed: " ++ show re1
putStrLn $ "Second regex parsed: " ++ show re2
putStrLn $
"Distinguishing string if the alphabet is only ['a'..'e']: "
++ show (findDistinguishingString findMyNewChar re1 re2)
putStrLn $
"Distinguishing string if the alphabet is all of Char: "
++ show (findDistinguishingString findMyNewChar' re1 re2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment