Last active
January 19, 2023 22:50
-
-
Save fizbin/8267ff9dbc15c6564214aa4cea6d0cb0 to your computer and use it in GitHub Desktop.
Code to determine whether two regular expressions are equivalent, and if not to find a distinguishing string
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{-# OPTIONS_GHC -Wall #-} | |
module Main (main) where | |
-- Takes two arguments in a limited regex-like language, and tells if | |
-- they are equivalent. This equivalence is shown by either saying | |
-- "there is no string which matches one regex but not the other" or | |
-- by giving a string which matches one but not the other. | |
-- (i.e. if the distinguishing string is Just "blah", then the regexes | |
-- aren't equivalent and onw matches "blah" and the other doesn't. If the | |
-- distinguishing string is Nothing, the regexes are equivalent) | |
-- Prints out: | |
-- First arg. parsed | |
-- Second arg. parsed | |
-- Result of trying to find distinguishing string assuming only "abcde" | |
-- is the regex alphabet | |
-- Result of trying to find distinguishing string assuming all of Char | |
-- is the regex alphabet | |
-- The language accepted is basically standard egrep regex syntax if the only | |
-- metacharacters were . * | ( ) \ | |
-- plus then \N means Null (never match), \E means "everything" (aka .*) | |
-- Also, add & as a metacharacter to mean "And" | |
-- add ! as a postfix operator to mean "Not" | |
-- Note: no '[abc]' support. Use "(a|b|c)"; for [^abc] use "(.&(a|b|c)!)" | |
-- This is based on the concept of a "regular expression derivative"; the | |
-- derivative of a regular expression "r" with respect to a string "s" is | |
-- a regular expression "u" such that any string "t" matches "u" if and only if | |
-- the string "s+t" matches "r". For example, the derivative of (cat(|erpillar)) | |
-- with respect to the string "ca" is the regex (t(|erpillar)) and the derivative | |
-- of the same regex with respect to "cate" is the regex (rpillar). The | |
-- derivative of a regex with respect to a given character is the derivative with | |
-- respect to the length-one string of that character. | |
import Control.Applicative (Alternative ((<|>))) | |
import Control.Monad (join) | |
import Data.Bifunctor (Bifunctor (bimap)) | |
import Data.Foldable (find) | |
import qualified Data.Map.Merge.Strict as M | |
import qualified Data.Map.Strict as M | |
import qualified Data.Set as S | |
import System.Environment (getArgs) | |
-- import Debug.Trace | |
data Regex alphabet | |
= Null -- Never matches | |
| Everything -- Always matches (aka .*) | |
| Epsilon -- Matches empty string only | |
| Any -- Any single char; used for "." | |
| C !alphabet -- One specific char | |
| Star !(Regex alphabet) | |
| Not !(Regex alphabet) | |
| Seq !(Regex alphabet) !(Regex alphabet) | |
| Or !(Regex alphabet) !(Regex alphabet) | |
| And !(Regex alphabet) !(Regex alphabet) | |
deriving (Eq, Ord, Show, Read) | |
-- | Does the regex match the empty string? | |
nullable :: Regex a -> Bool | |
nullable Null = False | |
nullable Everything = True | |
nullable Epsilon = True | |
nullable Any = False | |
nullable (C _) = False | |
nullable (Star _) = True | |
nullable (Not x) = not (nullable x) | |
nullable (Seq a b) = nullable a && nullable b | |
nullable (Or a b) = nullable a || nullable b | |
nullable (And a b) = nullable a && nullable b | |
-- These mk* functions are "smart constructors"; they exist to | |
-- apply necessary canonicalization rules to guarantee that a | |
-- given regex has only a finite number of derivatives. | |
-- For the three constructors that take two regexes (Seq, Or, And), we make | |
-- sure that when there are three or more expressions joined with the same | |
-- constructor that the tree is always strictly right-heavy; i.e. that it's | |
-- like (Seq a (Seq b (Seq c (Seq d (Seq e))))), where "a", "b", "c", and "d" | |
-- don't start with "Seq". For "Or" and "And", we also sort the arguments. | |
mkNot :: Regex a -> Regex a | |
mkNot (Not x) = x | |
mkNot Null = Everything | |
mkNot Everything = Null | |
mkNot x = Not x | |
mkStar :: Regex a -> Regex a | |
mkStar Null = Epsilon | |
mkStar Everything = Everything | |
mkStar Any = Everything | |
mkStar Epsilon = Epsilon | |
mkStar (Or Epsilon x) = mkStar x | |
mkStar (Or Any _) = Everything | |
mkStar y@(Star _) = y | |
mkStar x = Star x | |
mkSeq :: Regex a -> Regex a -> Regex a | |
mkSeq Null _ = Null | |
mkSeq _ Null = Null | |
mkSeq Epsilon x = x | |
mkSeq x Epsilon = x | |
mkSeq (Seq x y) z = mkSeq x (mkSeq y z) | |
mkSeq x y = Seq x y | |
mkOr :: (Ord a) => Regex a -> Regex a -> Regex a | |
mkOr Null x = x | |
mkOr Everything _ = Everything | |
mkOr (Or x y) z = mkOr x (mkOr y z) | |
mkOr x y'@(Or y z) = case compare x y of | |
GT -> mkOr y (mkOr x z) | |
EQ -> y' | |
LT -> Or x y' | |
mkOr x y = case compare x y of | |
GT -> mkOr y x | |
EQ -> x | |
LT -> Or x y | |
mkAnd :: (Ord a) => Regex a -> Regex a -> Regex a | |
mkAnd Null _ = Null | |
mkAnd Everything x = x | |
mkAnd (And x y) z = mkAnd x (mkAnd y z) | |
mkAnd x y'@(And y z) = case compare x y of | |
GT -> mkAnd y (mkAnd x z) | |
EQ -> y' | |
LT -> And x y' | |
mkAnd x y = case compare x y of | |
GT -> mkAnd y x | |
EQ -> x | |
LT -> And x y | |
simplify :: Ord a => Regex a -> Regex a | |
simplify (Or x y) = mkOr (simplify x) (simplify y) | |
simplify (And x y) = mkAnd (simplify x) (simplify y) | |
simplify (Seq x y) = mkSeq (simplify x) (simplify y) | |
simplify (Star x) = mkStar (simplify x) | |
simplify (Not x) = mkNot (simplify x) | |
simplify x = x | |
-- Union of two maps with a merging function, using default1 when there's | |
-- no value in map1 and default2 when there's no value in map2 | |
mergeWithDefault :: | |
Ord k => | |
(a -> b -> c) -> | |
a -> | |
b -> | |
M.Map k a -> | |
M.Map k b -> | |
M.Map k c | |
mergeWithDefault mergeFn default1 default2 = | |
M.merge | |
(M.mapMissing $ const (`mergeFn` default2)) | |
(M.mapMissing $ const (default1 `mergeFn`)) | |
(M.zipWithMatched $ const mergeFn) | |
-- two function combiners that are useful: | |
(***) :: (a -> b) -> (c -> d) -> (a, c) -> (b, d) | |
(***) = bimap | |
(****) :: (a -> b1 -> b2) -> (c -> d1 -> d2) -> (a, c) -> (b1, d1) -> (b2, d2) | |
f **** g = \(a1, a2) (b1, b2) -> (f a1 b1, g a2 b2) | |
infixl 8 *** | |
infixl 8 **** | |
-- | Returns a derivative for "some other letter not in map", | |
-- and a map of letter to derivative rel. to that letter | |
derivatives :: (Ord a) => Regex a -> (Regex a, M.Map a (Regex a)) | |
derivatives Any = (Epsilon, M.empty) | |
derivatives Epsilon = (Null, M.empty) | |
derivatives Null = (Null, M.empty) | |
derivatives Everything = (Everything, M.empty) | |
derivatives (C a) = (Null, M.singleton a Epsilon) | |
derivatives (Seq x y) = | |
let dxsy = (`mkSeq` y) *** M.map (`mkSeq` y) $ derivatives x | |
dy = derivatives y | |
in if nullable x | |
then (mkOr **** mergeWithDefault mkOr (fst dxsy) (fst dy)) dxsy dy | |
else dxsy | |
derivatives (Star x) = | |
(`mkSeq` Star x) *** M.map (`mkSeq` Star x) $ derivatives x | |
derivatives (Or x y) = | |
let dx = derivatives x | |
dy = derivatives y | |
in (mkOr **** mergeWithDefault mkOr (fst dx) (fst dy)) dx dy | |
derivatives (And x y) = | |
let dx = derivatives x | |
dy = derivatives y | |
in (mkAnd **** mergeWithDefault mkAnd (fst dx) (fst dy)) dx dy | |
derivatives (Not x) = mkNot *** M.map mkNot $ derivatives x | |
-- borrowed from the 'extra' package | |
zipWithLongest :: (Maybe a -> Maybe b -> c) -> [a] -> [b] -> [c] | |
zipWithLongest f l [] = (`f` Nothing) . Just <$> l | |
zipWithLongest f [] l = (Nothing `f`) . Just <$> l | |
zipWithLongest f (a : as) (b : bs) = f (Just a) (Just b) : zipWithLongest f as bs | |
findDistinguishingString :: | |
(Ord a, Show a) => | |
-- | Function that finds a new character not in the given list | |
([a] -> Maybe a) -> | |
-- | Regex one | |
Regex a -> | |
-- | Regex two | |
Regex a -> | |
Maybe [a] | |
findDistinguishingString findNewChar reg1 reg2 = | |
let topWork = go S.empty reg1 reg2 | |
in foldr (<|>) Nothing topWork | |
where | |
go seen r1 r2 | (r1, r2) `S.member` seen = [] | |
go _ r1 r2 | r1 == r2 = [] | |
go _ r1 r2 | nullable r1 /= nullable r2 = [Just []] | |
go seen r1 r2 = | |
let (dr1def, dr1map) = derivatives r1 | |
(dr2def, dr2map) = derivatives r2 | |
findrest res1 res2 = | |
Nothing : go ((r1, r2) `S.insert` seen) res1 res2 | |
charxmp k res1 res2 = fmap (k :) <$> findrest res1 res2 | |
defxmp = case findNewChar (M.keys dr1map ++ M.keys dr2map) of | |
Nothing -> [] | |
Just xmpfst -> charxmp xmpfst dr1def dr2def | |
in foldr | |
(zipWithLongest (\a b -> join a <|> join b)) | |
defxmp | |
( M.merge | |
(M.mapMissing $ \k x -> charxmp k x dr2def) | |
(M.mapMissing $ \k y -> charxmp k dr1def y) | |
(M.zipWithMatched charxmp) | |
dr1map | |
dr2map | |
) | |
findMyNewChar :: [Char] -> Maybe Char | |
findMyNewChar xs | |
| 'a' `notElem` xs = Just 'a' | |
| 'b' `notElem` xs = Just 'b' | |
| 'c' `notElem` xs = Just 'c' | |
| 'd' `notElem` xs = Just 'd' | |
| 'e' `notElem` xs = Just 'e' | |
| otherwise = Nothing | |
findMyNewChar' :: [Char] -> Maybe Char | |
findMyNewChar' [] = Just 'A' | |
findMyNewChar' (x : xs) = | |
let xmax = foldr max x xs | |
xset = S.fromList (x : xs) | |
in if xmax == maxBound | |
then find (`S.notMember` xset) [minBound .. maxBound] | |
else Just (succ xmax) | |
parse :: String -> Regex Char | |
parse str' = go [] [Epsilon] str' | |
where | |
unwind [] vals = vals | |
unwind ('&' : ops) (val1 : val2 : vals) = unwind ops (mkAnd val2 val1 : vals) | |
unwind ('|' : ops) (val1 : val2 : vals) = unwind ops (mkOr val2 val1 : vals) | |
unwind ('s' : ops) (val1 : val2 : vals) = unwind ops (mkSeq val2 val1 : vals) | |
unwind (c : _) _ = error ("Unknown op character " ++ [c]) | |
go opstack valstack "" = | |
-- trace ("go: has " ++ show opstack ++ " " ++ show valstack) $ | |
if 1 + length opstack /= length valstack | |
then error ("Error parsing " ++ str') | |
else head $ unwind opstack valstack | |
go opstack valstack ('\\' : 'N' : xs) = | |
go ('s' : opstack) (Null : valstack) xs | |
go opstack valstack ('\\' : 'E' : xs) = | |
go ('s' : opstack) (Everything : valstack) xs | |
go opstack valstack ('\\' : x : xs) = | |
go ('s' : opstack) (C x : valstack) xs | |
go opstack valstack ('(' : xs) = | |
go ('(' : opstack) (Epsilon : valstack) xs | |
go opstack valstack ('|' : xs) = | |
let (skipops, restops) = span (`elem` "&s") opstack | |
in go ('|' : restops) (Epsilon : unwind skipops valstack) xs | |
go opstack valstack ('&' : xs) = | |
let (skipops, restops) = span (`elem` "s") opstack | |
in go ('&' : restops) (Epsilon : unwind skipops valstack) xs | |
go opstack valstack (')' : xs) = | |
-- trace ("): has " ++ show opstack ++ " " ++ show valstack) $ | |
let (myops, othops) = span (/= '(') opstack | |
in case othops of | |
[] -> error ("Extra closing paren in " ++ str') | |
(_ : rest) -> go ('s' : rest) (unwind myops valstack) xs | |
go opstack (v : vals) ('*' : xs) = | |
go opstack (mkStar v : vals) xs | |
go opstack (v : vals) ('!' : xs) = | |
go opstack (mkNot v : vals) xs | |
go opstack valstack ('.' : xs) = | |
go ('s' : opstack) (Any : valstack) xs | |
go opstack valstack (x : xs) = | |
go ('s' : opstack) (C x : valstack) xs | |
main :: IO () | |
main = do | |
args' <- getArgs | |
let args = args' ++ [".", ".."] | |
args1 = head args | |
args2 = head $ tail args | |
re1 = simplify $ parse args1 | |
re2 = simplify $ parse args2 | |
putStrLn $ "First regex parsed: " ++ show re1 | |
putStrLn $ "Second regex parsed: " ++ show re2 | |
putStrLn $ | |
"Distinguishing string if the alphabet is only ['a'..'e']: " | |
++ show (findDistinguishingString findMyNewChar re1 re2) | |
putStrLn $ | |
"Distinguishing string if the alphabet is all of Char: " | |
++ show (findDistinguishingString findMyNewChar' re1 re2) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment