Created
January 30, 2020 14:39
-
-
Save gelisam/fec2fc2b1c7d77b9abd850c0e1682bfd to your computer and use it in GitHub Desktop.
A Haskell reimplementation of Scala's "direct pattern-matching on strings"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- in response to https://twitter.com/chrislpenner/status/1221784005156036608 | |
-- | |
-- The goal is to mimic this Scala code, but in Haskell: | |
-- | |
-- > "spotify:user:123:playlist:456" match { | |
-- > case s"spotify:user:$userId:playlist:$playlistId" | |
-- > => ($userId, $playlistId) // ("123", "456") | |
-- > } | |
{-# LANGUAGE DeriveFunctor, LambdaCase, PatternSynonyms, QuasiQuotes, RankNTypes, TemplateHaskell, TypeOperators, ViewPatterns #-} | |
{-# OPTIONS -Wno-name-shadowing #-} | |
import Test.DocTest | |
import Control.Category ((>>>)) | |
import Control.Lens hiding ((:<), (:>)) | |
import Control.Monad (guard) | |
import Data.Char (isDigit, isLetter) | |
import Language.Haskell.TH | |
import Language.Haskell.TH.Quote | |
-- $setup | |
-- >>> :set -XQuasiQuotes | |
-- >>> :set -XViewPatterns | |
-- The other replies mentioned pattern synonyms and quasi-quotes. Both work, but | |
-- in very different ways. | |
-- | |
-- First, let's implement some helper functions which we'll be using in both | |
-- approaches. | |
-- | | |
-- A generalization of 'break' which looks at the entire remainder of the | |
-- string, not just the next character. | |
-- | |
-- >>> splitWhen ((== 2) . length) "foobar" | |
-- Just ("foob","ar") | |
-- >>> splitWhen (\xs -> xs == reverse xs) "yolo" | |
-- Just ("y","olo") | |
splitWhen :: (String -> Bool) | |
-> String -> Maybe (String, String) | |
splitWhen p xs | p xs | |
= Just ("", xs) | |
splitWhen _ [] | |
= Nothing | |
splitWhen p (x:xs) | |
= splitWhen p xs | |
& _Just . _1 %~ (x:) | |
-- | | |
-- >>> splitOn "baz" "foobarbazquux" | |
-- Just ("foobar","quux") | |
splitOn :: String -> String -> Maybe (String, String) | |
splitOn delim | |
= splitWhen beginsWithDelim | |
>>> _Just . _2 %~ stripDelim | |
where | |
n = length delim | |
beginsWithDelim s = take n s == delim | |
stripDelim = drop n | |
-- All right, let's begin with the pattern-synonym approach, since it's the | |
-- simplest. | |
-- | |
-- The input string and the pattern happen to both use colons as delimiters. We | |
-- can thus use an infix pattern synonym which looks like a colon, and then like | |
-- in the Scala code, our pattern will look a lot like the string we want to | |
-- match. | |
-- | |
-- Ideally, I'd like to use the name @(:)@, so I can write this: | |
-- | |
-- > case "spotify:user:123:playlist:456" of | |
-- > "spotify":"user":userId:"playlist":playlistId | |
-- > -> Just (userId, playlistId) | |
-- > _ -> Nothing | |
-- | |
-- @"spotify":"user":userId:"playlist":playlistId@ has a few more double-quotes | |
-- than @s"spotify:user:$userId:playlist:$playlistId"@, but it's pretty close. | |
-- | |
-- Unfortunately, while infix pattern synonyms coincidentally _must_ begin with | |
-- a colon, we cannot use @(:)@ because it's builtin syntax. And so is @(::)@. | |
-- So I guess we have to use @(:::)@. | |
-- | | |
-- >>> :{ | |
-- case "spotify:user:123:playlist:456" of | |
-- "spotify":::"user":::userId:::"playlist":::playlistId | |
-- -> Just (userId, playlistId) | |
-- _ -> Nothing | |
-- :} | |
-- Just ("123","456") | |
pattern (:::) :: String -> String -> String | |
pattern (:::) x y <- (splitOn ":" -> Just (x, y)) | |
infixr 0 ::: | |
-- | | |
-- Now, while this trick allowed us to pattern-match on this particular input | |
-- string, the original tweet said that "Scala supports direct pattern matching | |
-- on strings", and what we've implemented is not at all the same feature. Most | |
-- obviously, the colons and the variables being bound are outside of the | |
-- string, and also there are several strings; but that's just syntax. More | |
-- importantly, there are limitations: | |
-- | |
-- 1. If we wanted to use a different separator, we'd need to implement another | |
-- pattern synonym, whereas in Scala, we'd simply write a different separator | |
-- inside the string. | |
-- 2. Doesn't work if the left-hand side is a nested @(:::)@ pattern (hence the | |
-- infixr declaration), or a string containing a colon, or a pattern variable | |
-- which is expected to match a part of the string which contains a colon. | |
-- | |
-- This last limitation seems benign since we're using colons to delimit the | |
-- entries and thus are unlikely to want a variable to be bound to a string | |
-- which contains a colon. However, consider the following example: | |
-- | |
-- > "<p>I'm feeling <b>bold</b> today!</p>" match { | |
-- > case s"<p>$x</p>" => $x // "I'm feeling <b>bold</b> today!" | |
-- > } | |
-- | |
-- While we can easily implement pattern synonyms for '<' and '>' which behave | |
-- the same way as the one for ':', this time it's clear that we do want the | |
-- variable to be bound to a string which contains some of those special | |
-- characters, as long as it does not include "</p>". I could not find an online | |
-- Scala evaluator which supported these @s""@ patterns, so I don't know whether | |
-- Scala's implementation works in the above case, but ours definitely doesn't: | |
-- | |
-- >>> :{ | |
-- case "<p>simple case works</p>" of | |
-- "":<"p":>x:<"/p":>"" | |
-- -> Just x | |
-- _ -> Nothing | |
-- :} | |
-- Just "simple case works" | |
-- | |
-- >>> :{ | |
-- case "<p>complex case does <i>not</i>!</p>" of | |
-- "":<"p":>x:<"/p":>"" | |
-- -> Just x | |
-- _ -> Nothing | |
-- :} | |
-- Nothing | |
pattern (:<) :: String -> String -> String | |
pattern (:<) x y <- (splitOn "<" -> Just (x, y)) | |
infixr 0 :< | |
pattern (:>) :: String -> String -> String | |
pattern (:>) x y <- (splitOn ">" -> Just (x, y)) | |
infixr 0 :> | |
-- Let's go back to limitation 1, the fact that we just had to define two more | |
-- pattern synonyms which are exactly the same as the first, except with a | |
-- different string. Can't we parameterize our pattern synonym implementation, | |
-- by adding an extra String argument, like this? | |
-- | |
-- > pattern (:?) :: String -> String -> String -> String | |
-- > pattern (:?) delim x y <- (splitOn delim -> Just (x, y)) | |
-- > infixr 0 :? | |
-- | |
-- No, we can't, because those Strings aren't values we can pattern-match on | |
-- inside the definition of @(:?)@, they're pattern variables which we must bind | |
-- on the right-hand side of the @<-@. Alas, pattern synonyms are not first | |
-- class in Hakell, we cannot write a function which accepts or returns a | |
-- pattern synonym. | |
-- | | |
-- ...but we do have the next best thing, (affine) Folds! They're first class, | |
-- they represent a way to check whether a sub-thing is inside a bigger thing, | |
-- they're basically first-class patterns. | |
-- | |
-- >>> :{ | |
-- case "spotify:user:123:playlist:456" of | |
-- (preview (splittedOn ":") -> Just ("spotify", | |
-- preview (splittedOn ":") -> Just ("user", | |
-- preview (splittedOn ":") -> Just (userId, | |
-- preview (splittedOn ":") -> Just ("playlist", playlistId))))) | |
-- -> Just (userId, playlistId) | |
-- _ -> Nothing | |
-- :} | |
-- Just ("123","456") | |
splittedOn :: String -> Fold String (String, String) | |
splittedOn delim = to (splitOn delim) . _Just | |
-- | | |
-- That worked, but the syntax is horribly verbose. In fact, since we had to use | |
-- ViewPatterns in order to convince ghc to let use use our first-class patterns | |
-- inside an actual pattern, we might as well have used 'splitOn' directly: | |
-- | |
-- >>> :{ | |
-- case "spotify:user:123:playlist:456" of | |
-- (splitOn ":" -> Just ("spotify", | |
-- splitOn ":" -> Just ("user", | |
-- splitOn ":" -> Just (userId, | |
-- splitOn ":" -> Just ("playlist", playlistId))))) | |
-- -> Just (userId, playlistId) | |
-- _ -> Nothing | |
-- :} | |
-- Just ("123","456") | |
-- | |
-- If we want a nice syntax, we're going to have to use the quasi-quote | |
-- approach. | |
-- The input of the quasi-quote is a string like "foo:$x:bar:$y:baz" containing | |
-- both string literals and variable names. So let's begin by parsing that | |
-- string into those pieces. | |
-- | | |
-- Haskell98 says "An identifier consists of a letter followed by zero or more | |
-- letters, digits, underscores, and single quotes." | |
-- | |
-- >>> span isIdentifierChar "foo:bar" | |
-- ("foo",":bar") | |
isIdentifierChar :: Char -> Bool | |
isIdentifierChar c | |
= isLetter c | |
|| isDigit c | |
|| (c == '_') | |
|| (c == '\'') | |
data Piece a = Lit String | Var a | |
deriving (Functor, Show) | |
makePrisms ''Piece | |
-- | | |
-- >>> splitIntoPieces "foo:$x:bar:$y:baz:$z" | |
-- [Lit "foo:",Var "x",Lit ":bar:",Var "y",Lit ":baz:",Var "z"] | |
splitIntoPieces :: String -> [Piece String] | |
splitIntoPieces "" | |
= [] | |
splitIntoPieces ('$':xs) | |
= Var var : splitIntoPieces xs' | |
where | |
(var, xs') = span isIdentifierChar xs | |
splitIntoPieces (x:xs) | |
= case splitIntoPieces xs of | |
Lit s : pieces | |
-> Lit (x:s) : pieces | |
pieces | |
-> Lit [x] : pieces | |
-- Next, let's figure out what we want to output. When the user defines | |
-- | |
-- > example1 :: String -> String | |
-- > example1 [s|foo:$x:bar:$y:baz:$z|] | |
-- > = x ++ ":" ++ y ++ ":" ++ z | |
-- > example1 _ | |
-- > = "no match" | |
-- | |
-- we want to generate the following code. | |
-- | | |
-- >>> example1 "foo:my-foo:bar:my-bar:baz:my-baz" | |
-- "my-foo:my-bar:my-baz" | |
example1 :: String -> String | |
example1 | |
( (\s -> do | |
("", s) <- splitOn "foo:" s | |
(x, s) <- splitOn ":bar:" s | |
(y, s) <- splitOn ":baz:" s | |
let z = s | |
pure (x, y, z)) | |
-> Just (x, y, z)) | |
= x ++ ":" ++ y ++ ":" ++ z | |
example1 _ | |
= "no match" | |
-- We had to do something special both at the beginning and at the end. That's | |
-- because our pattern string began with a literal and ended with a variable. | |
-- Here's what an input string which does the opposite would look like. | |
-- | |
-- > example2 :: String -> String | |
-- > example2 [s|$x:foo:$y:bar:$z:baz|] | |
-- > = x ++ ":" ++ y ++ ":" ++ z | |
-- > example2 _ | |
-- > = "no match" | |
-- | | |
-- >>> example2 "my-foo:foo:my-bar:bar:my-baz:baz" | |
-- "my-foo:my-bar:my-baz" | |
example2 :: String -> String | |
example2 | |
( (\s -> do | |
(x, s) <- splitOn ":foo:" s | |
(y, s) <- splitOn ":bar:" s | |
(z, s) <- splitOn ":baz" s | |
guard (s == "") | |
pure (x, y, z)) | |
-> Just (x, y, z)) | |
= x ++ ":" ++ y ++ ":" ++ z | |
example2 _ | |
= "no match" | |
-- And now we just have to write the TemplateHaskell code which will generate | |
-- the above. There really isn't much to say about the implementation, I am | |
-- simply pattern-matching on the pieces and generating code according to the | |
-- above two examples. | |
s :: QuasiQuoter | |
s = QuasiQuoter | |
{ quoteExp = notAllowed | |
, quotePat = generatePat | |
, quoteType = notAllowed | |
, quoteDec = notAllowed | |
} | |
where | |
notAllowed :: String -> Q a | |
notAllowed _ = do | |
error "this quasi-quoter is only allowed in patterns" | |
generatePat :: String -> Q Pat | |
generatePat s = viewPat | |
where | |
pieces :: [Piece Name] | |
pieces = splitIntoPieces s | |
& each . _Var %~ mkName | |
varNames :: [Name] | |
varNames = pieces ^.. each . _Var | |
-- (x, y, z) | |
varsE :: Q Exp | |
varsE = tupE (map varE varNames) | |
-- (x, y, z) | |
varsP :: Q Pat | |
varsP = tupP (map varP varNames) | |
-- ((\s -> ...) -> Just (x, y, z)) | |
viewPat :: Q Pat | |
viewPat = [p|($(fun) -> Just $(varsP))|] | |
-- \s -> ... | |
fun :: Q Exp | |
fun = do | |
s <- newName "s" | |
[|\ $(varP s) -> $(body s pieces)|] | |
body :: Name -> [Piece Name] -> Q Exp | |
body s = \case | |
[] -> do | |
-- guard (s == "") | |
-- pure (x, y, z) | |
[|do guard ($(varE s) == "") | |
pure $(varsE)|] | |
Lit lit : [] -> do | |
-- guard (s == lit) | |
-- pure (x, y, z) | |
[|do guard ($(varE s) == $(litE (stringL lit))) | |
pure $(varsE)|] | |
Lit lit : ps -> do | |
-- ("", s) <- splitOn lit s | |
[|splitOn $(litE (stringL lit)) | |
$(varE s) | |
>>= \("", $(varP s)) -> $(body s ps)|] | |
Var var : [] -> do | |
-- let var = s | |
-- pure (x, y, z) | |
[|do let $(varP var) = $(varE s) | |
pure $(varsE)|] | |
Var var : Lit lit : ps -> do | |
-- (var, s) <- splitOn lit s | |
[|splitOn $(litE (stringL lit)) | |
$(varE s) | |
>>= \($(varP var), $(varP s)) -> $(body s ps)|] | |
Var var1 : Var var2 : ps -> do | |
-- let var2 = "" | |
[|let $(varP var2) = "" | |
in $(body s (Var var1 : ps))|] | |
-- | | |
-- Finally, let's check that our implementation works with all of our examples | |
-- so far: | |
-- | |
-- >>> :{ | |
-- case "spotify:user:123:playlist:456" of | |
-- [s|spotify:user:$user:playlist:$playlist|] | |
-- -> Just (user, playlist) | |
-- _ -> Nothing | |
-- :} | |
-- Just ("123","456") | |
-- | |
-- >>> :{ | |
-- case "<p>I'm feeling <b>bold</b> today!</p>" of | |
-- [s|<p>$x</p>|] | |
-- -> Just x | |
-- _ -> Nothing | |
-- :} | |
-- Just "I'm feeling <b>bold</b> today!" | |
-- | |
-- >>> :{ | |
-- case "foo:my-foo:bar:my-bar:baz:my-baz" of | |
-- [s|foo:$x:bar:$y:baz:$z|] | |
-- -> x ++ ":" ++ y ++ ":" ++ z | |
-- _ -> "no match" | |
-- :} | |
-- "my-foo:my-bar:my-baz" | |
-- | |
-- >>> :{ | |
-- case "my-foo:foo:my-bar:bar:my-baz:baz" of | |
-- [s|$x:foo:$y:bar:$z:baz|] | |
-- -> x ++ ":" ++ y ++ ":" ++ z | |
-- _ -> "no match" | |
-- :} | |
-- "my-foo:my-bar:my-baz" | |
-- | |
-- And that it does something sensible in corner cases: | |
-- | |
-- case "spotify:user:123:playlist:456" of | |
-- [s|spotify:user:$user$x:playlist:$playlist$y$z|] | |
-- -> Just (user, x, playlist, y) | |
-- _ -> Nothing | |
-- :} | |
-- Just ("123","","456","","") | |
-- | |
-- >>> :{ | |
-- case "singleString" of | |
-- [s|singleString|] | |
-- -> Just () | |
-- _ -> Nothing | |
-- :} | |
-- Just () | |
-- | |
-- >>> :{ | |
-- case "my-var" of | |
-- [s|$singleVar|] | |
-- -> Just singleVar | |
-- _ -> Nothing | |
-- :} | |
-- Just "my-var" | |
-- Oh, one last thing! | |
-- | |
-- Notice that this is a gist (only one file), and yet I'm using TemplateHaskell | |
-- and QuasiQuotes! Usually, GHC's stage restriction would require us to write | |
-- this in two files: one which defines the quasi-quote, and one which uses it. | |
-- | |
-- I can get around this limitation by using doctest: the doctests are comments | |
-- and so I'm not really using the quasi-quotes at all in this file. But you can | |
-- still run them to validate that the output I wrote is the real output. | |
main :: IO () | |
main = doctest ["StringPattern.hs"] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment