Last active
June 19, 2021 16:36
-
-
Save AmadorMunozBerzosa/3288771aaa199cc73acc7781bff4d49e to your computer and use it in GitHub Desktop.
DSL wrapper for working with regular expressions in a readable manner in F#. Gist messes with the file ordering, but the correct file order is: Types > Evaluation > Operators > ActivePatterns > Examples
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Krow.Regex.ActivePatterns | |
open System.Text | |
let (|Regex|_|) (pattern:IRegex) input = | |
if input = null then | |
None | |
else | |
try | |
let match' = RegularExpressions.Regex.Match(input, pattern |> Regex.evaluate) | |
if match'.Success then | |
Some( List.tail [ for groups in match'.Groups -> groups.Value ]) | |
else | |
None | |
with _ -> None | |
let (|Regexs|) (pattern:IRegex) input = | |
if input = null then [] else | |
try | |
let matches = RegularExpressions.Regex.Matches(input, pattern |> Regex.evaluate) | |
[ for match' in matches do (List.tail [ for group in match'.Groups -> group.Value ]) ] | |
with e -> [] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[<AutoOpen>] | |
module Krow.Regex.Evaluation | |
open Krow.Regex.Types | |
open System.Text | |
module Regex = | |
let escape s = (RegularExpressions.Regex.Escape s).Replace("]", "\]") | |
let unescape (s:string) = (RegularExpressions.Regex.Unescape (s.Replace("\]", "]"))) | |
[<AutoOpen>] | |
module private Helpers = | |
let groupable (regex:IRegex) = | |
match regex with | |
| :? Regex.Sequence | :? Regex.OneOf -> Regex.NonCapturing regex :> IRegex | |
| _ -> regex | |
let listGroupable (regex:IRegex) = | |
match regex with | |
| :? Regex.OneOf -> Regex.NonCapturing regex :> IRegex | |
| _ -> regex | |
let rec charsetContent charSet = | |
match charSet with | |
| CharSet.OneOf chars -> | |
let string = (new string(chars |> List.toArray)).Replace("/",@"\/") | |
$@"{string}" | |
| CharSet.Range (first,last) -> | |
$@"{first}-{last}" | |
| CharSet.Multiple charSets -> | |
charSets |> List.map charsetContent |> List.reduce (+) | |
let referenceString reference = | |
match reference with | |
| Group.Positional number -> number |> string | |
| Group.Named string -> string | |
|> escape | |
let rec evaluate (regex:IRegex) = | |
match regex with | |
| :? Regex.Literal as literal -> | |
let (Regex.Literal literal) = literal | |
escape literal | |
| :? Regex.Raw as literal -> | |
let (Regex.Raw literal) = literal | |
literal | |
| :? Regex.OneOf as oneOf -> | |
let (Regex.OneOf regexs) = oneOf | |
regexs | |
|> List.map evaluate |> String.concat "|" | |
| :? Regex.Sequence as sequence -> | |
let (Regex.Sequence regexs) = sequence | |
regexs |> List.map (listGroupable >> evaluate) |> String.concat "" | |
| :? Regex.NonCapturing as grouping -> | |
let (Regex.NonCapturing regex) = grouping | |
$@"(?:{evaluate regex})" | |
| :? Mode.WithModes as withModes -> | |
let (Mode.WithModes (modes, regex)) = withModes | |
let modeChar = function | |
| Mode.CaseInsensitive -> "i" | |
| Mode.Multiline -> "m" | |
| Mode.ExplicitCapture -> "n" | |
| Mode.IgnoreUnescapedWhiteSpace -> "x" | |
let modeList = modes |> List.map modeChar |> String.concat "" | |
$@"(?{modeList}:{regex})" | |
| :? Group.Reference as reference -> | |
let string = reference |> referenceString | |
match reference with | |
| Group.Positional _ -> $@"\{string}" | |
| Group.Named _ -> $@"\k<{string}>" | |
| :? Look.Look as look -> | |
match look with | |
| Look.Ahead regex -> $@"(?={regex |> evaluate})" | |
| Look.Behind regex -> $@"(?<={regex |> evaluate})" | |
| :? Look.Negated as look -> | |
let (Look.Negated look) = look | |
match look with | |
| Look.Ahead regex -> $@"(?!{regex |> evaluate})" | |
| Look.Behind regex -> $@"(?<!{regex |> evaluate})" | |
| :? Group.Group as group -> | |
let (Group.Group(group,regex)) = group | |
let regex = regex |> evaluate | |
match group with | |
// Capturing | |
| Group.Capturing -> $@"({regex})" | |
| Group.CapturingWithName name -> $@"(?<{name}>{regex})" | |
// Non capturing | |
| Group.NonBacktrackingGrouping -> $@"(?>{regex})" | |
// Balancing | |
| Group.UnCapturing reference -> | |
$@"(?<-{reference |> referenceString}>{regex})" | |
| Group.Balancing (newName, reference) -> | |
$@"(?<{newName |> escape}-{reference |> referenceString}>{regex})" | |
| :? SpecialChar.SpecialChar as special -> | |
match special with | |
| SpecialChar.WildCard -> @"." | |
| SpecialChar.Bell -> @"\a" | |
| SpecialChar.Backspace -> @"\b" | |
| SpecialChar.Tab -> @"\t" | |
| SpecialChar.VerticalTab -> @"\v" | |
| SpecialChar.CarriageReturn -> @"\r" | |
| SpecialChar.NewLine -> @"\n" | |
| SpecialChar.Escaped -> @"\e" | |
| SpecialChar.Octal oct -> $@"\{oct}" | |
| SpecialChar.Hexadecimal hex -> $@"\x{hex}" | |
| SpecialChar.ASCII ascii -> $@"\u{ascii}" | |
| :? Anchor.Anchor as anchor -> | |
match anchor with | |
| Anchor.Start -> @"\A" | |
| Anchor.StartOfLine -> @"^" | |
| Anchor.End -> @"\z" | |
| Anchor.EndOfLine -> @"$" | |
| Anchor.Boundary -> @"\b" | |
| Anchor.NotBoundary -> @"\B" | |
| Anchor.AfterMatch -> @"\G" | |
| :? CharSet.CharSet as charSet -> | |
$"[{charsetContent charSet}]" | |
| :? CharSet.Negated as negated -> | |
let (CharSet.Negated charSet) = negated | |
$"[^{charsetContent charSet}]" | |
| :? CharClass.CharClass as charClass -> | |
match charClass with | |
| CharClass.InUnicodeBlock block -> $@"\p{{{block}}}" | |
| CharClass.LetterOrDigit -> @"\w" | |
| CharClass.WhitespaceChar -> @"\s" | |
| CharClass.Digit -> @"\d" | |
| :? CharClass.Negated as negated -> | |
let (CharClass.Negated charClass) = negated | |
match charClass with | |
| CharClass.InUnicodeBlock block -> $@"\P{{{block}}}" | |
| CharClass.LetterOrDigit -> @"\W" | |
| CharClass.WhitespaceChar -> @"\S" | |
| CharClass.Digit -> @"\D" | |
| :? Quantity.Quantified as quantified -> | |
match quantified with | |
| Quantity.Greedy (regex,quantity) -> | |
let regex = regex |> groupable |> evaluate | |
match quantity with | |
| Quantity.Exactly amount -> | |
$@"{regex}{{{amount}}}" | |
| Quantity.AtLeast amount -> | |
if amount = 0 then | |
$@"{regex}*" | |
else if amount = 1 then | |
$@"{regex}+" | |
else | |
$@"{regex}{{{amount},}}" | |
| Quantity.Between (min,max) -> | |
if min = 0 && max = 1 then | |
$@"{regex}?" | |
else | |
$@"{regex}{{{min},{max}}}" | |
| Quantity.Lazy (regex,quantity) -> | |
let greedQuantified = Quantity.Greedy(regex,quantity) |> evaluate | |
greedQuantified + "?" | |
| :? Condition.Conditional as conditional -> | |
let evaluateCondition = function | |
| Condition.Regex regex -> regex |> evaluate | |
| Condition.Reference reference -> reference |> referenceString | |
$@"(?({conditional.If |> evaluateCondition}){conditional.Then |> evaluate}|{conditional.Else |> evaluate})" | |
| _ -> failwith "Not supported" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Examples | |
open Krow.Regex | |
let bounded (regex:IRegex) = | |
Anchor.Start + regex + Anchor.End | |
let lineBounded (regex:IRegex) = | |
Anchor.StartOfLine + regex + Anchor.EndOfLine | |
let separatedList separator (regex:IRegex) = | |
regex + (separator + regex) * (0,()) | |
module Guid = | |
let hexDigit = CharSet.Range('0', '9') / CharSet.Range('a', 'f') | |
let guid = | |
Regex.Sequence [ | |
hexDigit * 8 + "-" | |
hexDigit * 4 + "-" | |
CharSet.Range('1', '5') + "-" | |
CharSet.OneOf ['8';'9';'a';'b'] | |
hexDigit * 3 + "-" | |
hexDigit * 12 | |
] | |
module Email = | |
let allowedSpecialChars = CharSet.OneOf [ | |
'!';'#';'$';'%';'&';''';'*';'+';'/';'=';'?';'^';'_';'`';'{';'|';'}';'~';'-' | |
] | |
let alphaNumeric = CharSet.Range('a','z') / CharSet.Range('0','9') | |
let alphaNumericOrHyphen = alphaNumeric / "-" | |
module Hex = | |
let group1 = | |
["01";"08";"0B";"0C";"0E";"1F";"21";"23";"5B";"5D";"7F"] | |
|> List.map (fun a -> SpecialChar.Hexadecimal a :> IRegex) | |
|> Regex.OneOf | |
let group2 = | |
["01";"09";"0B";"0C";"0E";"7F"] | |
|> List.map (fun a -> SpecialChar.Hexadecimal a :> IRegex) | |
|> Regex.OneOf | |
let part = group1 / ( @"\" + group2) | |
let name = part * (0,()) | |
let quotedName = "\"" + name + "\"" | |
module User = | |
let stringPart = (alphaNumeric / allowedSpecialChars) * (1,()) | |
let stringName = separatedList "." stringPart | |
let name = stringName / Hex.quotedName | |
module Ip = | |
let ipPart = | |
Regex.OneOf [ | |
"25" + CharSet.Range('0','5') | |
"2" + CharSet.Range('0','4') + CharSet.Range('0','9') | |
CharSet.OneOf ['0';'1'] + CharSet.Range('0','9') + CharSet.Range('0','9') | |
] | |
let hexPart = | |
alphaNumericOrHyphen * (0,()) + alphaNumeric + ":" + Hex.name | |
let lastPart = ipPart / hexPart | |
let address = "[" + (ipPart + ".") * 3 + lastPart + "]" | |
module Domain = | |
let part = (alphaNumeric * (1,())) |> separatedList "-" | |
let name = part |> separatedList "." | |
let email = bounded (User.name + "@" + (Ip.address / Domain.name)) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[<AutoOpen>] | |
module Krow.Regex.Operators | |
open Regex | |
type RegexSequence = RegexSequence with | |
static member (?<-) (RegexSequence, first:Sequence, second:Sequence) = | |
let (Sequence list1) = first | |
let (Sequence list2) = second | |
Sequence(list1 @ list2) | |
static member (?<-) (RegexSequence, first:IRegex, second:Sequence) = | |
(?<-) RegexSequence (Sequence [first]) second | |
static member (?<-) (RegexSequence, first:Sequence, second:IRegex) = | |
(?<-) RegexSequence first (Sequence [second]) | |
static member (?<-) (RegexSequence, first:IRegex, second:IRegex) = | |
(?<-) RegexSequence (Sequence [first]) (Sequence [second]) | |
static member (?<-) (RegexSequence, first:string, second:IRegex) = | |
(?<-) RegexSequence (Sequence [Literal first]) (Sequence [second]) | |
static member (?<-) (RegexSequence, first:IRegex, second:string) = | |
(?<-) RegexSequence (Sequence [first]) (Sequence [Literal second]) | |
static member inline (?<-) (RegexSequence, first, second) = | |
first + second | |
let inline (+) first second : 'R = ( (?<-) RegexSequence first second) | |
type RegexOneOf = RegexOneOf with | |
static member (?<-) (RegexOneOf, first:OneOf, second:OneOf) = | |
let (OneOf list1) = first | |
let (OneOf list2) = second | |
OneOf(list1 @ list2) | |
static member (?<-) (RegexOneOf, first:IRegex, second:OneOf) = | |
(?<-) RegexOneOf (OneOf [first]) second | |
static member (?<-) (RegexOneOf, first:OneOf, second:IRegex) = | |
(?<-) RegexOneOf first (OneOf [second]) | |
static member (?<-) (RegexOneOf, first:IRegex, second:IRegex) = | |
OneOf [first;second] | |
static member (?<-) (RegexOneOf, first:CharSet.CharSet, second:CharSet.CharSet) = | |
match first,second with | |
| CharSet.Multiple charsets1, CharSet.Multiple charsets2 -> | |
CharSet.Multiple (charsets1 @ charsets2) | |
| CharSet.Multiple charsets1, charset2 -> | |
CharSet.Multiple (charsets1 @ [charset2]) | |
| charset1, CharSet.Multiple charsets2 -> | |
CharSet.Multiple (charset1 :: charsets2) | |
| charset1, charset2 -> | |
CharSet.Multiple [charset1;charset2] | |
static member (?<-) (RegexOneOf, CharSet.Negated first, CharSet.Negated second) = | |
CharSet.Negated ((?<-) RegexOneOf first second) | |
static member (?<-) (RegexOneOf, first:string, second:IRegex) = | |
(?<-) RegexOneOf (OneOf [Literal first]) (OneOf [second]) | |
static member (?<-) (RegexOneOf, first:IRegex, second:string) = | |
(?<-) RegexOneOf (OneOf [first]) (OneOf [Literal second]) | |
static member inline (?<-) (RegexOneOf, first, second) = | |
first / second | |
let inline (/) first second : 'R = ( (?<-) RegexOneOf first second) | |
type RegexQuantification = RegexQuantification with | |
static member (?<-) (RegexQuantification, regex, quantity) = | |
Quantity.Greedy(regex, Quantity.Exactly quantity) | |
static member (?<-) (RegexQuantification, regex, quantity) = | |
Quantity.Greedy(regex, Quantity.Between quantity) | |
static member (?<-) (RegexQuantification, regex, quantity) = | |
let quantity, () = quantity | |
Quantity.Greedy(regex, Quantity.AtLeast quantity) | |
static member inline (?<-) (RegexQuantification, first, second) = | |
first * second | |
let inline ( * ) first second : 'R = ( (?<-) RegexQuantification first second) | |
type RegexLazyQuantification = RegexLazyQuantification with | |
static member (?<-) (RegexLazyQuantification, regex, quantity) = | |
Quantity.Lazy(regex, Quantity.Exactly quantity) | |
static member (?<-) (RegexLazyQuantification, regex, quantity) = | |
Quantity.Lazy(regex, Quantity.Between quantity) | |
static member (?<-) (RegexLazyQuantification, regex, quantity) = | |
let quantity, () = quantity | |
Quantity.Lazy(regex, Quantity.AtLeast quantity) | |
static member inline (?<-) (RegexLazyQuantification, first, second) = | |
first *? second | |
let inline ( *? ) first second : 'R = ( (?<-) RegexLazyQuantification first second) | |
type RegexNegation = RegexNegation with | |
static member (?<-) (RegexNegation, charClass:CharClass.CharClass, _) = | |
CharClass.Negated charClass | |
static member (?<-) (RegexNegation, charClass:CharClass.Negated, _) = | |
let (CharClass.Negated charClass) = charClass | |
charClass | |
static member (?<-) (RegexNegation, charClass:CharSet.CharSet, _) = | |
CharSet.Negated charClass | |
static member (?<-) (RegexNegation, charClass:CharSet.Negated, _) = | |
let (CharSet.Negated charClass) = charClass | |
charClass | |
static member (?<-) (RegexNegation, look:Look.Look, _) = | |
Look.Negated look | |
static member (?<-) (RegexNegation, look:Look.Negated, _) = | |
let (Look.Negated look) = look | |
look | |
static member inline (?<-) (RegexNegation, first, _) = | |
!first | |
let inline (!) first : 'R = ( (?<-) RegexNegation first ()) | |
let aaa = !(Look.Ahead (Literal "aa")) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[<AutoOpen>] | |
module Krow.Regex.Types | |
type IRegex = interface end | |
module Regex = | |
type Literal = | |
| Literal of string | |
interface IRegex | |
type Raw = | |
| Raw of string | |
interface IRegex | |
type OneOf = | |
| OneOf of IRegex list | |
interface IRegex | |
type Sequence = | |
| Sequence of IRegex list interface IRegex | |
type internal NonCapturing = | |
| NonCapturing of IRegex | |
interface IRegex | |
module Mode = | |
type Mode = | |
| CaseInsensitive | |
| Multiline | |
| ExplicitCapture | |
| IgnoreUnescapedWhiteSpace | |
type WithModes = | |
| WithModes of Mode list * IRegex | |
interface IRegex | |
module Look = | |
type Look = | |
| Ahead of IRegex | |
| Behind of IRegex | |
interface IRegex | |
type Negated = | |
|Negated of Look interface IRegex | |
module Group = | |
type Reference = | |
| Positional of int | |
| Named of string | |
interface IRegex | |
type Kind = | |
// Capturing | |
| Capturing | |
| CapturingWithName of string | |
// Non capturing | |
| NonBacktrackingGrouping | |
// Balancing | |
| UnCapturing of Reference // Balancing while omitting first arg | |
| Balancing of string * Reference | |
type Group = | |
| Group of Kind * IRegex | |
interface IRegex | |
module SpecialChar = | |
type SpecialChar = | |
| WildCard | |
| Bell | |
| Backspace | |
| Tab | |
| VerticalTab | |
| CarriageReturn | |
| NewLine | |
| Escaped | |
| Octal of string | |
| Hexadecimal of string | |
| ASCII of string | |
interface IRegex | |
module Anchor = | |
type Anchor = | |
| Start | |
| StartOfLine | |
| End | |
| EndOfLine | |
| Boundary | |
| NotBoundary | |
| AfterMatch | |
interface IRegex | |
module CharSet = | |
type CharSet = | |
| OneOf of char list | |
| Range of char * char | |
| Multiple of CharSet list | |
interface IRegex | |
type Negated = | |
| Negated of CharSet | |
interface IRegex | |
module CharClass = | |
type CharClass = | |
| InUnicodeBlock of string | |
| LetterOrDigit | |
| WhitespaceChar | |
| Digit | |
interface IRegex | |
type Negated = | |
| Negated of CharClass | |
interface IRegex | |
module Quantity = | |
type Quantity = | |
| Exactly of int | |
| AtLeast of int | |
| Between of int * int | |
type Quantified = | |
| Greedy of IRegex * Quantity | |
| Lazy of IRegex * Quantity | |
interface IRegex | |
module Condition = | |
type Condition = | |
| Regex of IRegex | |
| Reference of Group.Reference | |
type Conditional = | |
{ If: Condition; Then: IRegex; Else: IRegex } | |
interface IRegex |
With a couple of active patterns:
let (|Regex|_|) pattern input =
if input = null then None else
try
let m = RegularExpressions.Regex.Match(input, pattern)
if m.Success then Some(List.tail [ for g in m.Groups -> g.Value ])
else None
with e -> None
let (|Regexs|) pattern input =
if input = null then [| |] else
try
let ms = RegularExpressions.Regex.Matches(input, pattern)
[| for m in ms do yield (List.tail [ for g in m.Groups -> g.Value ]) |]
with e -> [| |]
then it can be used very nicely like this:
match parseErrWarnInfo with Regexs errWarnInfo r -> printfn "Matches found:\n%A" r
match parseErrWarnInfo with
| Regex errWarnInfo r -> printfn "First Match: %A" r
| _ -> printfn "No match found"
match parseErrWarnInfo with Regexs errWarnInfo r -> printfn "Matches found:\n%A" r
// Matches found:
// [|["Err"; "1"; "7"; "1"; "12"; "This shows over there as an error"];
// ["Warn"; "2"; "7"; "2"; "12"; "This shows over there as a warning"];
// ["Info"; "3"; "7"; "3"; "12"; "This shows over there as information"]|]
match parseErrWarnInfo with
| Regex errWarnInfo r -> printfn "First Match: %A" r
| _ -> printfn "No match found"
// First Match: ["Err"; "1"; "7"; "1"; "12"; "This shows over there as an error"]
Guid:
let hexDigit = InRange('0', '9') / InRange('a', 'f')
let hexDigits n = Exactly(uint32 n, hexDigit)
Sequence [
hexDigits 8 + "-"
hexDigits 4 + "-"
InRange('1', '5')
hexDigits 3 + "-"
oneOf "89ab"
hexDigits 3 + "-"
hexDigits 12
]
|> evaluate
|> printfn "%s"
// (?:[0-9]|[a-f]){8}-(?:[0-9]|[a-f]){4}-[1-5](?:[0-9]|[a-f]){3}-[89ab](?:[0-9]|[a-f]){3}-(?:[0-9]|[a-f]){12}
email:
let allowed = NotOneOfEscaped (escape "<>()[].,;:@" + evaluate WhitespaceChar) |> MoreThanOnce
let listSep sep elems = elems + ManyTimesOrNone (Literal sep + elems)
listSep "." allowed + "@" + listSep "." allowed
|> evaluate
|> printfn "%s"
in this case allowed
needed to include \s
without further escaping. For this case I added:
...
| LiteralRegex of string
| OneOfEscaped of string
| NotOneOfEscaped of string
...
| LiteralRegex rx -> rx
| OneOfEscaped string -> sprintf @"[%s]" string
| NotOneOfEscaped string -> sprintf @"[^%s]" string
...
to allow for cases not contemplated or for composing with regex from other sources.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It is working very well.
Except for
|||
that doesn't seem to overload like+
does.Literal "this" ||| Literal "that"
worksLiteral "this" ||| "that"
doesn't workI'm not sure why
So I changed to '/'. I know is not symmetrical and it doesn't seem commutative. OTOH it confers the meaning well.
For instance:
Literal "Err" / "Warn" / "Info"
is very readable.Also
RegularExpressions.Regex.Escape
doesn't escape the character]
which is a problem when doing OneOf, so I changed it to:Here is an example:
I also changed the operator for
Either
so it preserves the original order. I doesn't make a difference to the regex, but the rearranging was a little disconcerting.