Created
May 5, 2022 10:40
-
-
Save soegaard/b1b680702a0c05e2f2fb961565d50dfb to your computer and use it in GitHub Desktop.
Regular Expression Matching for S-expressions
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#lang racket | |
;;; | |
;;; Regular Expression Matching for S-expressions | |
;;; | |
;; Exports | |
(provide regular-match ; the main form | |
named-groups) ; holds immutable hash table of named groups | |
;; Imports | |
(require (for-syntax racket/base syntax/parse racket/syntax racket/format)) | |
;; Literals used in the syntax of regular expressions | |
(define-syntax ? (λ (stx) (raise-syntax-error '? "used out of context" stx))) | |
(define-syntax ^ (λ (stx) (raise-syntax-error '^ "used out of context" stx))) ; begin | |
(define-syntax $ (λ (stx) (raise-syntax-error '$ "used out of context" stx))) ; end | |
(define-syntax *? (λ (stx) (raise-syntax-error '*? "used out of context" stx))) ; non-greedy star | |
(define-syntax +? (λ (stx) (raise-syntax-error '+? "used out of context" stx))) ; non-greedy plus | |
(define-syntax group (λ (stx) (raise-syntax-error 'group "used out of context" stx))) | |
(define-syntax back (λ (stx) (raise-syntax-error 'back "used out of context" stx))) | |
;; The grammar for regular expressions | |
(begin-for-syntax | |
(define-syntax-class Sequence ; aka concatenation | |
#:description "regular expression matching a sequence of regular expressions" | |
(pattern (r1:Regexp ...))) | |
(define-syntax-class Or ; aka alternation | |
#:description "regular expression matching a at least one of the regular expressions" | |
#:literals (/) | |
(pattern (/ r1:Regexp ...))) | |
(define-syntax-class Star ; aka Kleene star | |
#:description "regular expression matching zero or more occurrences: (* regexp)" | |
#:literals (*) | |
(pattern (* r:Regexp))) | |
(define-syntax-class StarNonGreedy | |
#:description "regular expression matching zero or more occurrences: (*? regexp)" | |
#:literals (*?) | |
(pattern (*? r:Regexp))) | |
(define-syntax-class Plus | |
#:description "regular expression matching one or more occurrences: (+ regexp]" | |
#:literals (+) | |
(pattern (+ r:Regexp))) | |
(define-syntax-class PlusNonGreedy | |
#:description "regular expression matching one or more occurrences: (+? regexp]" | |
#:literals (+?) | |
(pattern (+? r:Regexp))) | |
(define-syntax-class Question | |
#:description "regular expression matching zero or one occurrences: (? regexp)" | |
#:literals (?) | |
(pattern (? r:Regexp))) | |
(define-syntax-class Beginning | |
#:description "regular expression matching the beginning of the input" | |
#:literals (^) | |
(pattern ^)) | |
(define-syntax-class End | |
#:description "regular expression matching the end of the input" | |
#:literals ($) | |
(pattern $)) | |
(define-syntax-class Something | |
#:description "regular expression matching anything" | |
#:literals (_) | |
(pattern _)) | |
(define-syntax-class Group | |
#:description "named group" | |
#:literals (group) | |
(pattern (group name:id r1:Regexp ...))) | |
(define-syntax-class Back | |
#:description "back reference" | |
#:literals (back) | |
(pattern (back name:id))) | |
(define-syntax-class Literal | |
#:description "regular expression match a specific literal" | |
(pattern (~or r:number | |
r:char | |
r:string | |
r:boolean | |
r:id))) | |
(define-syntax-class Regexp | |
#:description "Regular Expression" | |
(pattern (~or r:Sequence | |
r:Or | |
r:Star | |
r:StarNonGreedy | |
r:Plus | |
r:PlusNonGreedy | |
r:Question | |
r:Beginning | |
r:End | |
r:Something | |
r:Literal | |
r:Group | |
r:Back | |
))) | |
(define-syntax-class MatchClause | |
#:description "Clause with expression to be matched and corresponding result" | |
(pattern [e:expr result:expr])) | |
(define-syntax-class ElseClause | |
#:description "Clause with expression to be use when no values matched" | |
#:literals (else) | |
(pattern [else result:expr]))) | |
;; General Utilities | |
; Example: | |
; > (for/list ([xs (in-tails '(a b c))]) xs) | |
; '((a b c) (b c) (c) ()) | |
(define (in-tails xs) | |
(make-do-sequence | |
(λ () | |
(define (pos->element p) p) | |
(define (next-position p) (if (null? p) #f (cdr p))) | |
(define initial-position xs) | |
(define (continue-with-pos? p) p) | |
(values pos->element | |
next-position | |
initial-position | |
continue-with-pos? | |
#f #f)))) | |
;; Predicates | |
(define (beginning? re) (eq? re '^)) | |
(define (end? re) (eq? re '$)) | |
(define (something? re) (eq? re '_)) | |
(define (literal? re) (or (number? re) (char? re) (string? re) (boolean? re) (symbol? re))) | |
(define (star? re) (and (pair? re) (eq? (car re) '*))) | |
(define (plus? re) (and (pair? re) (eq? (car re) '+))) | |
(define (question? re) (and (pair? re) (eq? (car re) '?))) | |
(define (or? re) (and (pair? re) (eq? (car re) '/))) | |
(define (star/ng? re) (and (pair? re) (eq? (car re) '*?))) | |
(define (plus/ng? re) (and (pair? re) (eq? (car re) '+?))) | |
(define (group? re) (and (pair? re) (eq? (car re) 'group))) | |
(define (back? re) (and (pair? re) (eq? (car re) 'back))) | |
(define (sequence/beginning? re) (and (pair? re) (beginning? (car re)))) | |
;; Accessors | |
(define (skip re) (cdr re)) ; skip first regexp in a sequence | |
(define (sub re) (cadr re)) ; get sub expression in (* re), (+ re) etc. | |
(define (alts re) (cdr re)) ; get alternatives in (or re1 re2 ...) | |
(define (group-name re) (cadr re)) | |
(define (group-sequence re) (cddr re)) | |
(define (back-name re) (cadr re)) | |
;; Parameters | |
; Uses as a way to output information on named groups | |
(define named-groups (make-parameter (make-immutable-hash))) | |
;; The Matcher | |
(define (match-anywhere res input) | |
; (displayln (list 'match-anywhere res input)) | |
; Note: We can assume that res is a legal sequence regular expression, | |
; since its syntax has been checked by syntax-parse. | |
(named-groups (make-immutable-hash)) | |
(if (sequence/beginning? res) | |
(match-here (skip res) input) | |
(for/or ([in (in-tails input)]) | |
; (display "> ") | |
(begin0 | |
(match-here res in) | |
#;(newline))))) | |
(define (match-here res input) | |
; (displayln (list 'match-here res input)) | |
(if (empty? res) input | |
(let ([re (first res)] [res (skip res)]) | |
(or (and (end? re) (empty? res) (empty? input) '()) | |
(and (something? re) (not (empty? input)) (match-here res (skip input))) | |
(and (star? re) (match-star (sub re) res input)) | |
(and (star/ng? re) (match-star/ng (sub re) res input)) | |
(and (plus? re) (match-plus (sub re) res input)) | |
(and (plus/ng? re) (match-plus/ng (sub re) res input)) | |
(and (question? re) (match-question (sub re) res input)) | |
(and (group? re) (match-group (group-name re) (group-sequence re) res input)) | |
(and (back? re) (match-back (back-name re) res input)) | |
(and (literal? re) (not (empty? input)) | |
(equal? (first input) re) (match-here res (skip input))) | |
(and (or? re) (match-or (alts re) res input)) | |
; if re is a list, it is a sequence of regular expressions | |
(and (list? re) (match-here (append re res) input)) | |
)))) | |
(define (match-star re res input) ; greedy | |
; (displayln (list 'match-star re res input)) | |
; match re*res at the beginning of input | |
(let loop ([input input]) | |
(let ([input+ (match-here (list re) input)]) | |
(or (and input+ (match-star re res input+)) | |
(match-here res input))))) | |
(define (match-star/ng re res input) ; non-greedy: shortest first | |
; (displayln (list 'match-star re res input)) | |
; match re*res at the beginning of input | |
(or (match-here res input) | |
(let ([input+ (match-here (list re) input)]) | |
(and input+ | |
(match-star re res input+))))) | |
(define (match-plus re res input) ; greedy | |
; (displayln (list 'match-plus re res input)) | |
; match re+res at the beginning of input | |
(let ([input+ (match-here (list re) input)]) | |
(and input+ | |
(match-star re res input+)))) | |
(define (match-plus/ng re res input) ; non greedy | |
; (displayln (list 'match-plus re res input)) | |
; match re+res at the beginning of input | |
(let ([input+ (match-here (list re) input)]) | |
(and input+ | |
(match-star/ng re res input+)))) | |
(define (match-question re res input) ; greedy | |
; (displayln (list 'match-question re res input)) | |
; match zero or one occurence of re at the beginning of the input | |
(let ([input+ (match-here (list re) input)]) | |
(or (and input+ (match-here res input+)) | |
(match-here res input)))) | |
(define (match-or alts res input) ; greedy | |
; (displayln (list 'match-or alts res input)) | |
; match one of the regular expressions in alts at the beginning of input | |
(if (empty? alts) | |
(match-here res input) | |
(for/or ([alt alts]) | |
(define input+ (match-here (list alt) input)) | |
(and input+ (match-here res input+))))) | |
(define (match-group name gres res input) | |
; (displayln (list 'match-group name gres res input)) | |
(define new-groups #f) | |
(define input+ (match-here gres input)) | |
(define input1 (and input+ | |
(let ([matched (take input (- (length input) (length input+)))]) | |
(parameterize ([named-groups (hash-set (named-groups) name matched)]) | |
(define input++ (match-here res input+)) | |
(when input++ | |
(set! new-groups (named-groups))) | |
input++)))) | |
(when input1 | |
(named-groups new-groups)) | |
input1) | |
(define (match-back name res input) | |
; (displayln (list 'match-back name res input)) | |
(define captured (hash-ref (named-groups) name (box #f))) | |
(if (box? captured) | |
(match-here res input) ; should a non-existant back reference give an error? | |
(let ([input+ (match-here captured input)]) ; clean captured first? | |
(and input+ | |
(match-here res input+))))) | |
(define-syntax (regular-match stx) | |
(syntax-parse stx | |
[(_regular-match r:Sequence mc:MatchClause ... ec:ElseClause) | |
(with-syntax ([([to-match result] ...) #'(mc ...)] | |
[[_ else-result] #'ec]) | |
(syntax/loc stx | |
(cond | |
[(match-anywhere 'r to-match) result] | |
... | |
[else else-result])))] | |
[(_regular-match r:Sequence mc:MatchClause ...) | |
(syntax/loc stx | |
(regular-match r mc ... [else (void)]))])) | |
;; Tests | |
; All tests are supposed to return #t. | |
#;(and | |
(regular-match (_) ['(1) #t] [else #f]) ; match something somewhere | |
(regular-match (_) ['(1 2) #t] [else #f]) ; ditto | |
(regular-match (_) ['() #f] [else #t]) ; something can't match nothing - but an empty list is not nothing | |
(regular-match (^ _) ['(1) #t] [else #f]) ; match something at the beginning | |
(regular-match (^ _) ['(1 2) #t] [else #f]) ; ditto | |
(regular-match (^ _) ['() #f] [else #t]) ; something can't match nothing | |
(regular-match (^ _ $) ['(1) #t] [else #f]) ; match something at the beginning and end | |
(regular-match (^ _ $) ['(1 2) #f] [else #t]) ; 1 is not at end, 2 is not at beginning | |
(regular-match (^ _ $) ['() #f] [else #t]) ; something can't match nothing | |
(regular-match (_ $) ['(1) #t] [else #f]) ; match something at the end | |
(regular-match (_ $) ['(1 2) #t] [else #f]) ; 1 is not at end, but 2 is | |
(regular-match (_ $) ['() #f] [else #t]) ; something can't match nothing | |
(regular-match (^ 1 $) ['(1) #t] [else #f]) ; match 1 at the beginning and end | |
(regular-match (^ 1 $) ['(2) #f] [else #t]) ; 1 does not match 2 | |
(regular-match (1 $) ['(1) #t] [else #t]) ; match 1 at the end | |
(regular-match (1 $) ['(2) #f] [else #t]) ; 1 does not match 2 | |
(regular-match (1 2 3) ['(1 2 3) #t] [else #f]) ; match 1 2 3 somewhere | |
(regular-match (1 2 3) ['(1 2 3 4) #t] [else #f]) ; match 1 2 3 somewhere | |
(regular-match (1 2 3) ['(0 1 2 3) #t] [else #f]) ; match 1 2 3 somewhere | |
(regular-match (1 2 3) ['(0 1 2 3 4) #t] [else #f]) ; match 1 2 3 somewhere | |
(regular-match (1 2 3) ['(0 1 2 5 3) #f] [else #t]) ; match 1 2 3 somewhere | |
(regular-match (^ (* 1) $) ['() #t] [else #f]) ; match 1 zero or more times | |
(regular-match (^ (* 1) $) ['(1) #t] [else #f]) ; match 1 zero or more times | |
(regular-match (^ (* 1) $) ['(1 1) #t] [else #f]) ; match 1 zero or more times | |
(regular-match (^ (* 1) $) ['(1 1 1) #t] [else #f]) ; match 1 zero or more times | |
(regular-match ((* 1) $) ['(0 1 1) #t] [else #f]) ; match 1 zero or more times at the end | |
(regular-match (^ (* 1)) ['(1 1 2) #t] [else #f]) ; match 1 zero or more times at the beginning | |
(regular-match (1 (* _) 2) ['() #f] [else #t]) ; match 1 eventually followed by 2 | |
(regular-match (1 (* _) 2) ['(1) #f] [else #t]) ; match 1 eventually followed by 2 | |
(regular-match (1 (* _) 2) ['(1 3) #f] [else #t]) ; match 1 eventually followed by 2 | |
(regular-match (1 (* _) 2) ['(1 2) #t] [else #f]) ; match 1 eventually followed by 2 | |
(regular-match (1 (* _) 2) ['(1 3 2) #t] [else #f]) ; match 1 eventually followed by 2 | |
(regular-match (1 (* _) 2) ['(1 3 3 2) #t] [else #f]) ; match 1 eventually followed by 2 | |
(regular-match (1 (* _) 2) ['(0 1 3 3 2 5) #t] [else #f]) ; match 1 eventually followed by 2 | |
(regular-match (^ (*? 1) $) ['() #t] [else #f]) ; match 1 zero or more times | |
(regular-match (^ (*? 1) $) ['(1) #t] [else #f]) ; match 1 zero or more times | |
(regular-match (^ (*? 1) $) ['(1 1) #t] [else #f]) ; match 1 zero or more times | |
(regular-match (^ (*? 1) $) ['(1 1 1) #t] [else #f]) ; match 1 zero or more times | |
(regular-match ((*? 1) $) ['(0 1 1) #t] [else #f]) ; match 1 zero or more times at the end | |
(regular-match (^ (*? 1)) ['(1 1 2) #t] [else #f]) ; match 1 zero or more times at the beginning | |
(regular-match (1 (*? _) 2) ['() #f] [else #t]) ; match 1 eventually followed by 2 | |
(regular-match (1 (*? _) 2) ['(1) #f] [else #t]) ; match 1 eventually followed by 2 | |
(regular-match (1 (*? _) 2) ['(1 3) #f] [else #t]) ; match 1 eventually followed by 2 | |
(regular-match (1 (*? _) 2) ['(1 2) #t] [else #f]) ; match 1 eventually followed by 2 | |
(regular-match (1 (*? _) 2) ['(1 3 2) #t] [else #f]) ; match 1 eventually followed by 2 | |
(regular-match (1 (*? _) 2) ['(1 3 3 2) #t] [else #f]) ; match 1 eventually followed by 2 | |
(regular-match (1 (*? _) 2) ['(0 1 3 3 2 5) #t] [else #f]) ; match 1 eventually followed by 2 | |
(regular-match (^ (+ 1) $) ['() #f] [else #t]) ; match 1 zero or more times | |
(regular-match (^ (+ 1) $) ['(1) #t] [else #f]) ; match 1 zero or more times | |
(regular-match (^ (+ 1) $) ['(2) #f] [else #t]) ; match 1 zero or more times | |
(regular-match (^ (+ 1) $) ['(1 1) #t] [else #f]) ; match 1 zero or more times | |
(regular-match (^ (+ 1) $) ['(1 1 1) #t] [else #f]) ; match 1 zero or more times | |
(regular-match ((+ 1) $) ['(0 1 1) #t] [else #f]) ; match 1 zero or more times at the end | |
(regular-match (^ (+ 1)) ['(1 1 2) #t] [else #f]) ; match 1 zero or more times at the beginning | |
(regular-match (1 (+ _) 2) ['() #f] [else #t]) ; match 1, something eventually followed by 2 | |
(regular-match (1 (+ _) 2) ['(1) #f] [else #t]) ; | |
(regular-match (1 (+ _) 2) ['(1 3) #f] [else #t]) ; | |
(regular-match (1 (+ _) 2) ['(1 2) #f] [else #t]) ; | |
(regular-match (1 (+ _) 2) ['(1 3 2) #t] [else #f]) ; | |
(regular-match (1 (+ _) 2) ['(1 3 3 2) #t] [else #f]) ; | |
(regular-match (1 (+ _) 2) ['(0 1 3 3 2 5) #t] [else #f]) ; | |
(regular-match (1 (? _) 2) ['() #f] [else #t]) ; match 1 2 maybe something between | |
(regular-match (1 (? _) 2) ['(1) #f] [else #t]) ; | |
(regular-match (1 (? _) 2) ['(1 3) #f] [else #t]) ; | |
(regular-match (1 (? _) 2) ['(1 2) #t] [else #f]) ; | |
(regular-match (1 (? _) 2) ['(1 3 2) #t] [else #f]) ; | |
(regular-match (1 (? _) 2) ['(1 3 3 2) #f] [else #t]) ; | |
(regular-match (1 (? _) 2) ['(0 1 3 3 2 5) #f] [else #t]) ; | |
(regular-match (^ (? 1) $) ['() #t] [else #f]) ; match 1 zero or one times | |
(regular-match (^ (? 1) $) ['(1) #t] [else #f]) ; match 1 zero or one times | |
(regular-match (^ (? 1) $) ['(2) #f] [else #t]) ; match 1 zero or one times | |
(regular-match (^ (? 1) $) ['(1 1) #f] [else #t]) ; match 1 zero or one times | |
(regular-match (^ (? 1) $) ['(1 1 1) #f] [else #t]) ; match 1 zero or one times | |
(regular-match ((? 1) $) ['(0 1 1) #t] [else #f]) ; match 1 zero or one times at the end | |
(regular-match ((? 1) $) ['(0 2 2) #t] [else #f]) ; match 1 zero or one times at the end | |
(regular-match (^ (? 1)) ['(1 1 2) #t] [else #f]) ; match 1 zero or one times at the beginning | |
(regular-match (^ (? 1)) ['(2 2 2) #t] [else #f]) ; match 1 zero or one times at the beginning | |
(regular-match (^ (/ 1 2) $) ['() #f] [else #t]) ; match 1 or 2 | |
(regular-match (^ (/ 1 2) $) ['(1) #t] [else #f]) ; match 1 or 2 | |
(regular-match (^ (/ 1 2) $) ['(2) #t] [else #f]) ; match 1 or 2 | |
(regular-match (^ (/ 1 2) $) ['(3) #f] [else #t]) ; match 1 or 2 | |
(regular-match (^ (/ 1 2) $) ['(1 3) #f] [else #t]) ; match 1 or 2 | |
(regular-match (^ (/ 1 2) $) ['(1 2) #f] [else #t]) ; match 1 or 2 | |
(regular-match (^ (group a 1) 2) ['(1 2) #t] [else #f]) | |
(regular-match (^ (group a (* 1)) 2 $) ['(1 1 2) #t] [else #f]) | |
(regular-match (^ (group a (* 1)) 2 (back a) $) ['(1 1 2 1 1) #t] [else #f]) ; match same number before and after 2 | |
(regular-match (^ (group a (* 1)) 2 (back a) $) ['(1 1 2 1 1 1) #f] [else #t]) ; match same number before and after 2 | |
(regular-match (^ (group a (* 1)) (group b (* 2)) (back a) (back b) $) ['(1 1 2 1 1 2) #t] [else #f]) | |
(regular-match (^ (group a (* 1)) (group b (* 2)) (back a) (back b) $) ['(1 1 2 1 1 2 2) #f] [else #t]) | |
(regular-match (^ (group a _) (group as (* (back a))) | |
(group b _) (group bs (* (back b))) | |
(back a) (back as) | |
(back b) (back bs) $) ['(1 1 2 2 2 1 1 2 2 2) #t] [else #f]) | |
(regular-match (^ (group a _) (group as (* (back a))) | |
(group b _) (group bs (* (back b))) | |
(back a) (back as) | |
(back b) (back bs) $) ['(1 1 2 2 2 1 1 2 2 2 2) #f] [else #t]) | |
; check that *? and * are non-greedy and greedy respectively | |
(equal? (match-here '((*? 1)) '(1 1 1)) '(1 1 1)) ; non greedy | |
(equal? (match-here '((* 1)) '(1 1 1)) '()) ; greedy | |
; check that +? and + are non-greedy and greedy respectively | |
(equal? (match-here '((+? 1)) '(1 1 1)) '(1 1)) ; non greedy | |
(equal? (match-here '((+ 1)) '(1 1 1)) '()) ; greedy | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment