Created
May 19, 2020 00:30
-
-
Save bored-engineer/89e7f9810e9f21bcb72ac7df5dd8e4a1 to your computer and use it in GitHub Desktop.
A go program to expand a regular expression into every possible matching string
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"os" | |
"fmt" | |
"strings" | |
"regexp/syntax" | |
"unicode/utf8" | |
) | |
// Maximum number of repeats if unterminated (ex: *, +) | |
var MaxRepeats = 10 | |
// syntax.OpNoMatch | |
func opNoMatch(pattern *syntax.Regexp) []string { | |
return nil | |
} | |
// syntax.OpEmptyMatch, syntax.OpBeginLine, syntax.OpEndLine, syntax.OpBeginText, syntax.OpEndText | |
func opEmpty(pattern *syntax.Regexp) []string { | |
return []string{""} | |
} | |
// syntax.OpLiteral | |
func opLiteral(pattern *syntax.Regexp) []string { | |
// Literal is returned as-is, just 1 value | |
return []string{string(pattern.Rune)} | |
} | |
// syntax.OpCharClass | |
func opCharClass(pattern *syntax.Regexp) []string { | |
// pattern.Rune is already de-dupped | |
var lits []string | |
for idx := 0; idx < len(pattern.Rune); idx += 2 { | |
for r := pattern.Rune[idx]; r <= pattern.Rune[idx+1]; r++ { | |
lits = append(lits, string(r)) | |
} | |
} | |
return lits | |
} | |
// syntax.OpAnyCharNotNL | |
func opAnyCharNotNL(pattern *syntax.Regexp) []string { | |
lits := make([]string, utf8.MaxRune - 1) | |
idx := 0 | |
for r := 0; r < utf8.MaxRune; idx++ { | |
if rune(r) != rune('\n') { | |
lits[idx] = string(rune(idx)) | |
idx++ | |
} | |
} | |
return lits | |
} | |
// syntax.OpAnyChar | |
func opAnyChar(pattern *syntax.Regexp) []string { | |
lits := make([]string, utf8.MaxRune) | |
for idx := 0; idx < int(utf8.MaxRune); idx++ { | |
lits[idx] = string(rune(idx)) | |
} | |
return lits | |
} | |
// syntax.OpCapture | |
func opCapture(pattern *syntax.Regexp) ([]string, error) { | |
return expand(pattern.Sub[0]) | |
} | |
// syntax.OpStar | |
func opStar(pattern *syntax.Regexp) ([]string, error) { | |
pattern.Min = 0 | |
pattern.Max = MaxRepeats | |
return opRepeat(pattern) | |
} | |
// syntax.OpPlus | |
func opPlus(pattern *syntax.Regexp) ([]string, error) { | |
pattern.Min = 1 | |
pattern.Max = MaxRepeats | |
return opRepeat(pattern) | |
} | |
// syntax.OpQuest | |
func opQuest(pattern *syntax.Regexp) ([]string, error) { | |
lits, err := expand(pattern.Sub[0]) | |
if err != nil { | |
return nil, err | |
} | |
return append(lits, ""), nil | |
} | |
// syntax.OpRepeat | |
func opRepeat(pattern *syntax.Regexp) ([]string, error) { | |
lits, err := expand(pattern.Sub[0]) | |
if err != nil { | |
return nil, err | |
} | |
if pattern.Max == -1 { | |
pattern.Max = MaxRepeats | |
} | |
rlits := make([]string, len(lits) * (pattern.Max - pattern.Min)) | |
idx := 0 | |
for repeat := pattern.Min; repeat < pattern.Max; repeat++ { | |
if repeat == 0 { | |
rlits[idx] = "" | |
idx++ | |
rlits = rlits[:len(rlits) - len(lits) + 1] | |
continue | |
} | |
for _, lit := range lits { | |
rlits[idx] = strings.Repeat(lit, repeat) | |
idx++ | |
} | |
} | |
return rlits, nil | |
} | |
// syntax.OpConcat | |
func opConcat(pattern *syntax.Regexp) ([]string, error) { | |
// Ensure unique values | |
uniq := make(map[string]struct{}) | |
// recursively concat | |
var recur func(prefixes []string, patterns []*syntax.Regexp) error | |
recur = func(prefixes []string, patterns []*syntax.Regexp) error { | |
lits, err := expand(patterns[0]) | |
if err != nil { | |
return err | |
} | |
swap := make([]string, len(lits) * len(prefixes)) | |
idx := 0 | |
for _, prefix := range prefixes { | |
for _, lit := range lits { | |
swap[idx] = prefix + lit | |
idx++ | |
} | |
} | |
if len(patterns) <= 1 { | |
for _, lit := range swap { | |
uniq[lit] = struct{}{} | |
} | |
return nil | |
} else { | |
return recur(swap, patterns[1:]) | |
} | |
} | |
if err := recur([]string{""}, pattern.Sub); err != nil { | |
return nil, err | |
} | |
lits := make([]string, len(uniq)) | |
idx := 0 | |
for lit := range uniq { | |
lits[idx] = lit | |
idx++ | |
} | |
return lits, nil | |
} | |
// syntax.OpAlternate | |
func opAlternate(pattern *syntax.Regexp) ([]string, error) { | |
// Ensure unique values | |
uniq := make(map[string]struct{}) | |
for _, sub := range pattern.Sub { | |
// Each alternate stands on it's own | |
lits, err := expand(sub) | |
if err != nil { | |
return nil, err | |
} | |
for _, lit := range lits { | |
uniq[lit] = struct{}{} | |
} | |
} | |
lits := make([]string, len(uniq)) | |
idx := 0 | |
for lit := range uniq { | |
lits[idx] = lit | |
idx++ | |
} | |
return lits, nil | |
} | |
// expand recursively expands the possible values for a regular expression | |
func expand(pattern *syntax.Regexp) ([]string, error) { | |
switch pattern.Op { | |
case syntax.OpNoMatch: | |
return opNoMatch(pattern), nil | |
case syntax.OpEmptyMatch, syntax.OpBeginLine, syntax.OpEndLine, syntax.OpBeginText, syntax.OpEndText: | |
return opEmpty(pattern), nil | |
case syntax.OpLiteral: | |
return opLiteral(pattern), nil | |
case syntax.OpCharClass: | |
return opCharClass(pattern), nil | |
case syntax.OpAnyCharNotNL: | |
return opAnyCharNotNL(pattern), nil | |
case syntax.OpAnyChar: | |
return opAnyChar(pattern), nil | |
case syntax.OpCapture: | |
return opCapture(pattern) | |
case syntax.OpStar: | |
return opStar(pattern) | |
case syntax.OpPlus: | |
return opPlus(pattern) | |
case syntax.OpQuest: | |
return opQuest(pattern) | |
case syntax.OpRepeat: | |
return opRepeat(pattern) | |
case syntax.OpConcat: | |
return opConcat(pattern) | |
case syntax.OpAlternate: | |
return opAlternate(pattern) | |
default: | |
return nil, fmt.Errorf("unhandled pattern operation %s", pattern.Op) | |
} | |
} | |
// Entry Point | |
func main() { | |
if len(os.Args) < 2 { | |
fmt.Printf("usage: %s <regexp>", os.Args[0]) | |
} | |
exp, err := syntax.Parse(os.Args[1], syntax.POSIX) | |
if err != nil { | |
fmt.Fprintf(os.Stderr, "failed to parse regexp: %s", err) | |
os.Exit(1) | |
} | |
// Operate on the simplified expression as it's faster | |
matches, err := expand(exp.Simplify()) | |
if err != nil { | |
fmt.Fprintf(os.Stderr, "failed to expand regexp: %s", err) | |
os.Exit(1) | |
} | |
for _, match := range matches { | |
fmt.Println(match) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment