Skip to content

Instantly share code, notes, and snippets.

@bored-engineer
Created May 19, 2020 00:30
Show Gist options
  • Save bored-engineer/89e7f9810e9f21bcb72ac7df5dd8e4a1 to your computer and use it in GitHub Desktop.
Save bored-engineer/89e7f9810e9f21bcb72ac7df5dd8e4a1 to your computer and use it in GitHub Desktop.
A go program to expand a regular expression into every possible matching string
package main
import (
"os"
"fmt"
"strings"
"regexp/syntax"
"unicode/utf8"
)
// Maximum number of repeats if unterminated (ex: *, +)
var MaxRepeats = 10
// syntax.OpNoMatch
func opNoMatch(pattern *syntax.Regexp) []string {
return nil
}
// syntax.OpEmptyMatch, syntax.OpBeginLine, syntax.OpEndLine, syntax.OpBeginText, syntax.OpEndText
func opEmpty(pattern *syntax.Regexp) []string {
return []string{""}
}
// syntax.OpLiteral
func opLiteral(pattern *syntax.Regexp) []string {
// Literal is returned as-is, just 1 value
return []string{string(pattern.Rune)}
}
// syntax.OpCharClass
func opCharClass(pattern *syntax.Regexp) []string {
// pattern.Rune is already de-dupped
var lits []string
for idx := 0; idx < len(pattern.Rune); idx += 2 {
for r := pattern.Rune[idx]; r <= pattern.Rune[idx+1]; r++ {
lits = append(lits, string(r))
}
}
return lits
}
// syntax.OpAnyCharNotNL
func opAnyCharNotNL(pattern *syntax.Regexp) []string {
lits := make([]string, utf8.MaxRune - 1)
idx := 0
for r := 0; r < utf8.MaxRune; idx++ {
if rune(r) != rune('\n') {
lits[idx] = string(rune(idx))
idx++
}
}
return lits
}
// syntax.OpAnyChar
func opAnyChar(pattern *syntax.Regexp) []string {
lits := make([]string, utf8.MaxRune)
for idx := 0; idx < int(utf8.MaxRune); idx++ {
lits[idx] = string(rune(idx))
}
return lits
}
// syntax.OpCapture
func opCapture(pattern *syntax.Regexp) ([]string, error) {
return expand(pattern.Sub[0])
}
// syntax.OpStar
func opStar(pattern *syntax.Regexp) ([]string, error) {
pattern.Min = 0
pattern.Max = MaxRepeats
return opRepeat(pattern)
}
// syntax.OpPlus
func opPlus(pattern *syntax.Regexp) ([]string, error) {
pattern.Min = 1
pattern.Max = MaxRepeats
return opRepeat(pattern)
}
// syntax.OpQuest
func opQuest(pattern *syntax.Regexp) ([]string, error) {
lits, err := expand(pattern.Sub[0])
if err != nil {
return nil, err
}
return append(lits, ""), nil
}
// syntax.OpRepeat
func opRepeat(pattern *syntax.Regexp) ([]string, error) {
lits, err := expand(pattern.Sub[0])
if err != nil {
return nil, err
}
if pattern.Max == -1 {
pattern.Max = MaxRepeats
}
rlits := make([]string, len(lits) * (pattern.Max - pattern.Min))
idx := 0
for repeat := pattern.Min; repeat < pattern.Max; repeat++ {
if repeat == 0 {
rlits[idx] = ""
idx++
rlits = rlits[:len(rlits) - len(lits) + 1]
continue
}
for _, lit := range lits {
rlits[idx] = strings.Repeat(lit, repeat)
idx++
}
}
return rlits, nil
}
// syntax.OpConcat
func opConcat(pattern *syntax.Regexp) ([]string, error) {
// Ensure unique values
uniq := make(map[string]struct{})
// recursively concat
var recur func(prefixes []string, patterns []*syntax.Regexp) error
recur = func(prefixes []string, patterns []*syntax.Regexp) error {
lits, err := expand(patterns[0])
if err != nil {
return err
}
swap := make([]string, len(lits) * len(prefixes))
idx := 0
for _, prefix := range prefixes {
for _, lit := range lits {
swap[idx] = prefix + lit
idx++
}
}
if len(patterns) <= 1 {
for _, lit := range swap {
uniq[lit] = struct{}{}
}
return nil
} else {
return recur(swap, patterns[1:])
}
}
if err := recur([]string{""}, pattern.Sub); err != nil {
return nil, err
}
lits := make([]string, len(uniq))
idx := 0
for lit := range uniq {
lits[idx] = lit
idx++
}
return lits, nil
}
// syntax.OpAlternate
func opAlternate(pattern *syntax.Regexp) ([]string, error) {
// Ensure unique values
uniq := make(map[string]struct{})
for _, sub := range pattern.Sub {
// Each alternate stands on it's own
lits, err := expand(sub)
if err != nil {
return nil, err
}
for _, lit := range lits {
uniq[lit] = struct{}{}
}
}
lits := make([]string, len(uniq))
idx := 0
for lit := range uniq {
lits[idx] = lit
idx++
}
return lits, nil
}
// expand recursively expands the possible values for a regular expression
func expand(pattern *syntax.Regexp) ([]string, error) {
switch pattern.Op {
case syntax.OpNoMatch:
return opNoMatch(pattern), nil
case syntax.OpEmptyMatch, syntax.OpBeginLine, syntax.OpEndLine, syntax.OpBeginText, syntax.OpEndText:
return opEmpty(pattern), nil
case syntax.OpLiteral:
return opLiteral(pattern), nil
case syntax.OpCharClass:
return opCharClass(pattern), nil
case syntax.OpAnyCharNotNL:
return opAnyCharNotNL(pattern), nil
case syntax.OpAnyChar:
return opAnyChar(pattern), nil
case syntax.OpCapture:
return opCapture(pattern)
case syntax.OpStar:
return opStar(pattern)
case syntax.OpPlus:
return opPlus(pattern)
case syntax.OpQuest:
return opQuest(pattern)
case syntax.OpRepeat:
return opRepeat(pattern)
case syntax.OpConcat:
return opConcat(pattern)
case syntax.OpAlternate:
return opAlternate(pattern)
default:
return nil, fmt.Errorf("unhandled pattern operation %s", pattern.Op)
}
}
// Entry Point
func main() {
if len(os.Args) < 2 {
fmt.Printf("usage: %s <regexp>", os.Args[0])
}
exp, err := syntax.Parse(os.Args[1], syntax.POSIX)
if err != nil {
fmt.Fprintf(os.Stderr, "failed to parse regexp: %s", err)
os.Exit(1)
}
// Operate on the simplified expression as it's faster
matches, err := expand(exp.Simplify())
if err != nil {
fmt.Fprintf(os.Stderr, "failed to expand regexp: %s", err)
os.Exit(1)
}
for _, match := range matches {
fmt.Println(match)
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment