Last active
August 31, 2020 02:20
-
-
Save moritz/af5f3cfcd497ab4d51baf4045c1060b9 to your computer and use it in GitHub Desktop.
Generate shortest strings that match a regex
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use v6; | |
role Generator { | |
method shortest() { ... } | |
} | |
class Literal does Generator { | |
has Str $.atom is required; | |
method shortest() { return $.atom.list } | |
} | |
class CharClass does Generator { | |
has @.options is required; | |
method shortest() { return @.options.list } | |
} | |
class Concat does Generator { | |
has @.atoms is required; | |
method shortest() { | |
my @generators = @.atoms.map: *.shortest(); | |
return list cross(@generators.map(*.list), | |
with => -> *@b { @b.join }).sort(*.chars); | |
} | |
} | |
class Alternative does Generator { | |
has @.branches is required; | |
method shortest() { | |
my @generators = @.branches.map: *.shortest(); | |
my @indexes = 0 xx @generators; | |
my @current = @generators.map: *[0]; | |
list gather while defined(any @current) { | |
my $min-idx = @current.pairs.min({.value.?chars // ∞}).key; | |
take @current[$min-idx]; | |
# advance the generator at $min-idx | |
given ++@indexes[$min-idx] { | |
@current[$min-idx] = @generators[$min-idx][$_]; | |
} | |
} | |
} | |
}; | |
class Conjunction does Generator { | |
has @.branches is required; | |
method shortest() { | |
my \f = @.branches[0].shortest; | |
return f if @.branches == 1; | |
my \sets = all @.branches[1..*].map({ set .shortest}); | |
return f.grep: -> $s { | |
$s ∈ sets | |
} | |
} | |
} | |
class Quantifier does Generator { | |
has Generator $.atom is required; | |
has Int $.min = 0; | |
has Int $.max = 1; | |
method shortest() { | |
my @gen = $.atom.shortest; | |
my @options; | |
my @res = gather { | |
take '' if $.min == 0; | |
if $.min <= 1 { | |
take $_ for @gen | |
} | |
for ($.min max 2)..$.max -> $rep { | |
my @options = @gen xx $rep; | |
take $_ for cross(|@options.map(*.list), | |
with => -> *@b { @b.join }) | |
} | |
} | |
return @res.sort(*.chars); | |
} | |
} | |
my $cc = CharClass.new( options => <a b c> ); | |
sub alt(*@options) { | |
Alternative.new( | |
branches => @options.map({ Literal.new( atom => $_ ) }), | |
) | |
} | |
my $alt2 = alt('ab', 'x'); | |
my $alt3 = alt(<ab x 123>); | |
my $alt-cc = Alternative.new( | |
branches => [ | |
Literal.new(atom => 'xy'), | |
$cc, | |
Literal.new(atom => '123'), | |
], | |
); | |
my $concat-simple = Concat.new( | |
atoms => [ | |
Literal.new(atom => 'xy'), | |
Literal.new(atom => 'Z'), | |
Literal.new(atom => 'a'), | |
], | |
); | |
my $concat-cc-alt = Concat.new( # <[abc]> X [12|3] | |
atoms => [ | |
CharClass.new( options => <a b c> ), | |
Literal.new(atom => 'X'), | |
Alternative.new(branches => [ | |
Literal.new(atom => '12'), | |
Literal.new(atom => '3'), | |
], | |
) | |
], | |
); | |
my $quant-trivial = Quantifier.new( | |
min => 0, | |
max => 3, | |
atom => Literal.new(atom => 'a'), | |
); | |
my $quant-alt = Quantifier.new( | |
min => 0, | |
max => 3, | |
atom => alt(<a xy>), | |
); | |
my $conj = Conjunction.new( | |
branches => [ | |
alt(<a aaa bbb>), | |
alt(<a aa aaa aaaa bbb>), | |
alt(<a bbb>), | |
], | |
); | |
grammar BasicRegex { | |
token literal { \w+ } | |
rule charclass { '<[' (\w+) ']>' } | |
token basic { <literal> | <charclass> | <grouped> } | |
rule grouped { | |
'[' ~ ']' <expression> | |
} | |
rule quantifier { | |
| ('?' ) | |
| ('+' ) | |
| ('*' ) | |
| ('**') $<min>=[\d+] '..' $<max>=[\d+] | |
} | |
rule quantified { | |
<basic> <quantifier>? | |
} | |
rule conjunction { | |
<quantified> + % '&' | |
} | |
rule alternative { | |
<conjunction> + % '|' | |
} | |
rule expression { | |
<alternative> + | |
} | |
rule TOP { <.ws> <expression> } | |
} | |
constant MAXINT = 42; | |
class BasicRegexActions { | |
method literal($/) { make Literal.new( atom => ~$/ ) } | |
method charclass($/) { make CharClass.new( options => $0.Str.comb ) } | |
method basic($/) { make $/.hash.values[0].ast } | |
method grouped($/) { make $<expression>.ast } | |
method quantifier($/) { | |
my %map = | |
'?' => { :min(0), :max(1) }, | |
'+' => { :min(1), :max(MAXINT) }, | |
'*' => { :min(0), :max(MAXINT) }, | |
; | |
make %map{ ~$0 } // { :min(+$<min>), :max(+$<max>) }; | |
} | |
method quantified($/) { | |
make $<quantifier> | |
?? Quantifier.new( | |
min => $<quantifier>.ast<min>, | |
max => $<quantifier>.ast<max>, | |
atom => $<basic>.ast, | |
) | |
!! $<basic>.ast; | |
} | |
method conjunction($/) { | |
make do given $<quantified> { | |
.elems == 1 | |
?? .[0].ast | |
!! Conjunction.new( branches => .map(*.ast) ); | |
} | |
} | |
method alternative($/) { | |
make do given $<conjunction> { | |
.elems == 1 | |
?? .[0].ast | |
!! Alternative.new( branches => .map(*.ast) ); | |
} | |
} | |
method expression($/) { | |
make do given $<alternative> { | |
.elems == 1 | |
?? .[0].ast | |
!! Concat.new( atoms => .map(*.ast) ); | |
} | |
} | |
method TOP($/) { make $<expression>.ast } | |
} | |
sub parse(Str() $in) { | |
my $match = BasicRegex.parse($in, :actions(BasicRegexActions.new)) | |
or die "Not a valid regex: $in"; | |
return $match.ast; | |
} | |
use Test; | |
is $cc.shortest, <a b c>, 'CharClass.shortest'; | |
is $alt2.shortest, <x ab>, 'Alternative.shortest (2)'; | |
is $alt3.shortest, <x ab 123>, 'Alternative.shortest (3)'; | |
is $alt-cc.shortest, <a b c xy 123>, 'Alternative + Char Class shortest'; | |
is $concat-simple.shortest, list('xyZa'), 'simple concat'; | |
is $concat-cc-alt.shortest, <aX3 bX3 cX3 aX12 bX12 cX12>, 'Concat with charclass and alternative'; | |
is $quant-trivial.shortest.join('|'), '|a|aa|aaa', 'trivial quantifier'; | |
is $quant-alt.shortest.join('|'), | |
'|a|xy|aa|axy|xya|aaa|xyxy|aaxy|axya|xyaa|axyxy|xyaxy|xyxya|xyxyxy', | |
'Quantified alternative'; | |
is $conj.shortest, <a bbb>, 'Conjunction'; | |
ok BasicRegex.parse('abc', :rule<literal>), 'parse literal'; | |
ok BasicRegex.parse('<[abc]>', :rule<charclass>), 'parse charclass'; | |
ok BasicRegex.parse('<[abc]>', :rule<quantified>), 'parse quantified (degenerate case)'; | |
ok BasicRegex.parse('<[abc]>+', :rule<quantified>), 'parse quantified'; | |
ok BasicRegex.parse('<[abc]>+', :rule<conjunction>), 'parse conjunction (degenerate case)'; | |
ok BasicRegex.parse('<[abc]>+ & b', :rule<conjunction>), 'parse conjunction'; | |
ok BasicRegex.parse('<[æbc]>+ | xzy', :rule<alternative>), 'Alternative'; | |
ok BasicRegex.parse('abc+ def'), 'Multiple expressions in a row'; | |
ok BasicRegex.parse('[abc <[def]>]+ | 12&12*'), 'Complex regex'; | |
my $re = parse('[<[ab]> ** 1..5 | xy] foo'); | |
is $re.shortest[0], 'afoo', 'Parse complex regex'; | |
is $re.shortest.head(10), <afoo bfoo aafoo abfoo bafoo bbfoo xyfoo aaafoo aabfoo abafoo>, | |
'Generate from parsed regex'; | |
done-testing; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment