Skip to content

Instantly share code, notes, and snippets.

@moritz
Last active August 31, 2020 02:20
Show Gist options
  • Save moritz/af5f3cfcd497ab4d51baf4045c1060b9 to your computer and use it in GitHub Desktop.
Save moritz/af5f3cfcd497ab4d51baf4045c1060b9 to your computer and use it in GitHub Desktop.
Generate shortest strings that match a regex
use v6;
role Generator {
method shortest() { ... }
}
class Literal does Generator {
has Str $.atom is required;
method shortest() { return $.atom.list }
}
class CharClass does Generator {
has @.options is required;
method shortest() { return @.options.list }
}
class Concat does Generator {
has @.atoms is required;
method shortest() {
my @generators = @.atoms.map: *.shortest();
return list cross(@generators.map(*.list),
with => -> *@b { @b.join }).sort(*.chars);
}
}
class Alternative does Generator {
has @.branches is required;
method shortest() {
my @generators = @.branches.map: *.shortest();
my @indexes = 0 xx @generators;
my @current = @generators.map: *[0];
list gather while defined(any @current) {
my $min-idx = @current.pairs.min({.value.?chars // ∞}).key;
take @current[$min-idx];
# advance the generator at $min-idx
given ++@indexes[$min-idx] {
@current[$min-idx] = @generators[$min-idx][$_];
}
}
}
};
class Conjunction does Generator {
has @.branches is required;
method shortest() {
my \f = @.branches[0].shortest;
return f if @.branches == 1;
my \sets = all @.branches[1..*].map({ set .shortest});
return f.grep: -> $s {
$s ∈ sets
}
}
}
class Quantifier does Generator {
has Generator $.atom is required;
has Int $.min = 0;
has Int $.max = 1;
method shortest() {
my @gen = $.atom.shortest;
my @options;
my @res = gather {
take '' if $.min == 0;
if $.min <= 1 {
take $_ for @gen
}
for ($.min max 2)..$.max -> $rep {
my @options = @gen xx $rep;
take $_ for cross(|@options.map(*.list),
with => -> *@b { @b.join })
}
}
return @res.sort(*.chars);
}
}
my $cc = CharClass.new( options => <a b c> );
sub alt(*@options) {
Alternative.new(
branches => @options.map({ Literal.new( atom => $_ ) }),
)
}
my $alt2 = alt('ab', 'x');
my $alt3 = alt(<ab x 123>);
my $alt-cc = Alternative.new(
branches => [
Literal.new(atom => 'xy'),
$cc,
Literal.new(atom => '123'),
],
);
my $concat-simple = Concat.new(
atoms => [
Literal.new(atom => 'xy'),
Literal.new(atom => 'Z'),
Literal.new(atom => 'a'),
],
);
my $concat-cc-alt = Concat.new( # <[abc]> X [12|3]
atoms => [
CharClass.new( options => <a b c> ),
Literal.new(atom => 'X'),
Alternative.new(branches => [
Literal.new(atom => '12'),
Literal.new(atom => '3'),
],
)
],
);
my $quant-trivial = Quantifier.new(
min => 0,
max => 3,
atom => Literal.new(atom => 'a'),
);
my $quant-alt = Quantifier.new(
min => 0,
max => 3,
atom => alt(<a xy>),
);
my $conj = Conjunction.new(
branches => [
alt(<a aaa bbb>),
alt(<a aa aaa aaaa bbb>),
alt(<a bbb>),
],
);
grammar BasicRegex {
token literal { \w+ }
rule charclass { '<[' (\w+) ']>' }
token basic { <literal> | <charclass> | <grouped> }
rule grouped {
'[' ~ ']' <expression>
}
rule quantifier {
| ('?' )
| ('+' )
| ('*' )
| ('**') $<min>=[\d+] '..' $<max>=[\d+]
}
rule quantified {
<basic> <quantifier>?
}
rule conjunction {
<quantified> + % '&'
}
rule alternative {
<conjunction> + % '|'
}
rule expression {
<alternative> +
}
rule TOP { <.ws> <expression> }
}
constant MAXINT = 42;
class BasicRegexActions {
method literal($/) { make Literal.new( atom => ~$/ ) }
method charclass($/) { make CharClass.new( options => $0.Str.comb ) }
method basic($/) { make $/.hash.values[0].ast }
method grouped($/) { make $<expression>.ast }
method quantifier($/) {
my %map =
'?' => { :min(0), :max(1) },
'+' => { :min(1), :max(MAXINT) },
'*' => { :min(0), :max(MAXINT) },
;
make %map{ ~$0 } // { :min(+$<min>), :max(+$<max>) };
}
method quantified($/) {
make $<quantifier>
?? Quantifier.new(
min => $<quantifier>.ast<min>,
max => $<quantifier>.ast<max>,
atom => $<basic>.ast,
)
!! $<basic>.ast;
}
method conjunction($/) {
make do given $<quantified> {
.elems == 1
?? .[0].ast
!! Conjunction.new( branches => .map(*.ast) );
}
}
method alternative($/) {
make do given $<conjunction> {
.elems == 1
?? .[0].ast
!! Alternative.new( branches => .map(*.ast) );
}
}
method expression($/) {
make do given $<alternative> {
.elems == 1
?? .[0].ast
!! Concat.new( atoms => .map(*.ast) );
}
}
method TOP($/) { make $<expression>.ast }
}
sub parse(Str() $in) {
my $match = BasicRegex.parse($in, :actions(BasicRegexActions.new))
or die "Not a valid regex: $in";
return $match.ast;
}
use Test;
is $cc.shortest, <a b c>, 'CharClass.shortest';
is $alt2.shortest, <x ab>, 'Alternative.shortest (2)';
is $alt3.shortest, <x ab 123>, 'Alternative.shortest (3)';
is $alt-cc.shortest, <a b c xy 123>, 'Alternative + Char Class shortest';
is $concat-simple.shortest, list('xyZa'), 'simple concat';
is $concat-cc-alt.shortest, <aX3 bX3 cX3 aX12 bX12 cX12>, 'Concat with charclass and alternative';
is $quant-trivial.shortest.join('|'), '|a|aa|aaa', 'trivial quantifier';
is $quant-alt.shortest.join('|'),
'|a|xy|aa|axy|xya|aaa|xyxy|aaxy|axya|xyaa|axyxy|xyaxy|xyxya|xyxyxy',
'Quantified alternative';
is $conj.shortest, <a bbb>, 'Conjunction';
ok BasicRegex.parse('abc', :rule<literal>), 'parse literal';
ok BasicRegex.parse('<[abc]>', :rule<charclass>), 'parse charclass';
ok BasicRegex.parse('<[abc]>', :rule<quantified>), 'parse quantified (degenerate case)';
ok BasicRegex.parse('<[abc]>+', :rule<quantified>), 'parse quantified';
ok BasicRegex.parse('<[abc]>+', :rule<conjunction>), 'parse conjunction (degenerate case)';
ok BasicRegex.parse('<[abc]>+ & b', :rule<conjunction>), 'parse conjunction';
ok BasicRegex.parse('<[æbc]>+ | xzy', :rule<alternative>), 'Alternative';
ok BasicRegex.parse('abc+ def'), 'Multiple expressions in a row';
ok BasicRegex.parse('[abc <[def]>]+ | 12&12*'), 'Complex regex';
my $re = parse('[<[ab]> ** 1..5 | xy] foo');
is $re.shortest[0], 'afoo', 'Parse complex regex';
is $re.shortest.head(10), <afoo bfoo aafoo abfoo bafoo bbfoo xyfoo aaafoo aabfoo abafoo>,
'Generate from parsed regex';
done-testing;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment