Skip to content

Instantly share code, notes, and snippets.

@savonarola
Last active January 2, 2016 01:59
Show Gist options
  • Save savonarola/8233874 to your computer and use it in GitHub Desktop.
Save savonarola/8233874 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl -w
use strict;
use Time::HiRes qw(time);
my $word_count = 800;
my $word_length = 8;
my $words = [map { random_word($word_length) } 1..$word_count];
my $text = random_word(1000000);
my %nodes = ();
my $tree = [];
my $CHILDREN = 0;
my $SUFFIX = 1;
my $WORD = 2;
my $FINAL = 3;
sub add_child {
my ($node, $word, $letter) = @_;
my $new_node = ($node->[$CHILDREN]->{$letter} ||= [
{},
undef,
$word,
undef,
]);
$nodes{$word} = $new_node;
return $new_node;
}
sub add_word {
my ($word) = @_;
my $cur_subtree = $tree;
my $cur_word = '';
foreach my $letter (split //, $word) {
$cur_word .= $letter;
$cur_subtree = add_child($cur_subtree, $cur_word, $letter);
}
$cur_subtree->[$FINAL] = 1;
}
sub find_longest_suffix {
my ($node) = @_;
my $suffix_node = undef;
my $word = $node->[$WORD];
foreach my $i (1..length($word)-1) {
my $suffix = substr($word, $i, length($word) - $i);
$suffix_node = $nodes{$suffix};
last if $suffix_node;
}
$node->[$SUFFIX] = $suffix_node || $tree;
}
sub find_suffixes {
my ($node) = @_;
find_longest_suffix($node);
foreach my $child_node (values %{$node->[$CHILDREN]}) {
find_suffixes($child_node);
}
}
sub build_tree {
my ($words) = @_;
$tree->[$WORD] = '';
foreach my $word (sort @$words) {
add_word($word);
}
find_suffixes($tree);
$tree->[$SUFFIX] = undef;
}
sub has_word {
my ($text) = @_;
my $cur_node = $tree;
foreach my $i (0..length($text) - 1) {
my $letter = substr($text, $i, 1);
if (my $next = $cur_node->[$CHILDREN]->{$letter}) {
$cur_node = $next;
} else {
my $suffix_node = $cur_node;
$cur_node = $tree;
while($suffix_node = $suffix_node->[$SUFFIX]) {
if(my $child_node = $suffix_node->[$CHILDREN]->{$letter}) {
$cur_node = $child_node;
last;
}
}
}
return 1 if $cur_node->[$FINAL];
}
return 0;
}
sub has_word_simple {
my ($text) = @_;
foreach my $word (@$words) {
return 1 if index($text, $word) >= 0;
}
return 0;
}
sub measure {
my ($label, $sub) = @_;
my $start = time();
my $res = $sub->();
my $end = time();
printf "%s: %.6f\n", $label, $end - $start;
return $res;
}
sub random_word {
my ($length) = @_;
my @letters = ('a' .. 'z');
return join('', map { $letters[rand @letters] } 1..$length);
}
build_tree($words);
measure( has_word => sub {
has_word($text);
});
measure( has_word_simple => sub {
has_word_simple($text);
});
__END__
# output
has_word: 0.795575
has_word_simple: 0.832028
# fuzzy test
use Data::Dumper;
$Data::Dumper::Indent = 1;
$Data::Dumper::Terse = 1;
my $counts = {};
for(1..1000) {
my $sample_text = random_word(1000);
my $has_word = has_word($sample_text);
my $has_word_simple = has_word_simple($sample_text);
die "different result! ".Dumper([$words, $tree, $sample_text, $has_word, $has_word_simple]) unless $has_word == $has_word_simple;
$counts->{$has_word} ++;
}
say Dumper($counts);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment