Created
February 11, 2022 20:23
-
-
Save the80srobot/e4cd02afd132368c6d99bdd79baad146 to your computer and use it in GitHub Desktop.
Benchmarking some string algorithms
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package rk | |
import ( | |
"regexp" | |
"strings" | |
"testing" | |
"github.com/cloudflare/ahocorasick" | |
) | |
const PrimeRK = 16777619 | |
var ( | |
wordlist1 = []string{ | |
"mozilla", | |
"windows", | |
"firefox", | |
"1234567", | |
"current", | |
"lottery", | |
} | |
wordlist2 = []string{ | |
"pizzazz", | |
"zyzzyva", | |
"fuzzbox", | |
"pizazzy", | |
"jacuzzi", | |
"jazzman", | |
"jazzmen", | |
"jazzbos", | |
"jazzily", | |
"jazzing", | |
"zizzing", | |
"quizzed", | |
"zizzled", | |
"quizzer", | |
"quizzes", | |
"wazzock", | |
"zizzles", | |
"buzzwig", | |
"jazzers", | |
"jazzier", | |
"buzzcut", | |
"muzjiks", | |
"buzzing", | |
"buzzsaw", | |
"fuzzily", | |
"fuzzing", | |
"muzzily", | |
"schizzy", | |
"abbozzo", | |
"chazzan", | |
"chazzen", | |
"fizzily", | |
"fizzing", | |
"frizzly", | |
"jejunum", | |
"jimjams", | |
"jukebox", | |
"jumbuck", | |
"mezuzah", | |
"muzzled", | |
"puzzled", | |
"schnozz", | |
"wazzing", | |
"whizzed", | |
"zhuzhed", | |
"buzzard", | |
"fizzled", | |
"grizzly", | |
"guzzled", | |
"huzzahs", | |
"jujubes", | |
"jujuism", | |
"mizzled", | |
"muezzin", | |
"muzzler", | |
"muzzles", | |
"puzzler", | |
"puzzles", | |
"whizzer", | |
"whizzes", | |
"zhuzhes", | |
"bezique", | |
"bezzant", | |
"buzzers", | |
"buzzier", | |
"cazique", | |
"dizzily", | |
"drizzly", | |
"fizzles", | |
"frazzle", | |
"frizzed", | |
"frizzle", | |
"fuzzier", | |
"guzzler", | |
"guzzles", | |
"huzzaed", | |
"jezebel", | |
"jumpoff", | |
"karezza", | |
"mazzard", | |
"mezquit", | |
"mezuzas", | |
"mezuzot", | |
"mizzens", | |
"mizzles", | |
"muzzier", | |
"nuzzled", | |
"palazzi", | |
"palazzo", | |
"pizzles", | |
"sizzurp", | |
"skyjack", | |
"swizzle", | |
"twizzle", | |
"wizzens", | |
"zigzags", | |
"bezzies", | |
"bizzies", | |
"cozzies", | |
"dazzled", | |
"fizzers", | |
"fizzier", | |
"frizzer", | |
"frizzes", | |
"gizzard", | |
"grizzle", | |
"hazzans", | |
"jejunal", | |
"jojobas", | |
"jonquil", | |
"jujutsu", | |
"mozzies", | |
"nuzzler", | |
"nuzzles", | |
"piazzas", | |
"pozzies", | |
"prezzie", | |
"razzing", | |
"sezzing", | |
"shizzle", | |
"spazzes", | |
"squeezy", | |
"swizzes", | |
"azulejo", | |
"dazzler", | |
"dazzles", | |
"dizzied", | |
"drizzle", | |
"jackdaw", | |
"jinxing", | |
"jujitsu", | |
"jujuist", | |
"jumpcut", | |
"kibbutz", | |
"kolkhoz", | |
"kolkozy", | |
"lockjaw", | |
"nozzles", | |
"packwax", | |
"quetzal", | |
"quezals", | |
"quickly", | |
"sizzled", | |
"sozzled", | |
"ziplock", | |
"zombify", | |
"blowjob", | |
"carjack", | |
"coxcomb", | |
"dizzier", | |
"dizzies", | |
"exequys", | |
"izzards", | |
"jacking", | |
"jackleg", | |
"jackpot", | |
"jaywalk", | |
"jezails", | |
"jobwork", | |
"jumpily", | |
"jumping", | |
"junkman", | |
"junkmen", | |
"kibbitz", | |
"muzhiks", | |
"ozonize", | |
"sizzler", | |
"sizzles", | |
"sjambok", | |
"sovkhoz", | |
"squawky", | |
"squeeze", | |
"zardozi", | |
"zebecks", | |
"zinkify", | |
"zymurgy", | |
"azotize", | |
"buzukia", | |
"buzukis", | |
"chutzpa", | |
"cyclize", | |
"djibbah", | |
"enzymic", | |
"equinox", | |
"flummox", | |
"gazumps", | |
"hijacks", | |
"jabbing", | |
"jaloppy", | |
"jambing", | |
"jamming", | |
"jibbing", | |
"jibboom", | |
"jimminy", | |
"jobbing", | |
"jockeys", | |
"jockish", | |
"jubbahs", | |
"jugulum", | |
"jukskei", | |
} | |
text1 = "Mozilla Firefox, or simply Firefox, is a free and open-source[19] web browser developed by the Mozilla Foundation and its subsidiary, the Mozilla Corporation. Firefox uses the Gecko rendering engine to display web pages, which implements current and anticipated web standards" | |
text2 = ` | |
zyzzyva | |
zymurgy | |
zombify | |
zizzles | |
zizzled | |
zizzing | |
ziplock | |
zinkify | |
zigzags | |
zhuzhes | |
zhuzhed | |
zebecks | |
zardozi | |
wizzens | |
whizzes | |
whizzer | |
whizzed | |
wazzock | |
wazzing | |
twizzle | |
swizzle | |
swizzes | |
squeezy | |
squeeze | |
squawky | |
spazzes | |
sozzled | |
sovkhoz | |
skyjack | |
sjambok | |
sizzurp | |
sizzles | |
sizzler | |
sizzled | |
shizzle | |
sezzing | |
schnozz | |
schizzy | |
razzing | |
quizzes | |
quizzer | |
quizzed | |
quickly | |
quezals | |
quetzal | |
puzzles | |
puzzler | |
puzzled | |
prezzie | |
pozzies | |
pizzles | |
pizzazz | |
pizazzy | |
piazzas | |
palazzo | |
palazzi | |
packwax | |
ozonize | |
nuzzles | |
nuzzler | |
nuzzled | |
nozzles | |
muzzles | |
muzzler | |
muzzled | |
muzzily | |
muzzier | |
muzjiks | |
muzhiks | |
muezzin | |
mozzies | |
mizzles | |
mizzled | |
mizzens | |
mezuzot | |
mezuzas | |
mezuzah | |
mezquit | |
mazzard | |
lockjaw | |
kolkozy | |
kolkhoz | |
kibbutz | |
kibbitz | |
karezza | |
junkmen | |
junkman | |
jumpoff | |
jumping | |
jumpily | |
jumpcut | |
jumbuck | |
jukskei | |
jukebox | |
jujutsu | |
jujuist | |
jujuism | |
jujubes | |
jujitsu | |
jugulum | |
jubbahs | |
jonquil | |
jojobas | |
jockish | |
jockeys | |
jobwork | |
jobbing | |
jinxing | |
jimminy | |
jimjams | |
jibboom | |
jibbing | |
jezebel | |
jezails | |
jejunum | |
jejunal | |
jazzmen | |
jazzman | |
jazzing | |
jazzily | |
jazzier | |
jazzers | |
jazzbos | |
jaywalk | |
jamming | |
jambing | |
jaloppy | |
jacuzzi | |
jackpot | |
jackleg | |
jacking | |
jackdaw | |
jabbing | |
izzards | |
huzzahs | |
huzzaed | |
hijacks | |
hazzans | |
guzzles | |
guzzler | |
guzzled | |
grizzly | |
grizzle | |
gizzard | |
gazumps | |
fuzzing | |
fuzzily | |
fuzzier | |
fuzzbox | |
frizzly | |
frizzle | |
frizzes | |
frizzer | |
frizzed | |
frazzle | |
flummox | |
fizzles | |
fizzled | |
fizzing | |
fizzily | |
fizzier | |
fizzers | |
exequys | |
equinox | |
enzymic | |
drizzly | |
drizzle | |
djibbah | |
dizzily | |
dizzies | |
dizzier | |
dizzied | |
dazzles | |
dazzler | |
dazzled | |
cyclize | |
cozzies | |
coxcomb | |
chutzpa | |
chazzen | |
chazzan | |
cazique | |
carjack | |
buzzwig | |
buzzsaw | |
buzzing | |
buzzier | |
buzzers | |
buzzcut | |
buzzard | |
buzukis | |
buzukia | |
blowjob | |
bizzies | |
bezzies | |
bezzant | |
bezique | |
azulejo | |
azotize | |
abbozzo | |
` | |
) | |
func HashStr(sep string) (uint32, uint32) { | |
hash := uint32(0) | |
for i := 0; i < len(sep); i++ { | |
hash = hash*PrimeRK + uint32(sep[i]) | |
} | |
var pow, sq uint32 = 1, PrimeRK | |
for i := len(sep); i > 0; i >>= 1 { | |
if i&1 != 0 { | |
pow *= sq | |
} | |
sq *= sq | |
} | |
return hash, pow | |
} | |
func RabinKarpBake(needles []string) (set map[string]struct{}, hset map[uint32]struct{}, pow uint32) { | |
hset = make(map[uint32]struct{}, len(needles)) | |
set = make(map[string]struct{}, len(needles)) | |
for _, s := range needles { | |
var h uint32 | |
h, pow = HashStr(s) | |
hset[h] = struct{}{} | |
set[s] = struct{}{} | |
} | |
return set, hset, pow | |
} | |
func IndexRabinKarpMultiple(s string, needles map[string]struct{}, hset map[uint32]struct{}, n int, pow uint32) int { | |
// Rabin-Karp search | |
var h uint32 | |
for i := 0; i < n; i++ { | |
h = h*PrimeRK + uint32(s[i]) | |
} | |
if _, ok := hset[h]; ok { | |
if _, ok := needles[s[:n]]; ok { | |
return 0 | |
} | |
} | |
for i := n; i < len(s); { | |
h *= PrimeRK | |
h += uint32(s[i]) | |
h -= pow * uint32(s[i-n]) | |
i++ | |
if _, ok := hset[h]; ok { | |
if _, ok := needles[s[i-n:i]]; ok { | |
return i - n | |
} | |
} | |
} | |
return -1 | |
} | |
func naiveIndexAnySubstring(s string, needles []string) int { | |
res := 0 | |
for _, needle := range needles { | |
idx := strings.Index(s, needle) | |
if idx >= 0 && idx < res { | |
res = idx | |
} | |
} | |
return res | |
} | |
func benchmarkRK(b *testing.B, s string, needles []string) { | |
set, hset, pow := RabinKarpBake(needles) | |
n := len(needles[0]) | |
b.ResetTimer() | |
for i := 0; i < b.N; i++ { | |
IndexRabinKarpMultiple(s, set, hset, n, pow) | |
} | |
} | |
func benchmarkNaiveSearch(b *testing.B, s string, needles []string) { | |
for i := 0; i < b.N; i++ { | |
naiveIndexAnySubstring(s, needles) | |
} | |
} | |
func benchmarkRegexSearch(b *testing.B, s string, needles []string) { | |
re := regexp.MustCompile(strings.Join(needles, "|")) | |
b.ResetTimer() | |
for i := 0; i < b.N; i++ { | |
re.FindStringIndex(s) | |
} | |
} | |
func benchmarkCFAhoCorasick(b *testing.B, s string, needles []string) { | |
m := ahocorasick.NewStringMatcher(needles) | |
b.ResetTimer() | |
for i := 0; i < b.N; i++ { | |
m.Match([]byte(s)) | |
} | |
} | |
func BenchmarkRKOrganic(b *testing.B) { | |
benchmarkRK(b, text1, wordlist1) | |
} | |
func BenchmarkNaiveOrganic(b *testing.B) { | |
benchmarkNaiveSearch(b, text1, wordlist1) | |
} | |
func BenchmarkRegexOrganic(b *testing.B) { | |
benchmarkRegexSearch(b, text1, wordlist1) | |
} | |
func BenchmarkACOrganic(b *testing.B) { | |
benchmarkCFAhoCorasick(b, text1, wordlist1) | |
} | |
func BenchmarkRKTargetRich(b *testing.B) { | |
benchmarkRK(b, text2, wordlist2) | |
} | |
func BenchmarkNaiveTargetRich(b *testing.B) { | |
benchmarkNaiveSearch(b, text2, wordlist2) | |
} | |
func BenchmarkRegexTargetRich(b *testing.B) { | |
benchmarkRegexSearch(b, text2, wordlist2) | |
} | |
func BenchmarkACTargetRich(b *testing.B) { | |
benchmarkCFAhoCorasick(b, text2, wordlist2) | |
} | |
func BenchmarkRKManyNeedlesNoMatch(b *testing.B) { | |
benchmarkRK(b, text1, wordlist2) | |
} | |
func BenchmarkNaiveManyNeedlesNoMatch(b *testing.B) { | |
benchmarkNaiveSearch(b, text1, wordlist2) | |
} | |
func BenchmarkRegexManyNeedlesNoMatch(b *testing.B) { | |
benchmarkRegexSearch(b, text1, wordlist2) | |
} | |
func BenchmarkACManyNeedlesNoMatch(b *testing.B) { | |
benchmarkCFAhoCorasick(b, text1, wordlist2) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment