Skip to content

Instantly share code, notes, and snippets.

@the80srobot
Created February 11, 2022 20:23
Show Gist options
  • Save the80srobot/e4cd02afd132368c6d99bdd79baad146 to your computer and use it in GitHub Desktop.
Save the80srobot/e4cd02afd132368c6d99bdd79baad146 to your computer and use it in GitHub Desktop.
Benchmarking some string algorithms
package rk
import (
"regexp"
"strings"
"testing"
"github.com/cloudflare/ahocorasick"
)
const PrimeRK = 16777619
var (
wordlist1 = []string{
"mozilla",
"windows",
"firefox",
"1234567",
"current",
"lottery",
}
wordlist2 = []string{
"pizzazz",
"zyzzyva",
"fuzzbox",
"pizazzy",
"jacuzzi",
"jazzman",
"jazzmen",
"jazzbos",
"jazzily",
"jazzing",
"zizzing",
"quizzed",
"zizzled",
"quizzer",
"quizzes",
"wazzock",
"zizzles",
"buzzwig",
"jazzers",
"jazzier",
"buzzcut",
"muzjiks",
"buzzing",
"buzzsaw",
"fuzzily",
"fuzzing",
"muzzily",
"schizzy",
"abbozzo",
"chazzan",
"chazzen",
"fizzily",
"fizzing",
"frizzly",
"jejunum",
"jimjams",
"jukebox",
"jumbuck",
"mezuzah",
"muzzled",
"puzzled",
"schnozz",
"wazzing",
"whizzed",
"zhuzhed",
"buzzard",
"fizzled",
"grizzly",
"guzzled",
"huzzahs",
"jujubes",
"jujuism",
"mizzled",
"muezzin",
"muzzler",
"muzzles",
"puzzler",
"puzzles",
"whizzer",
"whizzes",
"zhuzhes",
"bezique",
"bezzant",
"buzzers",
"buzzier",
"cazique",
"dizzily",
"drizzly",
"fizzles",
"frazzle",
"frizzed",
"frizzle",
"fuzzier",
"guzzler",
"guzzles",
"huzzaed",
"jezebel",
"jumpoff",
"karezza",
"mazzard",
"mezquit",
"mezuzas",
"mezuzot",
"mizzens",
"mizzles",
"muzzier",
"nuzzled",
"palazzi",
"palazzo",
"pizzles",
"sizzurp",
"skyjack",
"swizzle",
"twizzle",
"wizzens",
"zigzags",
"bezzies",
"bizzies",
"cozzies",
"dazzled",
"fizzers",
"fizzier",
"frizzer",
"frizzes",
"gizzard",
"grizzle",
"hazzans",
"jejunal",
"jojobas",
"jonquil",
"jujutsu",
"mozzies",
"nuzzler",
"nuzzles",
"piazzas",
"pozzies",
"prezzie",
"razzing",
"sezzing",
"shizzle",
"spazzes",
"squeezy",
"swizzes",
"azulejo",
"dazzler",
"dazzles",
"dizzied",
"drizzle",
"jackdaw",
"jinxing",
"jujitsu",
"jujuist",
"jumpcut",
"kibbutz",
"kolkhoz",
"kolkozy",
"lockjaw",
"nozzles",
"packwax",
"quetzal",
"quezals",
"quickly",
"sizzled",
"sozzled",
"ziplock",
"zombify",
"blowjob",
"carjack",
"coxcomb",
"dizzier",
"dizzies",
"exequys",
"izzards",
"jacking",
"jackleg",
"jackpot",
"jaywalk",
"jezails",
"jobwork",
"jumpily",
"jumping",
"junkman",
"junkmen",
"kibbitz",
"muzhiks",
"ozonize",
"sizzler",
"sizzles",
"sjambok",
"sovkhoz",
"squawky",
"squeeze",
"zardozi",
"zebecks",
"zinkify",
"zymurgy",
"azotize",
"buzukia",
"buzukis",
"chutzpa",
"cyclize",
"djibbah",
"enzymic",
"equinox",
"flummox",
"gazumps",
"hijacks",
"jabbing",
"jaloppy",
"jambing",
"jamming",
"jibbing",
"jibboom",
"jimminy",
"jobbing",
"jockeys",
"jockish",
"jubbahs",
"jugulum",
"jukskei",
}
text1 = "Mozilla Firefox, or simply Firefox, is a free and open-source[19] web browser developed by the Mozilla Foundation and its subsidiary, the Mozilla Corporation. Firefox uses the Gecko rendering engine to display web pages, which implements current and anticipated web standards"
text2 = `
zyzzyva
zymurgy
zombify
zizzles
zizzled
zizzing
ziplock
zinkify
zigzags
zhuzhes
zhuzhed
zebecks
zardozi
wizzens
whizzes
whizzer
whizzed
wazzock
wazzing
twizzle
swizzle
swizzes
squeezy
squeeze
squawky
spazzes
sozzled
sovkhoz
skyjack
sjambok
sizzurp
sizzles
sizzler
sizzled
shizzle
sezzing
schnozz
schizzy
razzing
quizzes
quizzer
quizzed
quickly
quezals
quetzal
puzzles
puzzler
puzzled
prezzie
pozzies
pizzles
pizzazz
pizazzy
piazzas
palazzo
palazzi
packwax
ozonize
nuzzles
nuzzler
nuzzled
nozzles
muzzles
muzzler
muzzled
muzzily
muzzier
muzjiks
muzhiks
muezzin
mozzies
mizzles
mizzled
mizzens
mezuzot
mezuzas
mezuzah
mezquit
mazzard
lockjaw
kolkozy
kolkhoz
kibbutz
kibbitz
karezza
junkmen
junkman
jumpoff
jumping
jumpily
jumpcut
jumbuck
jukskei
jukebox
jujutsu
jujuist
jujuism
jujubes
jujitsu
jugulum
jubbahs
jonquil
jojobas
jockish
jockeys
jobwork
jobbing
jinxing
jimminy
jimjams
jibboom
jibbing
jezebel
jezails
jejunum
jejunal
jazzmen
jazzman
jazzing
jazzily
jazzier
jazzers
jazzbos
jaywalk
jamming
jambing
jaloppy
jacuzzi
jackpot
jackleg
jacking
jackdaw
jabbing
izzards
huzzahs
huzzaed
hijacks
hazzans
guzzles
guzzler
guzzled
grizzly
grizzle
gizzard
gazumps
fuzzing
fuzzily
fuzzier
fuzzbox
frizzly
frizzle
frizzes
frizzer
frizzed
frazzle
flummox
fizzles
fizzled
fizzing
fizzily
fizzier
fizzers
exequys
equinox
enzymic
drizzly
drizzle
djibbah
dizzily
dizzies
dizzier
dizzied
dazzles
dazzler
dazzled
cyclize
cozzies
coxcomb
chutzpa
chazzen
chazzan
cazique
carjack
buzzwig
buzzsaw
buzzing
buzzier
buzzers
buzzcut
buzzard
buzukis
buzukia
blowjob
bizzies
bezzies
bezzant
bezique
azulejo
azotize
abbozzo
`
)
func HashStr(sep string) (uint32, uint32) {
hash := uint32(0)
for i := 0; i < len(sep); i++ {
hash = hash*PrimeRK + uint32(sep[i])
}
var pow, sq uint32 = 1, PrimeRK
for i := len(sep); i > 0; i >>= 1 {
if i&1 != 0 {
pow *= sq
}
sq *= sq
}
return hash, pow
}
func RabinKarpBake(needles []string) (set map[string]struct{}, hset map[uint32]struct{}, pow uint32) {
hset = make(map[uint32]struct{}, len(needles))
set = make(map[string]struct{}, len(needles))
for _, s := range needles {
var h uint32
h, pow = HashStr(s)
hset[h] = struct{}{}
set[s] = struct{}{}
}
return set, hset, pow
}
func IndexRabinKarpMultiple(s string, needles map[string]struct{}, hset map[uint32]struct{}, n int, pow uint32) int {
// Rabin-Karp search
var h uint32
for i := 0; i < n; i++ {
h = h*PrimeRK + uint32(s[i])
}
if _, ok := hset[h]; ok {
if _, ok := needles[s[:n]]; ok {
return 0
}
}
for i := n; i < len(s); {
h *= PrimeRK
h += uint32(s[i])
h -= pow * uint32(s[i-n])
i++
if _, ok := hset[h]; ok {
if _, ok := needles[s[i-n:i]]; ok {
return i - n
}
}
}
return -1
}
func naiveIndexAnySubstring(s string, needles []string) int {
res := 0
for _, needle := range needles {
idx := strings.Index(s, needle)
if idx >= 0 && idx < res {
res = idx
}
}
return res
}
func benchmarkRK(b *testing.B, s string, needles []string) {
set, hset, pow := RabinKarpBake(needles)
n := len(needles[0])
b.ResetTimer()
for i := 0; i < b.N; i++ {
IndexRabinKarpMultiple(s, set, hset, n, pow)
}
}
func benchmarkNaiveSearch(b *testing.B, s string, needles []string) {
for i := 0; i < b.N; i++ {
naiveIndexAnySubstring(s, needles)
}
}
func benchmarkRegexSearch(b *testing.B, s string, needles []string) {
re := regexp.MustCompile(strings.Join(needles, "|"))
b.ResetTimer()
for i := 0; i < b.N; i++ {
re.FindStringIndex(s)
}
}
func benchmarkCFAhoCorasick(b *testing.B, s string, needles []string) {
m := ahocorasick.NewStringMatcher(needles)
b.ResetTimer()
for i := 0; i < b.N; i++ {
m.Match([]byte(s))
}
}
func BenchmarkRKOrganic(b *testing.B) {
benchmarkRK(b, text1, wordlist1)
}
func BenchmarkNaiveOrganic(b *testing.B) {
benchmarkNaiveSearch(b, text1, wordlist1)
}
func BenchmarkRegexOrganic(b *testing.B) {
benchmarkRegexSearch(b, text1, wordlist1)
}
func BenchmarkACOrganic(b *testing.B) {
benchmarkCFAhoCorasick(b, text1, wordlist1)
}
func BenchmarkRKTargetRich(b *testing.B) {
benchmarkRK(b, text2, wordlist2)
}
func BenchmarkNaiveTargetRich(b *testing.B) {
benchmarkNaiveSearch(b, text2, wordlist2)
}
func BenchmarkRegexTargetRich(b *testing.B) {
benchmarkRegexSearch(b, text2, wordlist2)
}
func BenchmarkACTargetRich(b *testing.B) {
benchmarkCFAhoCorasick(b, text2, wordlist2)
}
func BenchmarkRKManyNeedlesNoMatch(b *testing.B) {
benchmarkRK(b, text1, wordlist2)
}
func BenchmarkNaiveManyNeedlesNoMatch(b *testing.B) {
benchmarkNaiveSearch(b, text1, wordlist2)
}
func BenchmarkRegexManyNeedlesNoMatch(b *testing.B) {
benchmarkRegexSearch(b, text1, wordlist2)
}
func BenchmarkACManyNeedlesNoMatch(b *testing.B) {
benchmarkCFAhoCorasick(b, text1, wordlist2)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment