Skip to content

Instantly share code, notes, and snippets.

@pelletier
Last active March 5, 2024 20:49
Show Gist options
  • Save pelletier/46320448776c9ba9163270130ef556aa to your computer and use it in GitHub Desktop.
Save pelletier/46320448776c9ba9163270130ef556aa to your computer and use it in GitHub Desktop.
"""Python script that automates the benchmarking of Go utf8.Valid function over
small inputs.
This script will clone the Go repository, build the Go toolchain, and run the
utf8.Valid benchmarks over small inputs, both on the CL and its parent, and plot
the time per operation.
Requirements:
- Python 3
- Git
- Go recent enough to build the Go 1.21 toolchain.
- Gnuplot
Usage:
$ python3 benchmark-utf8valid.py
CL: https://go-review.googlesource.com/c/go/+/535838
"""
import os
import re
import logging
import itertools
import statistics
# Script-wide logger named "script"; a stream handler is attached so records
# are printed as soon as main() lowers the level.
_stream_handler = logging.StreamHandler()
logger = logging.getLogger("script")
logger.addHandler(_stream_handler)
def shell(command, can_fail=False):
    """Run ``command`` through the system shell and return its exit code.

    Args:
        command: Command line passed to ``os.system``, so shell syntax such
            as ``&&``, pipes and variable expansion is interpreted.
        can_fail: When true, a non-zero exit code is returned to the caller
            instead of raising.

    Returns:
        The exit code decoded from the wait status by
        ``os.waitstatus_to_exitcode``.

    Raises:
        Exception: If the command exits with a non-zero code and ``can_fail``
            is false.
    """
    # Log before running so a long-lived command is visible while it runs.
    logger.debug("Shell: '%s'", command)
    exit_status = os.system(command)
    exit_code = os.waitstatus_to_exitcode(exit_status)
    logger.debug("Exit Code: '%s'", exit_code)
    # Compare by value: `is not 0` tests object identity, which only works by
    # accident of small-int caching and triggers a SyntaxWarning.
    if exit_code != 0 and not can_fail:
        raise Exception(f"Shell command '{command}' failed with exit code {exit_code}")
    return exit_code
# Matches one `go test -bench` result row of a "small<N>" sub-benchmark and
# captures N (input size in bytes) and the ns/op figure.
_BENCH_ROW_RE = re.compile(r"^BenchmarkValid/small([0-9]+)-[0-9]+\s+[0-9]+\s+([0-9.]+) ns/op\s+")


def _read_benchmark_times(path):
    """Return ``{size: [ns_per_op, ...]}`` for every small<N> row in ``path``."""
    samples = {}
    with open(path) as f:
        for line in f:
            m = _BENCH_ROW_RE.match(line)
            if m is None:
                continue
            size, ns_op = m.groups()
            # Group by size explicitly so repetitions of one benchmark are
            # averaged together even when they are not on consecutive lines.
            samples.setdefault(int(size), []).append(float(ns_op))
    return samples


def _mean_by_size(samples):
    """Return ``(size, mean ns/op)`` pairs ordered by increasing size."""
    return [(size, statistics.fmean(times)) for size, times in sorted(samples.items())]


def _write_data_file(path, data):
    """Write each series of ``data`` as one gnuplot data block in ``path``."""
    with open(path, "w") as f:
        for position, points in enumerate(data.values()):
            if position:
                # Two blank lines start a new gnuplot "index" (data block).
                print("\n", file=f)
            for size, mean in points:
                print(f" {size} {mean}", file=f)


def _write_gnuplot_script(path, data_file, output, titles):
    """Write a gnuplot script plotting one line per title from ``data_file``."""
    with open(path, "w") as f:
        # NOTE: the second `set terminal` replaces the first, so `output` is
        # rendered by the pngcairo terminal.
        print(f"""
set terminal svg enhanced mouse size 1024,758
set terminal pngcairo size 1024,758
set output '{output}'
set ylabel 'ns/op'
set xlabel 'bytes'
set key inside top left
""", file=f)
        clauses = []
        for idx, title in enumerate(titles):
            # Only the first clause carries the `plot` keyword and the file
            # name; gnuplot reuses the previous file for an empty name.
            command = "plot" if idx == 0 else " "
            file_name = data_file if idx == 0 else ""
            clauses.append(
                f"{command} '{file_name}' index {idx} with linespoints linestyle {idx + 1} title \"{title}\""
            )
        # Clauses of a single plot command are joined with ", \" line
        # continuations.
        print(", \\\n".join(clauses), file=f)


def plot(base_path, avx2_path):
    """Plot mean ns/op against input size for the two benchmark result files.

    Reads the ``BenchmarkValid/small<N>`` rows of both files, averages the
    repetitions of each size, and writes ``plot.dat`` (data), ``plot.p``
    (gnuplot script) and, through gnuplot, ``data.png`` into the current
    working directory.

    Args:
        base_path: Path of the ``go test -bench`` output for the parent
            commit; plotted as the "base" series.
        avx2_path: Path of the ``go test -bench`` output for the CL; plotted
            as the "avx2" series.

    Raises:
        Exception: Propagated from ``shell`` when ``cat`` or ``gnuplot``
            exits with a non-zero code.
    """
    print("========== PLOT ============")
    data = {
        name: _mean_by_size(_read_benchmark_times(path))
        for name, path in (("base", base_path), ("avx2", avx2_path))
    }

    data_file = "plot.dat"
    script_file = "plot.p"
    output = "data.png"
    _write_data_file(data_file, data)
    _write_gnuplot_script(script_file, data_file, output, list(data))

    # Echo the generated script before running it.
    shell(f"cat {script_file}")
    shell(f"gnuplot {script_file}")
def _run_benchmark(result_path):
    """Build the checked-out toolchain and benchmark it into ``result_path``.

    Must be called with the Go checkout as the working directory. The result
    file only appears once the benchmark has succeeded, so an interrupted or
    failed run is not mistaken for a finished one on the next invocation.
    """
    partial_path = f"{result_path}.partial"
    shell("cd src && ./make.bash")
    # os.system uses /bin/sh, where the status of a pipeline is that of `tee`
    # and a failing `go test` would go unnoticed. bash (already required by
    # make.bash) with pipefail propagates the failure.
    shell(
        "bash -o pipefail -c "
        f"'GOROOT=$PWD ./bin/go test -bench=BenchmarkValid$/small -count=6 ./src/unicode/utf8 | tee {partial_path}'"
    )
    os.replace(partial_path, result_path)


def main():
    """Benchmark utf8.Valid on the CL and on its parent commit, then plot both.

    Clones the Go repository into ``go`` when it is missing, fetches the CL
    into a local branch, and benchmarks the CL (``avx2.txt``) and its parent
    (``base.txt``). A result file that already exists is reused and its
    benchmark skipped. The working directory is changed to the ``go`` checkout
    and not restored, so result paths are given relative to it (``../``) and
    the plot is produced from inside it.

    Raises:
        Exception: Propagated from ``shell`` when a required command fails.
        FileExistsError: If the parent commit already has a
            ``src/unicode/utf8/valid_test.go``.
    """
    cl_number = "535838"
    cl_branch = f"change-{cl_number}"
    cl_version = 5
    # Gerrit shards change refs by the last two digits of the change number
    # (zero-padded), so derive the shard instead of hard-coding it.
    cl_shard = f"{int(cl_number) % 100:02d}"
    logger.setLevel(logging.DEBUG)

    if not os.path.exists("go"):
        logger.info("Cloning Go")
        shell("git clone https://go.googlesource.com/go")
    else:
        logger.info("go repository already exists")
    os.chdir("go")

    if shell(f"git show-ref --verify --quiet refs/heads/{cl_branch}", can_fail=True) != 0:
        shell(f"git fetch https://go.googlesource.com/go refs/changes/{cl_shard}/{cl_number}/{cl_version} && git checkout -b {cl_branch} FETCH_HEAD")
    else:
        logger.info("CL branch %s already exists", cl_branch)

    # Start from a pristine tree so leftovers of a previous run cannot leak in.
    shell("git clean -fx")
    shell("git reset HEAD --hard")
    logger.info("Preparing CL branch %s", cl_branch)
    shell(f"git checkout {cl_branch}")
    if not os.path.exists("../avx2.txt"):
        _run_benchmark("../avx2.txt")
    else:
        logger.info("avx2.txt already exists. Skipping CL benchmark")

    shell("git clean -fx")
    shell("git reset HEAD --hard")
    logger.info("Preparing parent commit")
    shell("git checkout HEAD~1")
    if not os.path.exists("../base.txt"):
        # Add the benchmark file to the parent commit. Mode "x" refuses to
        # overwrite an existing file; UTF-8 is required because Go source is
        # UTF-8 and the benchmark contains non-ASCII text.
        with open("src/unicode/utf8/valid_test.go", "x", encoding="utf-8") as f:
            f.write(valid_test_go)
        _run_benchmark("../base.txt")
    else:
        logger.info("base.txt already exists. Skipping base benchmark")

    logger.info("Benchmarking complete. Plotting results")
    plot("../base.txt", "../avx2.txt")
# Go benchmark source that main() writes to src/unicode/utf8/valid_test.go on
# the parent commit before benchmarking it. It is a raw string so that Go
# escapes such as \xF4 reach the file verbatim. The "small<N>" sub-benchmark
# names generated here are the ones plot() extracts from the results.
valid_test_go = r"""
package utf8_test
import (
"bytes"
"fmt"
"testing"
"unicode/utf8"
)
var valid1k = bytes.Repeat([]byte("0123456789日本語日本語日本語日abcdefghijklmnopqrstuvwx"), 16)
var valid1M = bytes.Repeat(valid1k, 1024)
var someutf8 = []byte("\xF4\x8F\xBF\xBF")
type input struct {
name string
data []byte
}
func benchmarkInputs() []input {
inputs := []input{
{"1kValid", valid1k},
{"1MValid", valid1M},
{"10ASCII", []byte("0123456789")},
{"1kASCII", bytes.Repeat([]byte{'A'}, 1024)},
{"1MASCII", bytes.Repeat([]byte{'A'}, 1024*1024)},
{"1kInvalid", append([]byte{'\xF4'}, bytes.Repeat([]byte{'A'}, 1023)...)},
{"10Japan", []byte("日本語日本語日本語日")},
}
const KiB = 1024
const MiB = 1024 * 1024
for i := 0; i <= 400/(2*len(someutf8)); i++ {
d := bytes.Repeat(someutf8, i*2)
inputs = append(inputs, input{
name: fmt.Sprintf("small%d", len(d)),
data: d,
})
}
for _, i := range []int{1 * KiB, 8 * KiB, 16 * KiB, 64 * KiB, 1 * MiB, 8 * MiB, 32 * MiB, 64 * MiB} {
d := bytes.Repeat(someutf8, i/len(someutf8))
inputs = append(inputs, input{
name: fmt.Sprintf("%d", len(d)),
data: d,
})
}
for _, i := range []int{300, 316} {
d := bytes.Repeat(someutf8, i/len(someutf8))
inputs = append(inputs, input{
name: fmt.Sprintf("tail%d", len(d)),
data: d,
})
}
return inputs
}
func BenchmarkValid(b *testing.B) {
for _, input := range benchmarkInputs() {
b.Run(input.name, func(b *testing.B) {
b.SetBytes(int64(len(input.data)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
utf8.Valid(input.data)
}
})
}
}
"""
# Run the benchmark workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment