Last active
March 5, 2024 20:49
-
-
Save pelletier/46320448776c9ba9163270130ef556aa to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Python script that automates the benchmarking of the Go utf8.Valid function
over small inputs.

This script will clone the Go repository, build the Go toolchain, and run the
utf8.Valid benchmarks over small inputs, both on the CL and its parent, and plot
the time per operation.

Requirements:
- Python 3.9 or newer (the script uses os.waitstatus_to_exitcode)
- Git
- Go recent enough to build the Go 1.21 toolchain.
- Gnuplot

Usage:
$ python3 benchmark-utf8valid.py

CL: https://go-review.googlesource.com/c/go/+/535838
"""
import itertools
import logging
import os
import re
import statistics
# Script-wide logger. A StreamHandler with no explicit stream writes to stderr.
logger = logging.getLogger("script")
logger.addHandler(logging.StreamHandler())
def shell(command, can_fail=False):
    """Run ``command`` through the system shell and return its exit code.

    Args:
        command: Shell command line, passed verbatim to ``os.system``.
        can_fail: When true, a non-zero exit code is returned to the caller
            instead of raising.

    Returns:
        The command's exit code as decoded by ``os.waitstatus_to_exitcode``.

    Raises:
        Exception: If the command exits with a non-zero code and ``can_fail``
            is false.
    """
    # Log before running so the command appears ahead of its own output.
    logger.debug("Shell: '%s'", command)
    exit_status = os.system(command)
    exit_code = os.waitstatus_to_exitcode(exit_status)
    logger.debug("Exit Code: '%s'", exit_code)
    # Compare by value: `is not 0` tests object identity, which is only right
    # by accident of CPython's small-int cache and raises a SyntaxWarning.
    if exit_code != 0 and not can_fail:
        raise Exception(f"Shell command '{command}' failed with exit code {exit_code}")
    return exit_code
def plot(base_path, avx2_path):
    """Plot mean ns/op against input size for the base and AVX2 benchmark runs.

    Reads the ``go test -bench`` output stored in ``base_path`` and
    ``avx2_path``, averages the repeated ``BenchmarkValid/small<N>`` samples
    for each size, writes the points to ``plot.dat`` and a gnuplot script to
    ``plot.p`` in the current directory, then prints the script and runs
    gnuplot on it. The script directs gnuplot's output to ``data.png``.

    Args:
        base_path: Path to the benchmark output of the parent commit.
        avx2_path: Path to the benchmark output of the CL.
    """
    print("========== PLOT ============")

    # Insertion order fixes both the data-set index in plot.dat and the order
    # of the plot clauses below.
    data = {
        "base": _mean_time_by_size(_read_small_benchmarks(base_path)),
        "avx2": _mean_time_by_size(_read_small_benchmarks(avx2_path)),
    }

    data_file = 'plot.dat'
    with open(data_file, 'w') as f:
        for idx, points in enumerate(data.values()):
            if idx > 0:
                # print() appends its own newline, so this emits the two blank
                # lines gnuplot uses to separate `index` data sets.
                print("\n", file=f)
            for size, mean in points:
                print(" {} {}".format(size, mean), file=f)

    script_file = 'plot.p'
    output = 'data.png'
    with open(script_file, 'w') as f:
        print(f"""
set terminal svg enhanced mouse size 1024,758
set terminal pngcairo size 1024,758
set output '{output}'
set ylabel 'ns/op'
set xlabel 'bytes'
set key inside top left
""", file=f)
        clauses = []
        for idx, title in enumerate(data):
            # Only the first clause names the data file; an empty file name
            # tells gnuplot to reuse the previous one.
            command, file_name = ('plot', data_file) if idx == 0 else (' ', '')
            clauses.append(
                f"{command} '{file_name}' index {idx} with linespoints"
                f" linestyle {idx + 1} title \"{title}\""
            )
        # A trailing backslash continues the single plot command across lines.
        print(", \\\n".join(clauses), file=f)

    shell(f"cat {script_file}")
    shell(f"gnuplot {script_file}")


def _read_small_benchmarks(path):
    """Return ``(size, ns_per_op)`` samples for the ``small<N>`` benchmark rows in ``path``."""
    row_re = re.compile(r"^BenchmarkValid/small([0-9]+)-[0-9]+\s+[0-9]+\s+([0-9.]+) ns/op\s+")
    samples = []
    with open(path) as f:
        for line in f:
            m = row_re.match(line)
            if m is not None:
                size, ns_op = m.groups()
                samples.append((int(size), float(ns_op)))
    return samples


def _mean_time_by_size(samples):
    """Average ``(size, time)`` samples per size and return points sorted by size."""
    # groupby only merges adjacent items, so sort by size first; otherwise
    # repetitions that are not contiguous would yield several points per size.
    ordered = sorted(samples, key=lambda sample: sample[0])
    return [
        (size, statistics.fmean(time for _, time in group))
        for size, group in itertools.groupby(ordered, key=lambda sample: sample[0])
    ]
def main():
    """Benchmark utf8.Valid on the CL and on its parent commit, then plot both.

    Clones the Go repository into ``./go`` when it is missing, fetches the CL
    patchset into a local branch, and benchmarks that branch into
    ``avx2.txt``. It then checks out the parent commit, writes the benchmark
    source held in ``valid_test_go`` into ``src/unicode/utf8``, and benchmarks
    it into ``base.txt``. Both result files live next to the ``go`` directory,
    and each benchmark is skipped when its result file already exists.

    The working directory is changed to ``go`` and never restored, so the
    files written by ``plot`` end up inside the clone.
    """
    cl_number = "535838"
    cl_branch = f"change-{cl_number}"
    cl_version = 5
    # Gerrit shards change refs by the last two digits of the change number:
    # refs/changes/<NN>/<change>/<patchset>. Derive the shard so it cannot
    # drift out of sync with cl_number.
    cl_ref = f"refs/changes/{int(cl_number) % 100:02d}/{cl_number}/{cl_version}"

    logger.setLevel(logging.DEBUG)

    if not os.path.exists("go"):
        logger.info("Cloning Go")
        shell("git clone https://go.googlesource.com/go")
    else:
        logger.info("go repository already exists")
    os.chdir("go")

    if shell(f"git show-ref --verify --quiet refs/heads/{cl_branch}", can_fail=True) != 0:
        shell(f"git fetch https://go.googlesource.com/go {cl_ref} && git checkout -b {cl_branch} FETCH_HEAD")
    else:
        logger.info("CL branch %s already exists", cl_branch)

    # Drop leftovers from a previous run before switching branches.
    shell("git clean -fx")
    shell("git reset HEAD --hard")

    logger.info("Preparing CL branch %s", cl_branch)
    shell(f"git checkout {cl_branch}")
    if not os.path.exists("../avx2.txt"):
        _build_and_benchmark("../avx2.txt")
    else:
        logger.info("avx2.txt already exists. Skipping CL benchmark")

    shell("git clean -fx")
    shell("git reset HEAD --hard")

    logger.info("Preparing parent commit")
    shell("git checkout HEAD~1")
    if not os.path.exists("../base.txt"):
        # Mode "x" is exclusive-create: it raises FileExistsError instead of
        # overwriting a benchmark file that is already present.
        with open("src/unicode/utf8/valid_test.go", "x") as f:
            f.write(valid_test_go)
        _build_and_benchmark("../base.txt")
    else:
        logger.info("base.txt already exists. Skipping base benchmark")

    logger.info("Benchmarking complete. Plotting results")
    plot("../base.txt", "../avx2.txt")


def _build_and_benchmark(results_path):
    """Build the toolchain of the current checkout and tee the small-input benchmarks to ``results_path``."""
    shell("cd src && ./make.bash")
    shell(f"GOROOT=$PWD ./bin/go test -bench=BenchmarkValid$/small -count=6 ./src/unicode/utf8 | tee {results_path}")
# Go benchmark source that main() writes to src/unicode/utf8/valid_test.go on
# the parent commit. It is a raw string so the Go escape sequences (\xF4 ...)
# reach the Go compiler unchanged.
valid_test_go = r"""
package utf8_test

import (
	"bytes"
	"fmt"
	"testing"
	"unicode/utf8"
)

var valid1k = bytes.Repeat([]byte("0123456789日本語日本語日本語日abcdefghijklmnopqrstuvwx"), 16)
var valid1M = bytes.Repeat(valid1k, 1024)
var someutf8 = []byte("\xF4\x8F\xBF\xBF")

type input struct {
	name string
	data []byte
}

func benchmarkInputs() []input {
	inputs := []input{
		{"1kValid", valid1k},
		{"1MValid", valid1M},
		{"10ASCII", []byte("0123456789")},
		{"1kASCII", bytes.Repeat([]byte{'A'}, 1024)},
		{"1MASCII", bytes.Repeat([]byte{'A'}, 1024*1024)},
		{"1kInvalid", append([]byte{'\xF4'}, bytes.Repeat([]byte{'A'}, 1023)...)},
		{"10Japan", []byte("日本語日本語日本語日")},
	}

	const KiB = 1024
	const MiB = 1024 * 1024

	for i := 0; i <= 400/(2*len(someutf8)); i++ {
		d := bytes.Repeat(someutf8, i*2)
		inputs = append(inputs, input{
			name: fmt.Sprintf("small%d", len(d)),
			data: d,
		})
	}

	for _, i := range []int{1 * KiB, 8 * KiB, 16 * KiB, 64 * KiB, 1 * MiB, 8 * MiB, 32 * MiB, 64 * MiB} {
		d := bytes.Repeat(someutf8, i/len(someutf8))
		inputs = append(inputs, input{
			name: fmt.Sprintf("%d", len(d)),
			data: d,
		})
	}

	for _, i := range []int{300, 316} {
		d := bytes.Repeat(someutf8, i/len(someutf8))
		inputs = append(inputs, input{
			name: fmt.Sprintf("tail%d", len(d)),
			data: d,
		})
	}

	return inputs
}

func BenchmarkValid(b *testing.B) {
	for _, input := range benchmarkInputs() {
		b.Run(input.name, func(b *testing.B) {
			b.SetBytes(int64(len(input.data)))
			b.ResetTimer()
			for i := 0; i < b.N; i++ {
				utf8.Valid(input.data)
			}
		})
	}
}
"""
# Run the benchmark only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment