pelletier · March 5, 2024 20:49
diff --git a/benchmark-utf8valid.py b/benchmark-utf8valid.py
 """Python script that automates the benchmarking of Go utf8.Valid function over
 small inputs.

 This script will clone the Go repository, build the Go toolchain, and run the
 utf8.Valid benchmarks over small inputs, both on the CL and its parent, and plot
 the time per operation.

 Requirements:

 - Python 3.9+ (the script uses os.waitstatus_to_exitcode, added in 3.9)
 - Git
 - Go recent enough to build the Go 1.21 toolchain.
 - Gnuplot

 Usage:

    $ python3 benchmark-utf8valid.py


 CL: https://go-review.googlesource.com/c/go/+/535838

 """


 import os
 import re
 import logging
 import itertools
 import statistics


 # Script-wide logger; records are emitted through a stream handler.
 _console_handler = logging.StreamHandler()
 logger = logging.getLogger("script")
 logger.addHandler(_console_handler)


 def shell(command, can_fail=False):
     """Run ``command`` through the system shell and return its exit code.

     Args:
         command: Command line handed verbatim to ``os.system``.
         can_fail: When true, a non-zero exit code is returned to the caller
             instead of raising.

     Returns:
         The exit code decoded from the wait status by
         ``os.waitstatus_to_exitcode``.

     Raises:
         RuntimeError: If the command exits with a non-zero code and
             ``can_fail`` is false.
     """
     # Log before running so the command is visible even if it never returns.
     logger.debug("Shell: '%s'", command)
     exit_code = os.waitstatus_to_exitcode(os.system(command))
     logger.debug("Exit Code: '%s'", exit_code)
     # Compare by value: `is not 0` tests object identity, which is only
     # accidentally correct for small ints and triggers a SyntaxWarning.
     if exit_code != 0 and not can_fail:
         raise RuntimeError(f"Shell command '{command}' failed with exit code {exit_code}")
     return exit_code

 def plot(base_path, avx2_path):
     """Render the base-vs-AVX2 comparison chart from two `go test -bench` logs.

     Every ``BenchmarkValid/small<N>`` result line is parsed from both files,
     repeated runs of the same input size are averaged, and the resulting
     curves are written to ``plot.dat`` together with a gnuplot script
     ``plot.p``.  gnuplot is then invoked to produce ``data.png``.  All three
     files are created in the current working directory.

     Args:
         base_path: Path to the benchmark output of the parent commit.
         avx2_path: Path to the benchmark output of the CL.

     Raises:
         OSError: If either input file cannot be opened.
         Exception: Propagated from ``shell`` when ``cat`` or ``gnuplot`` fails.
     """
     print("========== PLOT ============")

     # The "-<GOMAXPROCS>" suffix is optional: Go omits it when GOMAXPROCS is 1.
     row_re = re.compile(r"^BenchmarkValid/small([0-9]+)(?:-[0-9]+)?\s+[0-9]+\s+([0-9.]+) ns/op\s+")

     data = {}
     for name, path in [('base', base_path), ('avx2', avx2_path)]:
         points = []
         with open(path) as f:
             for line in f:
                 m = row_re.match(line)
                 if m is None:
                     continue
                 size, ns_op = m.groups()
                 points.append((int(size), float(ns_op)))

         # groupby only merges *adjacent* equal keys, so sort by size first;
         # otherwise a size that reappears later would yield a second point.
         points.sort(key=lambda p: p[0])
         data[name] = [
             (size, statistics.fmean(t for _, t in group))
             for size, group in itertools.groupby(points, key=lambda p: p[0])
         ]

     dataFile = 'plot.dat'
     titles = list(data)

     with open(dataFile, 'w') as f:
         for i, points in enumerate(data.values()):
             if i:
                 # Two blank lines separate gnuplot data sets ("index" blocks).
                 print("\n", file=f)
             for x, y in points:
                 print(" {}  {}".format(x, y), file=f)

     scriptFile = 'plot.p'
     output = 'data.png'

     with open(scriptFile, 'w') as f:
         print(f"""
    set terminal svg enhanced mouse size 1024,758
    set terminal pngcairo size 1024,758

    set output '{output}'

    set ylabel 'ns/op'
    set xlabel 'bytes'

    set key inside top left
    """, file=f)

         # One plot clause per series; only the first names the data file,
         # later ones use '' which gnuplot reads as "same file as before".
         clauses = []
         for idx, title in enumerate(titles):
             command = 'plot' if idx == 0 else '    '
             fileName = dataFile if idx == 0 else ''
             clauses.append(f"{command} '{fileName}' index {idx} with linespoints linestyle {idx+1} title \"{title}\"")
         print(", \\\n".join(clauses), file=f)

     # Echo the generated script so it shows up in the run log.
     shell(f"cat {scriptFile}")

     shell(f"gnuplot {scriptFile}")

 def _run_small_benchmarks(output_path):
     """Build the toolchain in ./src and save the small-input benchmarks to ``output_path``."""
     shell("cd src && ./make.bash")

     # Write to a side file and rename only on success, so an aborted run
     # never leaves a results file that later runs would mistake for complete.
     partial_path = output_path + ".partial"
     # pipefail makes a failing `go test` fail the whole pipeline; without it
     # the pipeline reports tee's exit status and benchmark errors are masked.
     shell(
         "bash -o pipefail -c '"
         "GOROOT=$PWD ./bin/go test -bench=BenchmarkValid$/small -count=6 ./src/unicode/utf8"
         f" | tee {partial_path}'"
     )
     os.replace(partial_path, output_path)


 def main():
     """Benchmark utf8.Valid on the CL and on its parent commit, then plot both.

     Clones the Go repository into ``./go`` if needed, fetches the CL into a
     local branch, builds and benchmarks the CL (results in ``avx2.txt``),
     does the same for the parent commit (results in ``base.txt``), and calls
     ``plot``.  Either benchmark is skipped when its results file already
     exists next to the ``go`` directory.

     Raises:
         Exception: Propagated from ``shell`` when a required command fails.
         FileExistsError: If ``valid_test.go`` already exists on the parent
             commit when the benchmark file is about to be written.
     """
     cl_number = "535838"
     cl_branch = f"change-{cl_number}"
     cl_version = 5
     # Gerrit shards change refs by the last two digits of the change number.
     cl_ref = f"refs/changes/{cl_number[-2:].zfill(2)}/{cl_number}/{cl_version}"

     logger.setLevel(logging.DEBUG)

     if not os.path.exists("go"):
         logger.info("Cloning Go")
         shell("git clone https://go.googlesource.com/go")
     else:
         logger.info("go repository already exists")

     os.chdir("go")

     if shell(f"git show-ref --verify --quiet refs/heads/{cl_branch}", can_fail=True) != 0:
         shell(f"git fetch https://go.googlesource.com/go {cl_ref} && git checkout -b {cl_branch} FETCH_HEAD")
     else:
         logger.info("CL branch %s already exists", cl_branch)

     shell("git clean -fx")
     shell("git reset HEAD --hard")

     logger.info("Preparing CL branch %s", cl_branch)
     shell(f"git checkout {cl_branch}")

     if not os.path.exists("../avx2.txt"):
         _run_small_benchmarks("../avx2.txt")
     else:
         logger.info("avx2.txt already exists. Skipping CL benchmark")

     shell("git clean -fx")
     shell("git reset HEAD --hard")

     logger.info("Preparing parent commit")
     shell("git checkout HEAD~1")

     if not os.path.exists("../base.txt"):
         # Mode "x" refuses to overwrite: the benchmark file is expected to be
         # absent on the parent commit and is supplied by this script.
         with open("src/unicode/utf8/valid_test.go", "x") as f:
             f.write(valid_test_go)
         _run_small_benchmarks("../base.txt")
     else:
         logger.info("base.txt already exists. Skipping base benchmark")

     logger.info("Benchmarking complete. Plotting results")
     # NOTE(review): the working directory is still ./go here, so plot()
     # writes its output files inside the checkout, where the next run's
     # `git clean -fx` will presumably delete them - confirm this is intended.
     plot("../base.txt", "../avx2.txt")

 # Go benchmark source that main() writes to src/unicode/utf8/valid_test.go on
 # the parent commit so that the "BenchmarkValid/small<N>" sub-benchmarks can be
 # run there too. Kept as a raw string so the \x escapes reach the Go compiler
 # unchanged.
 valid_test_go = r"""
 package utf8_test

 import (
        "bytes"
        "fmt"
        "testing"
        "unicode/utf8"
 )

 var valid1k = bytes.Repeat([]byte("0123456789日本語日本語日本語日abcdefghijklmnopqrstuvwx"), 16)
 var valid1M = bytes.Repeat(valid1k, 1024)
 var someutf8 = []byte("\xF4\x8F\xBF\xBF")

 type input struct {
        name string
        data []byte
 }

 func benchmarkInputs() []input {
        inputs := []input{
                {"1kValid", valid1k},
                {"1MValid", valid1M},
                {"10ASCII", []byte("0123456789")},
                {"1kASCII", bytes.Repeat([]byte{'A'}, 1024)},
                {"1MASCII", bytes.Repeat([]byte{'A'}, 1024*1024)},
                {"1kInvalid", append([]byte{'\xF4'}, bytes.Repeat([]byte{'A'}, 1023)...)},
                {"10Japan", []byte("日本語日本語日本語日")},
        }

        const KiB = 1024
        const MiB = 1024 * 1024

        for i := 0; i <= 400/(2*len(someutf8)); i++ {
                d := bytes.Repeat(someutf8, i*2)
                inputs = append(inputs, input{
                        name: fmt.Sprintf("small%d", len(d)),
                        data: d,
                })
        }

        for _, i := range []int{1 * KiB, 8 * KiB, 16 * KiB, 64 * KiB, 1 * MiB, 8 * MiB, 32 * MiB, 64 * MiB} {
                d := bytes.Repeat(someutf8, i/len(someutf8))
                inputs = append(inputs, input{
                        name: fmt.Sprintf("%d", len(d)),
                        data: d,
                })
        }

        for _, i := range []int{300, 316} {
                d := bytes.Repeat(someutf8, i/len(someutf8))
                inputs = append(inputs, input{
                        name: fmt.Sprintf("tail%d", len(d)),
                        data: d,
                })
        }
        return inputs
 }

 func BenchmarkValid(b *testing.B) {
        for _, input := range benchmarkInputs() {
                b.Run(input.name, func(b *testing.B) {
                        b.SetBytes(int64(len(input.data)))
                        b.ResetTimer()
                        for i := 0; i < b.N; i++ {
                                utf8.Valid(input.data)
                        }
                })
        }
 }
 """


 # Run the benchmark pipeline only when executed as a script, not on import.
 if __name__ == '__main__':
    main()
	"""Python script that automates the benchmarking of Go utf8.Valid function over
	small inputs.

	This script will clone the Go repository, build the Go toolchain, and run the
	utf8.Valid benchmarks over small inputs, both on the CL and its parent, and plot
	the time per operation.

	Requirements:

	- Python 3.9+ (the script uses os.waitstatus_to_exitcode, added in 3.9)
	- Git
	- Go recent enough to build the Go 1.21 toolchain.
	- Gnuplot

	Usage:

	$ python3 benchmark-utf8valid.py


	CL: https://go-review.googlesource.com/c/go/+/535838

	"""


	import os
	import re
	import logging
	import itertools
	import statistics


	# Script-wide logger; records are emitted through a stream handler.
	_console_handler = logging.StreamHandler()
	logger = logging.getLogger("script")
	logger.addHandler(_console_handler)


	def shell(command, can_fail=False):
	    """Run ``command`` through the system shell and return its exit code.

	    Args:
	        command: Command line handed verbatim to ``os.system``.
	        can_fail: When true, a non-zero exit code is returned to the caller
	            instead of raising.

	    Returns:
	        The exit code decoded from the wait status by
	        ``os.waitstatus_to_exitcode``.

	    Raises:
	        RuntimeError: If the command exits with a non-zero code and
	            ``can_fail`` is false.
	    """
	    # Log before running so the command is visible even if it never returns.
	    logger.debug("Shell: '%s'", command)
	    exit_code = os.waitstatus_to_exitcode(os.system(command))
	    logger.debug("Exit Code: '%s'", exit_code)
	    # Compare by value: `is not 0` tests object identity, which is only
	    # accidentally correct for small ints and triggers a SyntaxWarning.
	    if exit_code != 0 and not can_fail:
	        raise RuntimeError(f"Shell command '{command}' failed with exit code {exit_code}")
	    return exit_code

	def plot(base_path, avx2_path):
	    """Render the base-vs-AVX2 comparison chart from two `go test -bench` logs.

	    Every ``BenchmarkValid/small<N>`` result line is parsed from both files,
	    repeated runs of the same input size are averaged, and the resulting
	    curves are written to ``plot.dat`` together with a gnuplot script
	    ``plot.p``.  gnuplot is then invoked to produce ``data.png``.  All three
	    files are created in the current working directory.

	    Args:
	        base_path: Path to the benchmark output of the parent commit.
	        avx2_path: Path to the benchmark output of the CL.

	    Raises:
	        OSError: If either input file cannot be opened.
	        Exception: Propagated from ``shell`` when ``cat`` or ``gnuplot`` fails.
	    """
	    print("========== PLOT ============")

	    # The "-<GOMAXPROCS>" suffix is optional: Go omits it when GOMAXPROCS is 1.
	    row_re = re.compile(r"^BenchmarkValid/small([0-9]+)(?:-[0-9]+)?\s+[0-9]+\s+([0-9.]+) ns/op\s+")

	    data = {}
	    for name, path in [('base', base_path), ('avx2', avx2_path)]:
	        points = []
	        with open(path) as f:
	            for line in f:
	                m = row_re.match(line)
	                if m is None:
	                    continue
	                size, ns_op = m.groups()
	                points.append((int(size), float(ns_op)))

	        # groupby only merges *adjacent* equal keys, so sort by size first;
	        # otherwise a size that reappears later would yield a second point.
	        points.sort(key=lambda p: p[0])
	        data[name] = [
	            (size, statistics.fmean(t for _, t in group))
	            for size, group in itertools.groupby(points, key=lambda p: p[0])
	        ]

	    dataFile = 'plot.dat'
	    titles = list(data)

	    with open(dataFile, 'w') as f:
	        for i, points in enumerate(data.values()):
	            if i:
	                # Two blank lines separate gnuplot data sets ("index" blocks).
	                print("\n", file=f)
	            for x, y in points:
	                print(" {} {}".format(x, y), file=f)

	    scriptFile = 'plot.p'
	    output = 'data.png'

	    with open(scriptFile, 'w') as f:
	        print(f"""
	set terminal svg enhanced mouse size 1024,758
	set terminal pngcairo size 1024,758

	set output '{output}'

	set ylabel 'ns/op'
	set xlabel 'bytes'

	set key inside top left
	""", file=f)

	        # One plot clause per series; only the first names the data file,
	        # later ones use '' which gnuplot reads as "same file as before".
	        clauses = []
	        for idx, title in enumerate(titles):
	            command = 'plot' if idx == 0 else ' '
	            fileName = dataFile if idx == 0 else ''
	            clauses.append(f"{command} '{fileName}' index {idx} with linespoints linestyle {idx+1} title \"{title}\"")
	        print(", \\\n".join(clauses), file=f)

	    # Echo the generated script so it shows up in the run log.
	    shell(f"cat {scriptFile}")

	    shell(f"gnuplot {scriptFile}")

	def _run_small_benchmarks(output_path):
	    """Build the toolchain in ./src and save the small-input benchmarks to ``output_path``."""
	    shell("cd src && ./make.bash")

	    # Write to a side file and rename only on success, so an aborted run
	    # never leaves a results file that later runs would mistake for complete.
	    partial_path = output_path + ".partial"
	    # pipefail makes a failing `go test` fail the whole pipeline; without it
	    # the pipeline reports tee's exit status and benchmark errors are masked.
	    # The pipe must be a bare `|`: an escaped `\|` reaches the shell as a
	    # literal argument and tee never runs.
	    shell(
	        "bash -o pipefail -c '"
	        "GOROOT=$PWD ./bin/go test -bench=BenchmarkValid$/small -count=6 ./src/unicode/utf8"
	        f" | tee {partial_path}'"
	    )
	    os.replace(partial_path, output_path)


	def main():
	    """Benchmark utf8.Valid on the CL and on its parent commit, then plot both.

	    Clones the Go repository into ``./go`` if needed, fetches the CL into a
	    local branch, builds and benchmarks the CL (results in ``avx2.txt``),
	    does the same for the parent commit (results in ``base.txt``), and calls
	    ``plot``.  Either benchmark is skipped when its results file already
	    exists next to the ``go`` directory.

	    Raises:
	        Exception: Propagated from ``shell`` when a required command fails.
	        FileExistsError: If ``valid_test.go`` already exists on the parent
	            commit when the benchmark file is about to be written.
	    """
	    cl_number = "535838"
	    cl_branch = f"change-{cl_number}"
	    cl_version = 5
	    # Gerrit shards change refs by the last two digits of the change number.
	    cl_ref = f"refs/changes/{cl_number[-2:].zfill(2)}/{cl_number}/{cl_version}"

	    logger.setLevel(logging.DEBUG)

	    if not os.path.exists("go"):
	        logger.info("Cloning Go")
	        shell("git clone https://go.googlesource.com/go")
	    else:
	        logger.info("go repository already exists")

	    os.chdir("go")

	    if shell(f"git show-ref --verify --quiet refs/heads/{cl_branch}", can_fail=True) != 0:
	        shell(f"git fetch https://go.googlesource.com/go {cl_ref} && git checkout -b {cl_branch} FETCH_HEAD")
	    else:
	        logger.info("CL branch %s already exists", cl_branch)

	    shell("git clean -fx")
	    shell("git reset HEAD --hard")

	    logger.info("Preparing CL branch %s", cl_branch)
	    shell(f"git checkout {cl_branch}")

	    if not os.path.exists("../avx2.txt"):
	        _run_small_benchmarks("../avx2.txt")
	    else:
	        logger.info("avx2.txt already exists. Skipping CL benchmark")

	    shell("git clean -fx")
	    shell("git reset HEAD --hard")

	    logger.info("Preparing parent commit")
	    shell("git checkout HEAD~1")

	    if not os.path.exists("../base.txt"):
	        # Mode "x" refuses to overwrite: the benchmark file is expected to be
	        # absent on the parent commit and is supplied by this script.
	        with open("src/unicode/utf8/valid_test.go", "x") as f:
	            f.write(valid_test_go)
	        _run_small_benchmarks("../base.txt")
	    else:
	        logger.info("base.txt already exists. Skipping base benchmark")

	    logger.info("Benchmarking complete. Plotting results")
	    # NOTE(review): the working directory is still ./go here, so plot()
	    # writes its output files inside the checkout, where the next run's
	    # `git clean -fx` will presumably delete them - confirm this is intended.
	    plot("../base.txt", "../avx2.txt")

	# Go benchmark source that main() writes to src/unicode/utf8/valid_test.go on
	# the parent commit so that the "BenchmarkValid/small<N>" sub-benchmarks can be
	# run there too. Kept as a raw string so the \x escapes reach the Go compiler
	# unchanged.
	valid_test_go = r"""
	package utf8_test

	import (
	"bytes"
	"fmt"
	"testing"
	"unicode/utf8"
	)

	var valid1k = bytes.Repeat([]byte("0123456789日本語日本語日本語日abcdefghijklmnopqrstuvwx"), 16)
	var valid1M = bytes.Repeat(valid1k, 1024)
	var someutf8 = []byte("\xF4\x8F\xBF\xBF")

	type input struct {
	name string
	data []byte
	}

	func benchmarkInputs() []input {
	inputs := []input{
	{"1kValid", valid1k},
	{"1MValid", valid1M},
	{"10ASCII", []byte("0123456789")},
	{"1kASCII", bytes.Repeat([]byte{'A'}, 1024)},
	{"1MASCII", bytes.Repeat([]byte{'A'}, 1024*1024)},
	{"1kInvalid", append([]byte{'\xF4'}, bytes.Repeat([]byte{'A'}, 1023)...)},
	{"10Japan", []byte("日本語日本語日本語日")},
	}

	const KiB = 1024
	const MiB = 1024 * 1024

	for i := 0; i <= 400/(2*len(someutf8)); i++ {
	d := bytes.Repeat(someutf8, i*2)
	inputs = append(inputs, input{
	name: fmt.Sprintf("small%d", len(d)),
	data: d,
	})
	}

	for _, i := range []int{1 * KiB, 8 * KiB, 16 * KiB, 64 * KiB, 1 * MiB, 8 * MiB, 32 * MiB, 64 * MiB} {
	d := bytes.Repeat(someutf8, i/len(someutf8))
	inputs = append(inputs, input{
	name: fmt.Sprintf("%d", len(d)),
	data: d,
	})
	}

	for _, i := range []int{300, 316} {
	d := bytes.Repeat(someutf8, i/len(someutf8))
	inputs = append(inputs, input{
	name: fmt.Sprintf("tail%d", len(d)),
	data: d,
	})
	}
	return inputs
	}

	func BenchmarkValid(b *testing.B) {
	for _, input := range benchmarkInputs() {
	b.Run(input.name, func(b *testing.B) {
	b.SetBytes(int64(len(input.data)))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
	utf8.Valid(input.data)
	}
	})
	}
	}
	"""


	# Run the benchmark pipeline only when executed as a script, not on import.
	if __name__ == '__main__':
	    main()