Skip to content

Instantly share code, notes, and snippets.

@neon-sunset
Last active August 12, 2024 12:04
Show Gist options
  • Save neon-sunset/72e6aa57c6a4c5eb0e2711e11f4f31cf to your computer and use it in GitHub Desktop.
Save neon-sunset/72e6aa57c6a4c5eb0e2711e11f4f31cf to your computer and use it in GitHub Desktop.
Comparison of Go and C# AOT in BinaryTrees microbenchmark extracted from BenchmarksGame
// The Computer Language Benchmarks Game
// https://benchmarksgame-team.pages.debian.net/benchmarksgame/
//
// based on
// - Tristan Dupont Java #7
// - Anthony Lloyd C#
// - Isaac Guoy C# #7
// - Steve (hez2010) https://github.com/dotnet/runtime/issues/101006#issue-2241502984
// contributed by Arseniy Zlobintsev
const int minDepth = 4;
var maxDepth = args.Length is 0 ? 10
: Math.Max(minDepth + 2, int.Parse(args[0]));
Console.WriteLine(
$"stretch tree of depth {maxDepth + 1}\t check: {new TreeNode(maxDepth + 1).Check()}");
var longLivedTree = new TreeNode(maxDepth);
var workers = Environment.ProcessorCount;
var results = new string[((maxDepth - minDepth) / 2) + 1];
for (var i = 0; i < results.Length; i++) {
var depth = (i * 2) + minDepth;
var n = (1 << (maxDepth - depth + minDepth)) / workers;
var checks = new int[workers];
Parallel.For(0, workers, w => {
var check = 0;
for (var i = n; i > 0; i--)
check += new TreeNode(depth).Check();
checks[w] = check;
});
results[i] = $"{n * workers}\t trees of depth {depth}\t check: {checks.Sum()}";
}
foreach (var r in results)
Console.WriteLine(r);
Console.WriteLine(
$"long lived tree of depth {maxDepth}\t check: {longLivedTree.Check()}");
readonly struct TreeNode(int depth) {
record Next(TreeNode Left, TreeNode Right);
readonly Next? next = depth > 0
? new(new(depth - 1), new(depth - 1))
: null;
public int Check() {
return next is null ? 1 : 1 + next.Left.Check() + next.Right.Check();
}
}
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net8.0</TargetFramework>
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
<PublishAot>true</PublishAot>
<InvariantGlobalization>true</InvariantGlobalization>
<ServerGarbageCollection>true</ServerGarbageCollection>
<IlcInstructionSet>native</IlcInstructionSet>
<OptimizationPreference>Speed</OptimizationPreference>
</PropertyGroup>
</Project>
module binary-trees
go 1.22.6
// The Computer Language Benchmarks Game
// http://benchmarksgame.alioth.debian.org/
//
// Go implementation of binary-trees, based on the reference implementation
// gcc #3, on Go #8 (which is based on Rust #4)
//
// Comments aim to be analogous as those in the reference implementation and are
// intentionally verbose, to help programmers unexperienced in GO to understand
// the implementation.
//
// The following alternative implementations were considered before submitting
// this code. All of them had worse readability and didn't yield better results
// on my local machine:
//
// 0. general:
// 0.1 using uint32, instead of int;
//
// 1. func Count:
// 1.1 using a local stack, instead of using a recursive implementation; the
// performance degraded, even using a pre-allocated slice as stack and
// manually handling its size;
// 1.2 assigning Left and Right to nil after counting nodes; the idea to remove
// references to instances no longer needed was to make GC easier, but this
// did not work as intended;
// 1.3 using a walker and channel, sending 1 on each node; although this looked
// idiomatic to GO, the performance suffered a lot;
// 2. func NewTree:
// 2.1 allocating all tree nodes on a tree slice upfront and making references
// to those instances, instead of allocating two sub-trees on each call;
// this did not improve performance;
//
// Contributed by Gerardo Lima
// Reviewed by Diogo Simoes
// Based on previous work from Adam Shaver, Isaac Gouy, Marcel Ibes Jeremy,
// Zerfas, Jon Harrop, Alex Mizrahi, Bruno Coutinho, ...
//
package main
import (
"flag"
"fmt"
"strconv"
"sync"
)
type Tree struct {
Left *Tree
Right *Tree
}
// Count the nodes in the given complete binary tree.
func (t *Tree) Count() int {
// Only test the Left node (this binary tree is expected to be complete).
if t.Left == nil {
return 1
}
return 1 + t.Right.Count() + t.Left.Count()
}
// Create a complete binary tree of `depth` and return it as a pointer.
func NewTree(depth int) *Tree {
if depth > 0 {
return &Tree{Left: NewTree(depth - 1), Right: NewTree(depth - 1)}
} else {
return &Tree{}
}
}
func Run(maxDepth int) {
var wg sync.WaitGroup
// Set minDepth to 4 and maxDepth to the maximum of maxDepth and minDepth +2.
const minDepth = 4
if maxDepth < minDepth+2 {
maxDepth = minDepth + 2
}
// Create an indexed string buffer for outputing the result in order.
outCurr := 0
outSize := 3 + (maxDepth-minDepth)/2
outBuff := make([]string, outSize)
// Create binary tree of depth maxDepth+1, compute its Count and set the
// first position of the outputBuffer with its statistics.
wg.Add(1)
go func() {
tree := NewTree(maxDepth + 1)
msg := fmt.Sprintf("stretch tree of depth %d\t check: %d",
maxDepth+1,
tree.Count())
outBuff[0] = msg
wg.Done()
}()
// Create a long-lived binary tree of depth maxDepth. Its statistics will be
// handled later.
var longLivedTree *Tree
wg.Add(1)
go func() {
longLivedTree = NewTree(maxDepth)
wg.Done()
}()
// Create a lot of binary trees, of depths ranging from minDepth to maxDepth,
// compute and tally up all their Count and record the statistics.
for depth := minDepth; depth <= maxDepth; depth += 2 {
iterations := 1 << (maxDepth - depth + minDepth)
outCurr++
wg.Add(1)
go func(depth, iterations, index int) {
acc := 0
for i := 0; i < iterations; i++ {
// Create a binary tree of depth and accumulate total counter with its
// node count.
a := NewTree(depth)
acc += a.Count()
}
msg := fmt.Sprintf("%d\t trees of depth %d\t check: %d",
iterations,
depth,
acc)
outBuff[index] = msg
wg.Done()
}(depth, iterations, outCurr)
}
wg.Wait()
// Compute the checksum of the long-lived binary tree that we created
// earlier and store its statistics.
msg := fmt.Sprintf("long lived tree of depth %d\t check: %d",
maxDepth,
longLivedTree.Count())
outBuff[outSize-1] = msg
// Print the statistics for all of the various tree depths.
for _, m := range outBuff {
fmt.Println(m)
}
}
func main() {
n := 0
flag.Parse()
if flag.NArg() > 0 {
n, _ = strconv.Atoi(flag.Arg(0))
}
Run(n)
}
@neon-sunset
Copy link
Author

neon-sunset commented Aug 12, 2024

Submissions

The fastest Go implementation (lowest wall-clock time) was chosen out of Binary-Trees submissions: https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/binarytrees-go-2.html

For C#, a new implementation was written because all existing submissions in BenchmarksGame are dated, do not use all cores and do not match what is considered "terse idiomatic C#" of today.

It was submitted to BenchmarksGame and is now pending review: https://salsa.debian.org/benchmarksgame-team/benchmarksgame/-/issues/539

Build

go build
dotnet publish -o .

.NET project was created from dotnet new console --aot template and the following properties were added:

<ServerGarbageCollection>true</ServerGarbageCollection>
<IlcInstructionSet>native</IlcInstructionSet>
<OptimizationPreference>Speed</OptimizationPreference>

Using first one is a standard practice in allocation-heavy code (despite server name, many throughput-sensitive client applications choose to use it, like Roslyn LSP)
Using the second two is equivalent to -O3 -march=native which is what effectively some of BenchmarksGame submissions use.
Note that on osx-arm64 target, IlcInstructionSet=native is effectively a no-op because the "base" ISA target for such platform is already equivalent to native (there may be slight caveats around ISEL but you can verify that it has no impact).

It does not seem that Go exposes something similar that would be relevant in case of macOS ARM64 target, please let me know if this is not the case.

Environment

macOS 15.0 24A5309e arm64
Apple M1 Pro (6P + 2E)

go version go1.22.6 darwin/arm64
.NET 8.0.204

Results

Each binary was ran 5 times and the fastest respective result was chosen.
GOMAXPROCS=8 was used which matches the amount of physical cores on the system. .NET threadpool by default spawns the same amount of worker threads, with the count being dynamically increased or reduced depending on utilization and worker saturation (i.e. native code sleeping a worker thread will cause the threadpool to mitigate this by injecting new worker thread(s), OS preemptive scheduling proves to be effective at load-balancing the execution fairly in the case of such oversubscription)

Go

time GOMAXPROCS=8 ./binary-trees 21 
stretch tree of depth 22         check: 8388607
2097152  trees of depth 4        check: 65011712
524288   trees of depth 6        check: 66584576
131072   trees of depth 8        check: 66977792
32768    trees of depth 10       check: 67076096
8192     trees of depth 12       check: 67100672
2048     trees of depth 14       check: 67106816
512      trees of depth 16       check: 67108352
128      trees of depth 18       check: 67108736
32       trees of depth 20       check: 67108832
long lived tree of depth 21      check: 4194303

________________________________________________________
Executed in    5.18 secs    fish           external
   usr time   37.10 secs    0.23 millis   37.10 secs
   sys time    0.90 secs    1.18 millis    0.90 secs

C#

time ./BinaryTrees 21
stretch tree of depth 22         check: 8388607
2097152  trees of depth 4        check: 65011712
524288   trees of depth 6        check: 66584576
131072   trees of depth 8        check: 66977792
32768    trees of depth 10       check: 67076096
8192     trees of depth 12       check: 67100672
2048     trees of depth 14       check: 67106816
512      trees of depth 16       check: 67108352
128      trees of depth 18       check: 67108736
32       trees of depth 20       check: 67108832
long lived tree of depth 21      check: 4194303

________________________________________________________
Executed in    1.83 secs    fish           external
   usr time    7.31 secs    0.10 millis    7.31 secs
   sys time    0.33 secs    1.26 millis    0.33 secs

Feel free to replicate the results and post them here!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment