mratsim · September 21, 2018 17:58
diff --git a/mixin_rand.nim b/mixin_rand.nim
 # MIT License
 # Copyright (c) 2018 Mamy André-Ratsimbazafy

 ## This files gives basic tensor library functionality, because yes we can
 import strformat, macros, sequtils, random

 type
  Tensor[Rank: static[int], T] = object
    ## Tensor data structure stored on Cpu
    ##   - ``shape``: Dimensions of the tensor
    ##   - ``strides``: Numbers of items to skip to get the next item along a dimension.
    ##   - ``offset``: Offset to get the first item of the tensor. Note: offset can be negative, in particular for slices.
    ##   - ``storage``: A data storage for the tensor
    ##   - Rank is part of the type for optimization purposes
    ##
    ## Warning ⚠:
    ##   Assignment ```var a = b``` does not copy the data. Data modification on one tensor will be reflected on the other.
    ##   However modification on metadata (shape, strides or offset) will not affect the other tensor.
    shape: array[Rank, int]
    strides: array[Rank, int]
    offset: int
    storage: CpuStorage[T]

  CpuStorage*{.shallow.}[T] = object
    ## Data storage for the tensor, copies are shallow by default
    data*: seq[T]

 template tensor(result: var Tensor, shape: array) =
  result.shape = shape

  var accum = 1
  for i in countdown(Rank - 1, 0):
    result.strides[i] = accum
    accum *= shape[i]

 func newTensor*[Rank: static[int], T](shape: array[Rank, int]): Tensor[Rank, T] =
  tensor(result, shape)
  result.storage.data = newSeq[T](shape.product)

 proc randomTensor*[Rank: static[int], T](shape: array[Rank, int], max: T): Tensor[Rank, T] =
  mixin rand
  tensor(result, shape)
  result.storage.data = newSeqWith(shape.product, T(rand(max)))

 func getIndex[Rank, T](t: Tensor[Rank, T], idx: array[Rank, int]): int {.inline.} =
  ## Convert [i, j, k, l ...] to the memory location referred by the index
  result = t.offset
  for i in 0 ..< t.Rank:
    {.unroll.} # I'm sad this doesn't work yet
    result += t.strides[i] * idx[i]

 func `[]`[Rank, T](t: Tensor[Rank, T], idx: array[Rank, int]): T {.inline.}=
  ## Index tensor
  t.storage.data[t.getIndex(idx)]

 func `[]=`[Rank, T](t: var Tensor[Rank, T], idx: array[Rank, int], val: T) {.inline.}=
  ## Index tensor
  t.storage.data[t.getIndex(idx)] = val

 template `[]`[T: SomeNumber](x: T, idx: varargs[int]): T =
  ## "Index" scalars
  x

 func shape(x: SomeNumber): array[1, int] = [1]

 func bcShape[R1, R2: static[int]](x: array[R1, int]; y: array[R2, int]): auto =
  when R1 > R2:
    result = x
    for i, idx in result.mpairs:
      if idx == 1 and y[i] != 1:
        idx = y[i]
  else:
    result = y
    for i, idx in result.mpairs:
      if idx == 1 and x[i] != 1:
        idx = x[i]

 macro getBroadcastShape(x: varargs[typed]): untyped =
  assert x.len >= 2
  result = nnkDotExpr.newTree(x[0], ident"shape")
  for i in 1 ..< x.len:
    let xi = x[i]
    result = quote do: bcShape(`result`, `xi`.shape)

 func bc[R1, R2: static[int], T](t: Tensor[R1, T], shape: array[R2, int]): Tensor[R2, T] =
  ## Broadcast tensors
  result.shape = shape
  for i in 0 ..< R1:
    if t.shape[i] == 1 and shape[i] != 1:
      result.strides[i] = 0
    else:
      result.strides[i] = t.strides[i]
      if t.shape[i] != result.shape[i]:
        raise newException(ValueError, "The broadcasted size of the tensor must match existing size for non-singleton dimension")
  result.offset = t.offset
  result.storage = t.storage

 func bc[Rank; T: SomeNumber](x: T, shape: array[Rank, int]): T {.inline.}=
  ## "Broadcast" scalars
  x

 func product(x: varargs[int]): int =
  result = 1
  for val in x: result *= val

 proc replaceNodes(ast: NimNode, values: NimNode, containers: NimNode): NimNode =
  # Args:
  #   - The full syntax tree
  #   - an array of replacement value
  #   - an array of identifiers to replace
  proc inspect(node: NimNode): NimNode =
    case node.kind:
    of {nnkIdent, nnkSym}:
      for i, c in containers:
        if node.eqIdent($c):
          return values[i]
      return node
    of nnkEmpty: return node
    of nnkLiterals: return node
    else:
      var rTree = node.kind.newTree()
      for child in node:
        rTree.add inspect(child)
      return rTree
  result = inspect(ast)

 proc pop*(tree: var NimNode): NimNode =
  ## varargs[untyped] consumes all arguments so the actual value should be popped
  ## https://github.com/nim-lang/Nim/issues/5855
  result = tree[tree.len-1]
  tree.del(tree.len-1)

 func nb_elems[N: static[int], T](x: typedesc[array[N, T]]): static[int] =
  N

 macro broadcastImpl(output: untyped, inputs_body: varargs[untyped]): untyped =
  ## If output is empty node it will return a value
  ## otherwise, result will be assigned in-place to output
  let
    in_place = newLit output.kind != nnkEmpty

  var
    inputs = inputs_body
    body = inputs.pop()

  let
    shape = genSym(nskLet, "broadcast_shape__")
    coord = genSym(nskVar, "broadcast_coord__")

  var doBroadcast = newStmtList()
  var bcInputs = nnkArgList.newTree()
  for input in inputs:
    let broadcasted = genSym(nskLet, "broadcast_" & $input & "__")
    doBroadcast.add newLetStmt(
      broadcasted,
      newCall(ident"bc", input, shape)
    )
    bcInputs.add nnkBracketExpr.newTree(broadcasted, coord)

  body = body.replaceNodes(bcInputs, inputs)

  result = quote do:
    block:
      let `shape` = getBroadcastShape(`inputs`)
      const rank = `shape`.type.nb_elems
      var `coord`: array[rank, int] # Current coordinates in the n-dimensional space
      `doBroadcast`

      when not `in_place`:
        var output = newTensor[rank, type(`body`)](`shape`)
      else:
        assert `output`.shape == `shape`
      var counter = 0

      while counter < `shape`.product:
        # Assign for the current iteration
        when not `in_place`:
          output[`coord`] = `body`
        else:
          `output`[`coord`] = `body`

        # Compute the next position
        for k in countdown(rank - 1, 0):
          if `coord`[k] < `shape`[k] - 1:
            `coord`[k] += 1
            break
          else:
            `coord`[k] = 0
        inc counter

      # Now return the value
      when not `in_place`:
        output

 macro broadcast(inputs_body: varargs[untyped]): untyped =
  getAST(broadcastImpl(newEmptyNode(), inputs_body))

 macro materialize(output: var Tensor, inputs_body: varargs[untyped]): untyped =
  getAST(broadcastImpl(output, inputs_body))

 #################################################################################

 import math
 proc sanityChecks() =
  # Sanity checks

  let x = randomTensor([1, 2, 3], 10)
  let y = randomTensor([5, 2], 10)

  echo x # (shape: [1, 2, 3], strides: [6, 3, 1], offset: 0, storage: (data: @[1, 10, 5, 5, 7, 3]))
  echo y # (shape: [5, 2], strides: [2, 1], offset: 0, storage: (data: @[8, 3, 7, 9, 3, 8, 5, 3, 7, 1]))

  block: # Simple assignation
    echo "\nSimple assignation"
    let a = broadcast(x, y):
      x * y

    echo a # (shape: [5, 2, 3], strides: [6, 3, 1], offset: 0, storage: (data: @[8, 80, 40, 15, 21, 9, 7, 70, 35, 45, 63, 27, 3, 30, 15, 40, 56, 24, 5, 50, 25, 15, 21, 9, 7, 70, 35, 5, 7, 3]))

  block: # In-place, similar to Julia impl
    echo "\nIn-place, similar to Julia impl"
    var a = newTensor[3, int]([5, 2, 3])
    materialize(a, x, y):
      x * y

    echo a

  block: # Complex multi statement with type conversion
    echo "\nComplex multi statement with type conversion"
    let a = broadcast(x, y):
      let c = cos x.float64
      let s = sin y.float64

      sqrt(c.pow(2) + s.pow(2))

    echo a # (shape: [5, 2, 3], strides: [6, 3, 1], offset: 0, storage: (data: @[1.12727828058919, 1.297255090978019, 1.029220081237957, 0.3168265963213802, 0.7669963922853442, 0.9999999999999999, 0.8506221091780486, 1.065679324094626, 0.7156085706291233, 0.5003057878335346, 0.859191628789455, 1.072346394223034, 0.5584276483137685, 0.8508559734652587, 0.3168265963213802, 1.029220081237957, 1.243864280886628, 1.399612404734566, 1.100664502137075, 1.274196529364651, 1.0, 0.3168265963213802, 0.7669963922853442, 0.9999999999999999, 0.8506221091780486, 1.065679324094626, 0.7156085706291233, 0.8879964266455946, 1.129797339073468, 1.299291561428286]))

  block: # Variadic number of types with proc declaration inside
    echo "\nVariadic number of types with proc declaration inside"
    var u, v, w, x, y, z = randomTensor([3, 3], 10)

    let c = 2

    let a = broadcast(u, v, w, x, y, z):
      # ((u * v * w) div c) mod (if not zero (x - y + z) else 42)

      proc ifNotZero(val, default: int): int =
        if val == 0: default
        else: val

      let uvw_divc = u * v * w div c
      let xmypz = x - y + z

      uvw_divc mod ifNotZero(xmypz, 42)

    echo a # (shape: [3, 3], strides: [3, 1], offset: 0, storage: (data: @[0, 0, 0, 7, 4, 0, 0, 2, 0]))

 #################################################################################

 import math, random, times, stats, strformat
 proc mainBench(nb_samples: int) =
  ## Bench with standard lib
  block: # Warmup - make sure cpu is on max perf
    let start = cpuTime()
    var foo = 123
    for i in 0 ..< 100_000_000:
      foo += i*i mod 456
      foo = foo mod 789

    # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects
    let stop = cpuTime()
    echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)"

  let
    a = randomTensor([1000, 1000], 1.0)
    b = randomTensor([1000], 1.0)
    c = 1.0
  var output = newTensor[2, float64](a.shape)

  block: # Actual bench
    var stats: RunningStat
    for _ in 0 ..< nb_samples:
      let start = cpuTime()
      materialize(output, a, b, c):
        a + b - sin c
      let stop = cpuTime()
      stats.push stop - start

    echo &"Collected {stats.n} samples"
    echo &"Average broadcast time: {stats.mean * 1000 :>4.3f}ms"
    echo &"Stddev  broadcast time: {stats.standardDeviationS * 1000 :>4.3f}ms"
    echo &"Min     broadcast time: {stats.min * 1000 :>4.3f}ms"
    echo &"Max     broadcast time: {stats.max * 1000 :>4.3f}ms"
    echo "\nDisplay output[[0,0]] to make sure it's not optimized away"
    echo output[[0, 0]]

 proc geometryBench(nb_samples: int) =
  type Point3 = object
    x, y, z: float32

  proc rand(max: Point3): Point3 =
    result.x = rand max.x
    result.y = rand max.y
    result.z = rand max.z

  template liftBinary(op: untyped): untyped =
    func `op`(a, b: Point3): Point3 {.inline.}=
      result.x = `op`(a.x, b.x)
      result.y = `op`(a.y, b.y)
      result.z = `op`(a.z, b.z)
    func `op`(a: Point3, b: float32): Point3 {.inline.}=
      result.x = `op`(a.x, b)
      result.y = `op`(a.y, b)
      result.z = `op`(a.z, b)
  template liftUnary(op: untyped): untyped =
    func `op`(a: Point3): Point3 {.inline.}=
      result.x = `op`(a.x)
      result.y = `op`(a.y)
      result.z = `op`(a.z)

  liftBinary(`+`)
  liftBinary(`-`)
  liftUnary(sin)

  let
    a = randomTensor([1_000_000], Point3(x: 100, y: 100, z: 100))
    b = randomTensor([1_000_000], Point3(x: 100, y: 100, z: 100))
    c = 1.0'f32 # Julia has Point3 has float32 but C has float64
  var output = newTensor[1, Point3](a.shape)

  block: # Actual bench
    var stats: RunningStat
    for _ in 0 ..< nb_samples:
      let start = cpuTime()
      materialize(output, a, b):
        a + b - sin c
      let stop = cpuTime()
      stats.push stop - start

    echo &"Collected {stats.n} samples"
    echo &"Average broadcast time: {stats.mean * 1000 :>4.3f}ms"
    echo &"Stddev  broadcast time: {stats.standardDeviationS * 1000 :>4.3f}ms"
    echo &"Min     broadcast time: {stats.min * 1000 :>4.3f}ms"
    echo &"Max     broadcast time: {stats.max * 1000 :>4.3f}ms"
    echo "\nDisplay output[[0,0]] to make sure it's not optimized away"
    echo output[[0]]


 when isMainModule:
  sanityChecks()
  echo "\n###################"
  echo "Benchmark"
  # {.passC: "-march=native" .} # uncomment to enable full optim (AVX/AVX2, ...)
  # randomize(seed = 0)
  mainBench(1_000)
  geometryBench(1_000)

  # Compile with
  # nim c -d:release nim/nim_sol_mratsim.nim     # for binary only
  # nim c -r -d:release nim/nim_sol_mratsim.nim  # for binary + running
	# MIT License
	# Copyright (c) 2018 Mamy André-Ratsimbazafy

	## This files gives basic tensor library functionality, because yes we can
	import strformat, macros, sequtils, random

	type
	Tensor[Rank: static[int], T] = object
	## Tensor data structure stored on Cpu
	## - ``shape``: Dimensions of the tensor
	## - ``strides``: Numbers of items to skip to get the next item along a dimension.
	## - ``offset``: Offset to get the first item of the tensor. Note: offset can be negative, in particular for slices.
	## - ``storage``: A data storage for the tensor
	## - Rank is part of the type for optimization purposes
	##
	## Warning ⚠:
	## Assignment ```var a = b``` does not copy the data. Data modification on one tensor will be reflected on the other.
	## However modification on metadata (shape, strides or offset) will not affect the other tensor.
	shape: array[Rank, int]
	strides: array[Rank, int]
	offset: int
	storage: CpuStorage[T]

	CpuStorage*{.shallow.}[T] = object
	## Data storage for the tensor, copies are shallow by default
	data*: seq[T]

	template tensor(result: var Tensor, shape: array) =
	result.shape = shape

	var accum = 1
	for i in countdown(Rank - 1, 0):
	result.strides[i] = accum
	accum *= shape[i]

	func newTensor*[Rank: static[int], T](shape: array[Rank, int]): Tensor[Rank, T] =
	tensor(result, shape)
	result.storage.data = newSeq[T](shape.product)

	proc randomTensor*[Rank: static[int], T](shape: array[Rank, int], max: T): Tensor[Rank, T] =
	mixin rand
	tensor(result, shape)
	result.storage.data = newSeqWith(shape.product, T(rand(max)))

	func getIndex[Rank, T](t: Tensor[Rank, T], idx: array[Rank, int]): int {.inline.} =
	## Convert [i, j, k, l ...] to the memory location referred by the index
	result = t.offset
	for i in 0 ..< t.Rank:
	{.unroll.} # I'm sad this doesn't work yet
	result += t.strides[i] * idx[i]

	func `[]`[Rank, T](t: Tensor[Rank, T], idx: array[Rank, int]): T {.inline.}=
	## Index tensor
	t.storage.data[t.getIndex(idx)]

	func `[]=`[Rank, T](t: var Tensor[Rank, T], idx: array[Rank, int], val: T) {.inline.}=
	## Index tensor
	t.storage.data[t.getIndex(idx)] = val

	template `[]`[T: SomeNumber](x: T, idx: varargs[int]): T =
	## "Index" scalars
	x

	func shape(x: SomeNumber): array[1, int] = [1]

	func bcShape[R1, R2: static[int]](x: array[R1, int]; y: array[R2, int]): auto =
	when R1 > R2:
	result = x
	for i, idx in result.mpairs:
	if idx == 1 and y[i] != 1:
	idx = y[i]
	else:
	result = y
	for i, idx in result.mpairs:
	if idx == 1 and x[i] != 1:
	idx = x[i]

	macro getBroadcastShape(x: varargs[typed]): untyped =
	assert x.len >= 2
	result = nnkDotExpr.newTree(x[0], ident"shape")
	for i in 1 ..< x.len:
	let xi = x[i]
	result = quote do: bcShape(`result`, `xi`.shape)

	func bc[R1, R2: static[int], T](t: Tensor[R1, T], shape: array[R2, int]): Tensor[R2, T] =
	## Broadcast tensors
	result.shape = shape
	for i in 0 ..< R1:
	if t.shape[i] == 1 and shape[i] != 1:
	result.strides[i] = 0
	else:
	result.strides[i] = t.strides[i]
	if t.shape[i] != result.shape[i]:
	raise newException(ValueError, "The broadcasted size of the tensor must match existing size for non-singleton dimension")
	result.offset = t.offset
	result.storage = t.storage

	func bc[Rank; T: SomeNumber](x: T, shape: array[Rank, int]): T {.inline.}=
	## "Broadcast" scalars
	x

	func product(x: varargs[int]): int =
	result = 1
	for val in x: result *= val

	proc replaceNodes(ast: NimNode, values: NimNode, containers: NimNode): NimNode =
	# Args:
	# - The full syntax tree
	# - an array of replacement value
	# - an array of identifiers to replace
	proc inspect(node: NimNode): NimNode =
	case node.kind:
	of {nnkIdent, nnkSym}:
	for i, c in containers:
	if node.eqIdent($c):
	return values[i]
	return node
	of nnkEmpty: return node
	of nnkLiterals: return node
	else:
	var rTree = node.kind.newTree()
	for child in node:
	rTree.add inspect(child)
	return rTree
	result = inspect(ast)

	proc pop*(tree: var NimNode): NimNode =
	## varargs[untyped] consumes all arguments so the actual value should be popped
	## https://github.com/nim-lang/Nim/issues/5855
	result = tree[tree.len-1]
	tree.del(tree.len-1)

	func nb_elems[N: static[int], T](x: typedesc[array[N, T]]): static[int] =
	N

	macro broadcastImpl(output: untyped, inputs_body: varargs[untyped]): untyped =
	## If output is empty node it will return a value
	## otherwise, result will be assigned in-place to output
	let
	in_place = newLit output.kind != nnkEmpty

	var
	inputs = inputs_body
	body = inputs.pop()

	let
	shape = genSym(nskLet, "broadcast_shape__")
	coord = genSym(nskVar, "broadcast_coord__")

	var doBroadcast = newStmtList()
	var bcInputs = nnkArgList.newTree()
	for input in inputs:
	let broadcasted = genSym(nskLet, "broadcast_" & $input & "__")
	doBroadcast.add newLetStmt(
	broadcasted,
	newCall(ident"bc", input, shape)
	)
	bcInputs.add nnkBracketExpr.newTree(broadcasted, coord)

	body = body.replaceNodes(bcInputs, inputs)

	result = quote do:
	block:
	let `shape` = getBroadcastShape(`inputs`)
	const rank = `shape`.type.nb_elems
	var `coord`: array[rank, int] # Current coordinates in the n-dimensional space
	`doBroadcast`

	when not `in_place`:
	var output = newTensor[rank, type(`body`)](`shape`)
	else:
	assert `output`.shape == `shape`
	var counter = 0

	while counter < `shape`.product:
	# Assign for the current iteration
	when not `in_place`:
	output[`coord`] = `body`
	else:
	`output`[`coord`] = `body`

	# Compute the next position
	for k in countdown(rank - 1, 0):
	if `coord`[k] < `shape`[k] - 1:
	`coord`[k] += 1
	break
	else:
	`coord`[k] = 0
	inc counter

	# Now return the value
	when not `in_place`:
	output

	macro broadcast(inputs_body: varargs[untyped]): untyped =
	getAST(broadcastImpl(newEmptyNode(), inputs_body))

	macro materialize(output: var Tensor, inputs_body: varargs[untyped]): untyped =
	getAST(broadcastImpl(output, inputs_body))

	#################################################################################

	import math
	proc sanityChecks() =
	# Sanity checks

	let x = randomTensor([1, 2, 3], 10)
	let y = randomTensor([5, 2], 10)

	echo x # (shape: [1, 2, 3], strides: [6, 3, 1], offset: 0, storage: (data: @[1, 10, 5, 5, 7, 3]))
	echo y # (shape: [5, 2], strides: [2, 1], offset: 0, storage: (data: @[8, 3, 7, 9, 3, 8, 5, 3, 7, 1]))

	block: # Simple assignation
	echo "\nSimple assignation"
	let a = broadcast(x, y):
	x * y

	echo a # (shape: [5, 2, 3], strides: [6, 3, 1], offset: 0, storage: (data: @[8, 80, 40, 15, 21, 9, 7, 70, 35, 45, 63, 27, 3, 30, 15, 40, 56, 24, 5, 50, 25, 15, 21, 9, 7, 70, 35, 5, 7, 3]))

	block: # In-place, similar to Julia impl
	echo "\nIn-place, similar to Julia impl"
	var a = newTensor[3, int]([5, 2, 3])
	materialize(a, x, y):
	x * y

	echo a

	block: # Complex multi statement with type conversion
	echo "\nComplex multi statement with type conversion"
	let a = broadcast(x, y):
	let c = cos x.float64
	let s = sin y.float64

	sqrt(c.pow(2) + s.pow(2))

	echo a # (shape: [5, 2, 3], strides: [6, 3, 1], offset: 0, storage: (data: @[1.12727828058919, 1.297255090978019, 1.029220081237957, 0.3168265963213802, 0.7669963922853442, 0.9999999999999999, 0.8506221091780486, 1.065679324094626, 0.7156085706291233, 0.5003057878335346, 0.859191628789455, 1.072346394223034, 0.5584276483137685, 0.8508559734652587, 0.3168265963213802, 1.029220081237957, 1.243864280886628, 1.399612404734566, 1.100664502137075, 1.274196529364651, 1.0, 0.3168265963213802, 0.7669963922853442, 0.9999999999999999, 0.8506221091780486, 1.065679324094626, 0.7156085706291233, 0.8879964266455946, 1.129797339073468, 1.299291561428286]))

	block: # Variadic number of types with proc declaration inside
	echo "\nVariadic number of types with proc declaration inside"
	var u, v, w, x, y, z = randomTensor([3, 3], 10)

	let c = 2

	let a = broadcast(u, v, w, x, y, z):
	# ((u * v * w) div c) mod (if not zero (x - y + z) else 42)

	proc ifNotZero(val, default: int): int =
	if val == 0: default
	else: val

	let uvw_divc = u * v * w div c
	let xmypz = x - y + z

	uvw_divc mod ifNotZero(xmypz, 42)

	echo a # (shape: [3, 3], strides: [3, 1], offset: 0, storage: (data: @[0, 0, 0, 7, 4, 0, 0, 2, 0]))

	#################################################################################

	import math, random, times, stats, strformat
	proc mainBench(nb_samples: int) =
	## Bench with standard lib
	block: # Warmup - make sure cpu is on max perf
	let start = cpuTime()
	var foo = 123
	for i in 0 ..< 100_000_000:
	foo += i*i mod 456
	foo = foo mod 789

	# Compiler shouldn't optimize away the results as cpuTime rely on sideeffects
	let stop = cpuTime()
	echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)"

	let
	a = randomTensor([1000, 1000], 1.0)
	b = randomTensor([1000], 1.0)
	c = 1.0
	var output = newTensor[2, float64](a.shape)

	block: # Actual bench
	var stats: RunningStat
	for _ in 0 ..< nb_samples:
	let start = cpuTime()
	materialize(output, a, b, c):
	a + b - sin c
	let stop = cpuTime()
	stats.push stop - start

	echo &"Collected {stats.n} samples"
	echo &"Average broadcast time: {stats.mean * 1000 :>4.3f}ms"
	echo &"Stddev broadcast time: {stats.standardDeviationS * 1000 :>4.3f}ms"
	echo &"Min broadcast time: {stats.min * 1000 :>4.3f}ms"
	echo &"Max broadcast time: {stats.max * 1000 :>4.3f}ms"
	echo "\nDisplay output[[0,0]] to make sure it's not optimized away"
	echo output[[0, 0]]

	proc geometryBench(nb_samples: int) =
	type Point3 = object
	x, y, z: float32

	proc rand(max: Point3): Point3 =
	result.x = rand max.x
	result.y = rand max.y
	result.z = rand max.z

	template liftBinary(op: untyped): untyped =
	func `op`(a, b: Point3): Point3 {.inline.}=
	result.x = `op`(a.x, b.x)
	result.y = `op`(a.y, b.y)
	result.z = `op`(a.z, b.z)
	func `op`(a: Point3, b: float32): Point3 {.inline.}=
	result.x = `op`(a.x, b)
	result.y = `op`(a.y, b)
	result.z = `op`(a.z, b)
	template liftUnary(op: untyped): untyped =
	func `op`(a: Point3): Point3 {.inline.}=
	result.x = `op`(a.x)
	result.y = `op`(a.y)
	result.z = `op`(a.z)

	liftBinary(`+`)
	liftBinary(`-`)
	liftUnary(sin)

	let
	a = randomTensor([1_000_000], Point3(x: 100, y: 100, z: 100))
	b = randomTensor([1_000_000], Point3(x: 100, y: 100, z: 100))
	c = 1.0'f32 # Julia has Point3 has float32 but C has float64
	var output = newTensor[1, Point3](a.shape)

	block: # Actual bench
	var stats: RunningStat
	for _ in 0 ..< nb_samples:
	let start = cpuTime()
	materialize(output, a, b):
	a + b - sin c
	let stop = cpuTime()
	stats.push stop - start

	echo &"Collected {stats.n} samples"
	echo &"Average broadcast time: {stats.mean * 1000 :>4.3f}ms"
	echo &"Stddev broadcast time: {stats.standardDeviationS * 1000 :>4.3f}ms"
	echo &"Min broadcast time: {stats.min * 1000 :>4.3f}ms"
	echo &"Max broadcast time: {stats.max * 1000 :>4.3f}ms"
	echo "\nDisplay output[[0,0]] to make sure it's not optimized away"
	echo output[[0]]


	when isMainModule:
	sanityChecks()
	echo "\n###################"
	echo "Benchmark"
	# {.passC: "-march=native" .} # uncomment to enable full optim (AVX/AVX2, ...)
	# randomize(seed = 0)
	mainBench(1_000)
	geometryBench(1_000)

	# Compile with
	# nim c -d:release nim/nim_sol_mratsim.nim # for binary only
	# nim c -r -d:release nim/nim_sol_mratsim.nim # for binary + running