mratsim · August 23, 2024 16:08
diff --git a/demotomohiro_gcd.nim b/demotomohiro_gcd.nim
 import std/[cmdline, times, monotimes, random, sugar, os, typetraits]

 const
  EnableTests = false
  EnableCount = not EnableTests and false
  EnableBench = not EnableTests

 template bench(repeat: int; init, body: untyped): untyped =
  var minTime = initDuration(days = 2)
  for i in 1 .. repeat:
    init
    let start = getMonoTime()
    body
    let finish = getMonoTime()
    minTime = min(minTime, finish - start)
    # Prevent thermal throttling.
    sleep(20)
  echo minTime.inMicroseconds, " micro second"
  sleep(2000)

 when EnableCount:
  var gcdCount = 0

 proc gcd*[T](x, y: T): T =
  # gcd from math module in Nim's stdlib.

  var (x, y) = (x, y)
  while y != 0:
    x = x mod y
    swap x, y
    when EnableCount:
      inc gcdCount
  abs x

 when EnableCount:
  var gcdLARCount = 0

 proc gcdLAR*[T: SomeSignedInt](x, y: T): T =
  # https://en.wikipedia.org/wiki/Euclidean_algorithm#Method_of_least_absolute_remainders
  # Reduce the number of iterations by using smaller absolute remainders

  # x and y are always positive
  var (x, y) = (x.abs, y.abs)
  while y != 0:
    #debugEcho x, " mod ", y
    x = x mod y
    if x > (y shr 1):
      # As y is a multiple of the gcd, you can add or subtract y to x
      # You can multiply -1 to x as -x is still multiple of the gcd.
      x = y - x
    swap x, y
    when EnableCount:
      inc gcdLARCount
  abs x

 when EnableCount:
  var gcdLAR2Count = 0

 proc gcdLAR2*[T: SomeSignedInt](x, y: T): T =
  # gcdLAR with many branches.

  var (x, y) = (x, y)
  # x and y can become negative
  while y != 0:
    #debugEcho x, " mod ", y
    if x > 0:
      x = x mod y
      if y > 0:
        if x > y div 2:
          dec x, y
      else:
        if x > -y div 2:
          inc x, y
    else:
      x = x mod y
      if y > 0:
        if x < -y div 2:
          inc x, y
      else:
        if x < y div 2:
          dec x, y
    swap x, y
    when EnableCount:
      inc gcdLAR2Count
  abs x

 when EnableCount:
  var gcdLAR3Count = 0

 proc gcdLAR3*[T: SomeInteger](x, y: T): T =
  type UT = T.toUnsigned
  # x and y are always positive
  var (x, y) = (x.abs, y.abs)
  while y != 0:
    # Make x mod y returns -y/2 < r <= y/2
    let z = (y - 1) div 2
    # x + z can overflow. So convert them to unsigned.
    x = abs(((x.UT + z.UT) mod y.UT).int - z)
    swap x, y
    when EnableCount:
      inc gcdLAR3Count
  abs x

 when EnableCount:
  var gcdLAR4Count = 0

 proc gcdLAR4*[T](x, y: T): T =
  # x and y are always positive
  var (x, y) = (x.abs, y.abs)
  while y != 0:
    #debugEcho x, " mod ", y
    x = x mod y
    # As y is a multiple of the gcd, you can add or subtract y to x
    # You can multiply -1 to x as -x is still multiple of the gcd.
    x = min(x, y - x)
    swap x, y
    when EnableCount:
      inc gcdLAR4Count
  abs x

 import std/bitops

 when EnableCount:
  var gcdSubCount = 0

 proc gcdSub*[T: SomeInteger](x, y: T): T =
  # Same to gcd in https://github.com/nim-lang/bigints
  var
    (x, y) = (x.abs, y.abs)
  let
    i = countTrailingZeroBits(x)
    j = countTrailingZeroBits(y)
  let lg2 = min(i, j)
  x = x shr i
  y = y shr j
  # Both x and y are odd now

  while true:
    if y > x:
      swap x, y

    x -= y
    if x == 0:
      return y shl lg2
    # odd - odd = even
    x = x shr countTrailingZeroBits(x)

    when EnableCount:
      inc gcdSubCount

 when EnableCount:
  var gcdSub2Count = 0

 proc gcdSub2*[T: SomeInteger](x, y: T): T =
  # Same to gcdSub excepts using `mod` instead of subtraction
  # to reduce the number of iterations.
  var
    (x, y) = (x.abs, y.abs)
  let
    i = countTrailingZeroBits(x)
    j = countTrailingZeroBits(y)
  let lg2 = min(i, j)
  x = x shr i
  y = y shr j
  if y > x:
    swap x, y

  while true:
    x = x mod y
    if x == 0:
      return y shl lg2
    x = x shr countTrailingZeroBits(x)
    swap x, y

    when EnableCount:
      inc gcdSub2Count

 func gcd_impulse(u, v: int): int =
  # From https://github.com/SciNim/impulse/blob/49b813232507470a047727712acda105b84c7815/impulse/fft/factorization.nim#L23-L39
  if u == 0: return v
  if v == 0: return u
  let shift = countTrailingZeroBits(u or v)
  var u = u shr u.countTrailingZeroBits()
  var v = v
  while true:
    v = v shr v.countTrailingZeroBits()
    if u > v:
      swap(u, v)
    v = v - u
    if v == 0:
      return u shl shift


 when EnableTests:
  template test(gcdProc: typed) =
    doAssert gcdProc(1, 1) == 1
    doAssert gcdProc(2, 1) == 1
    doAssert gcdProc(3, 1) == 1
    doAssert gcdProc(2, 2) == 2
    doAssert gcdProc(3, 2) == 1
    doAssert gcdProc(4, 2) == 2
    doAssert gcdProc(5, 2) == 1
    doAssert gcdProc(3, 3) == 3
    doAssert gcdProc(4, 3) == 1
    doAssert gcdProc(5, 3) == 1
    doAssert gcdProc(6, 3) == 3
    doAssert gcdProc(10, 6) == 2
    doAssert gcdProc(60, 33) == 3
    doAssert gcdProc(121, 11) == 11
    doAssert gcdProc(1024 * 13, 1024 * 17) == 1024
    doAssert gcdProc(122, 11) == 1
    doAssert gcdProc(104, 66) == 2
    doAssert gcdProc(8191 * 2, 3 * 2) == 2
    doAssert gcdProc(8191 * 7, 15 * 7) == 7
    doAssert gcdProc(131071, 8191) == 1
    doAssert gcdProc(131071 * 5 * 3 * 3, 127 * 11 * 7 * 3) == 3
    doAssert gcdProc(131071 * 11 * 7 * 5 * 2, 127 * 19 * 17 * 13 * 2) == 2
    doAssert gcdProc(131071 * 11 * 7 * 5 * 7, 127 * 19 * 17 * 13 * 7) == 7
    doAssert gcdProc(131071 * 11 * 7 * 5 * 8191, 127 * 19 * 17 * 13 * 8191) == 8191
    doAssert gcdProc(int.high, int.high) == int.high
    doAssert gcdProc(int.high, int.high - 1) == 1
    doAssert gcdProc(int.high - 1, int.high - 3) == 2
    doAssert gcdProc(int.high, int.high div 2) == 1

  test(gcdLAR)
  test(gcdLAR2)
  test(gcdLAR3)
  test(gcdLAR4)
  test(gcdSub)
  test(gcdSub2)
  test(gcd_impulse)

 when EnableBench:
  proc runBench =
    let seed = paramCount() or 123
    var
      rand: Rand
      res0 = 0

    stdout.write "gcd in stdlib: "
    bench(101):
      rand = seed.initRand()
    do:
      for i in 0 .. 10000:
        var (x, y) = (rand.rand(1 .. int.high), rand.rand(1 .. int.high))
        if x < y:
          swap x, y
        res0 = res0 xor gcd(x, y)

    var res1 = 0
    stdout.write "gcdLAR:        "
    bench(101):
      rand = seed.initRand()
    do:
      for i in 0 .. 10000:
        var (x, y) = (rand.rand(1 .. int.high), rand.rand(1 .. int.high))
        if x < y:
          swap x, y
        res1 = res1 xor gcdLAR(x, y)

    var res2 = 0
    stdout.write "gcdLAR2:       "
    bench(101):
      rand = seed.initRand()
    do:
      for i in 0 .. 10000:
        var (x, y) = (rand.rand(1 .. int.high), rand.rand(1 .. int.high))
        if x < y:
          swap x, y
        res2 = res2 xor gcdLAR2(x, y)

    var res3 = 0
    stdout.write "gcdLAR3:       "
    bench(101):
      rand = seed.initRand()
    do:
      for i in 0 .. 10000:
        var (x, y) = (rand.rand(1 .. int.high), rand.rand(1 .. int.high))
        if x < y:
          swap x, y
        res3 = res3 xor gcdLAR3(x, y)

    var res4 = 0
    stdout.write "gcdLAR4:       "
    bench(101):
      rand = seed.initRand()
    do:
      for i in 0 .. 10000:
        var (x, y) = (rand.rand(1 .. int.high), rand.rand(1 .. int.high))
        if x < y:
          swap x, y
        res4 = res4 xor gcdLAR4(x, y)

    var res5 = 0
    stdout.write "gcdSub:        "
    bench(101):
      rand = seed.initRand()
    do:
      for i in 0 .. 10000:
        var (x, y) = (rand.rand(1 .. int.high), rand.rand(1 .. int.high))
        if x < y:
          swap x, y
        res5 = res5 xor gcdSub(x, y)

    var res6 = 0
    stdout.write "gcdSub2:       "
    bench(101):
      rand = seed.initRand()
    do:
      for i in 0 .. 10000:
        var (x, y) = (rand.rand(1 .. int.high), rand.rand(1 .. int.high))
        if x < y:
          swap x, y
        res6 = res6 xor gcdSub2(x, y)

    var res7 = 0
    stdout.write "gcd_impulse:       "
    bench(101):
      rand = seed.initRand()
    do:
      for i in 0 .. 10000:
        var (x, y) = (rand.rand(1 .. int.high), rand.rand(1 .. int.high))
        if x < y:
          swap x, y
        res7 = res7 xor gcd_impulse(x, y)

    when EnableCount:
      dump gcdCount
      dump gcdLARCount
      dump gcdLAR2Count
      dump gcdLAR3Count
      dump gcdLAR4Count
      dump gcdSubCount
      dump gcdSub2Count
      dump gcd_impulse
    echo res0, res1, res2, res3, res4, res5, res6, res7
    doAssert res0 == res1 and res0 == res2 and res0 == res3 and res0 == res4 and res0 == res5 and res0 == res6 and res0 == res7

  runBench()
	import std/[cmdline, times, monotimes, random, sugar, os, typetraits]

	const
	EnableTests = false
	EnableCount = not EnableTests and false
	EnableBench = not EnableTests

	template bench(repeat: int; init, body: untyped): untyped =
	var minTime = initDuration(days = 2)
	for i in 1 .. repeat:
	init
	let start = getMonoTime()
	body
	let finish = getMonoTime()
	minTime = min(minTime, finish - start)
	# Prevent thermal throttling.
	sleep(20)
	echo minTime.inMicroseconds, " micro second"
	sleep(2000)

	when EnableCount:
	var gcdCount = 0

	proc gcd*[T](x, y: T): T =
	# gcd from math module in Nim's stdlib.

	var (x, y) = (x, y)
	while y != 0:
	x = x mod y
	swap x, y
	when EnableCount:
	inc gcdCount
	abs x

	when EnableCount:
	var gcdLARCount = 0

	proc gcdLAR*[T: SomeSignedInt](x, y: T): T =
	# https://en.wikipedia.org/wiki/Euclidean_algorithm#Method_of_least_absolute_remainders
	# Reduce the number of iterations by using smaller absolute remainders

	# x and y are always positive
	var (x, y) = (x.abs, y.abs)
	while y != 0:
	#debugEcho x, " mod ", y
	x = x mod y
	if x > (y shr 1):
	# As y is a multiple of the gcd, you can add or subtract y to x
	# You can multiply -1 to x as -x is still multiple of the gcd.
	x = y - x
	swap x, y
	when EnableCount:
	inc gcdLARCount
	abs x

	when EnableCount:
	var gcdLAR2Count = 0

	proc gcdLAR2*[T: SomeSignedInt](x, y: T): T =
	# gcdLAR with many branches.

	var (x, y) = (x, y)
	# x and y can become negative
	while y != 0:
	#debugEcho x, " mod ", y
	if x > 0:
	x = x mod y
	if y > 0:
	if x > y div 2:
	dec x, y
	else:
	if x > -y div 2:
	inc x, y
	else:
	x = x mod y
	if y > 0:
	if x < -y div 2:
	inc x, y
	else:
	if x < y div 2:
	dec x, y
	swap x, y
	when EnableCount:
	inc gcdLAR2Count
	abs x

	when EnableCount:
	var gcdLAR3Count = 0

	proc gcdLAR3*[T: SomeInteger](x, y: T): T =
	type UT = T.toUnsigned
	# x and y are always positive
	var (x, y) = (x.abs, y.abs)
	while y != 0:
	# Make x mod y returns -y/2 < r <= y/2
	let z = (y - 1) div 2
	# x + z can overflow. So convert them to unsigned.
	x = abs(((x.UT + z.UT) mod y.UT).int - z)
	swap x, y
	when EnableCount:
	inc gcdLAR3Count
	abs x

	when EnableCount:
	var gcdLAR4Count = 0

	proc gcdLAR4*[T](x, y: T): T =
	# x and y are always positive
	var (x, y) = (x.abs, y.abs)
	while y != 0:
	#debugEcho x, " mod ", y
	x = x mod y
	# As y is a multiple of the gcd, you can add or subtract y to x
	# You can multiply -1 to x as -x is still multiple of the gcd.
	x = min(x, y - x)
	swap x, y
	when EnableCount:
	inc gcdLAR4Count
	abs x

	import std/bitops

	when EnableCount:
	var gcdSubCount = 0

	proc gcdSub*[T: SomeInteger](x, y: T): T =
	# Same to gcd in https://github.com/nim-lang/bigints
	var
	(x, y) = (x.abs, y.abs)
	let
	i = countTrailingZeroBits(x)
	j = countTrailingZeroBits(y)
	let lg2 = min(i, j)
	x = x shr i
	y = y shr j
	# Both x and y are odd now

	while true:
	if y > x:
	swap x, y

	x -= y
	if x == 0:
	return y shl lg2
	# odd - odd = even
	x = x shr countTrailingZeroBits(x)

	when EnableCount:
	inc gcdSubCount

	when EnableCount:
	var gcdSub2Count = 0

	proc gcdSub2*[T: SomeInteger](x, y: T): T =
	# Same to gcdSub excepts using `mod` instead of subtraction
	# to reduce the number of iterations.
	var
	(x, y) = (x.abs, y.abs)
	let
	i = countTrailingZeroBits(x)
	j = countTrailingZeroBits(y)
	let lg2 = min(i, j)
	x = x shr i
	y = y shr j
	if y > x:
	swap x, y

	while true:
	x = x mod y
	if x == 0:
	return y shl lg2
	x = x shr countTrailingZeroBits(x)
	swap x, y

	when EnableCount:
	inc gcdSub2Count

	func gcd_impulse(u, v: int): int =
	# From https://github.com/SciNim/impulse/blob/49b813232507470a047727712acda105b84c7815/impulse/fft/factorization.nim#L23-L39
	if u == 0: return v
	if v == 0: return u
	let shift = countTrailingZeroBits(u or v)
	var u = u shr u.countTrailingZeroBits()
	var v = v
	while true:
	v = v shr v.countTrailingZeroBits()
	if u > v:
	swap(u, v)
	v = v - u
	if v == 0:
	return u shl shift


	when EnableTests:
	template test(gcdProc: typed) =
	doAssert gcdProc(1, 1) == 1
	doAssert gcdProc(2, 1) == 1
	doAssert gcdProc(3, 1) == 1
	doAssert gcdProc(2, 2) == 2
	doAssert gcdProc(3, 2) == 1
	doAssert gcdProc(4, 2) == 2
	doAssert gcdProc(5, 2) == 1
	doAssert gcdProc(3, 3) == 3
	doAssert gcdProc(4, 3) == 1
	doAssert gcdProc(5, 3) == 1
	doAssert gcdProc(6, 3) == 3
	doAssert gcdProc(10, 6) == 2
	doAssert gcdProc(60, 33) == 3
	doAssert gcdProc(121, 11) == 11
	doAssert gcdProc(1024 * 13, 1024 * 17) == 1024
	doAssert gcdProc(122, 11) == 1
	doAssert gcdProc(104, 66) == 2
	doAssert gcdProc(8191 * 2, 3 * 2) == 2
	doAssert gcdProc(8191 * 7, 15 * 7) == 7
	doAssert gcdProc(131071, 8191) == 1
	doAssert gcdProc(131071 * 5 * 3 * 3, 127 * 11 * 7 * 3) == 3
	doAssert gcdProc(131071 * 11 * 7 * 5 * 2, 127 * 19 * 17 * 13 * 2) == 2
	doAssert gcdProc(131071 * 11 * 7 * 5 * 7, 127 * 19 * 17 * 13 * 7) == 7
	doAssert gcdProc(131071 * 11 * 7 * 5 * 8191, 127 * 19 * 17 * 13 * 8191) == 8191
	doAssert gcdProc(int.high, int.high) == int.high
	doAssert gcdProc(int.high, int.high - 1) == 1
	doAssert gcdProc(int.high - 1, int.high - 3) == 2
	doAssert gcdProc(int.high, int.high div 2) == 1

	test(gcdLAR)
	test(gcdLAR2)
	test(gcdLAR3)
	test(gcdLAR4)
	test(gcdSub)
	test(gcdSub2)
	test(gcd_impulse)

	when EnableBench:
	proc runBench =
	let seed = paramCount() or 123
	var
	rand: Rand
	res0 = 0

	stdout.write "gcd in stdlib: "
	bench(101):
	rand = seed.initRand()
	do:
	for i in 0 .. 10000:
	var (x, y) = (rand.rand(1 .. int.high), rand.rand(1 .. int.high))
	if x < y:
	swap x, y
	res0 = res0 xor gcd(x, y)

	var res1 = 0
	stdout.write "gcdLAR: "
	bench(101):
	rand = seed.initRand()
	do:
	for i in 0 .. 10000:
	var (x, y) = (rand.rand(1 .. int.high), rand.rand(1 .. int.high))
	if x < y:
	swap x, y
	res1 = res1 xor gcdLAR(x, y)

	var res2 = 0
	stdout.write "gcdLAR2: "
	bench(101):
	rand = seed.initRand()
	do:
	for i in 0 .. 10000:
	var (x, y) = (rand.rand(1 .. int.high), rand.rand(1 .. int.high))
	if x < y:
	swap x, y
	res2 = res2 xor gcdLAR2(x, y)

	var res3 = 0
	stdout.write "gcdLAR3: "
	bench(101):
	rand = seed.initRand()
	do:
	for i in 0 .. 10000:
	var (x, y) = (rand.rand(1 .. int.high), rand.rand(1 .. int.high))
	if x < y:
	swap x, y
	res3 = res3 xor gcdLAR3(x, y)

	var res4 = 0
	stdout.write "gcdLAR4: "
	bench(101):
	rand = seed.initRand()
	do:
	for i in 0 .. 10000:
	var (x, y) = (rand.rand(1 .. int.high), rand.rand(1 .. int.high))
	if x < y:
	swap x, y
	res4 = res4 xor gcdLAR4(x, y)

	var res5 = 0
	stdout.write "gcdSub: "
	bench(101):
	rand = seed.initRand()
	do:
	for i in 0 .. 10000:
	var (x, y) = (rand.rand(1 .. int.high), rand.rand(1 .. int.high))
	if x < y:
	swap x, y
	res5 = res5 xor gcdSub(x, y)

	var res6 = 0
	stdout.write "gcdSub2: "
	bench(101):
	rand = seed.initRand()
	do:
	for i in 0 .. 10000:
	var (x, y) = (rand.rand(1 .. int.high), rand.rand(1 .. int.high))
	if x < y:
	swap x, y
	res6 = res6 xor gcdSub2(x, y)

	var res7 = 0
	stdout.write "gcd_impulse: "
	bench(101):
	rand = seed.initRand()
	do:
	for i in 0 .. 10000:
	var (x, y) = (rand.rand(1 .. int.high), rand.rand(1 .. int.high))
	if x < y:
	swap x, y
	res7 = res7 xor gcd_impulse(x, y)

	when EnableCount:
	dump gcdCount
	dump gcdLARCount
	dump gcdLAR2Count
	dump gcdLAR3Count
	dump gcdLAR4Count
	dump gcdSubCount
	dump gcdSub2Count
	dump gcd_impulse
	echo res0, res1, res2, res3, res4, res5, res6, res7
	doAssert res0 == res1 and res0 == res2 and res0 == res3 and res0 == res4 and res0 == res5 and res0 == res6 and res0 == res7

	runBench()