func dacPairing[HashLen](
       sigsets: ptr UncheckedArray[SignatureSet[HashLen]],
       contexts: ptr UncheckedArray[ContextMultiAggregateVerify[DST]],
       numBatches: uint32,
       batchID: uint32,
       subsetStart: uint32,
       subsetStopEx: uint32
     ): bool =
  ## Distribute pairing computation using a recursive divide-and-conquer (DAC) algorithm
  ##
  ## Returns false if at least one partial pairing `update` failed.
  ## ⚠️ : We use 0-based indexing for the tree splitting
  ##
  ##                    root at 0      root at 1
  ##   Left child       ix*2 + 1       ix*2
  ##   Right child      ix*2 + 2       ix*2 + 1
  ##   Parent           (ix-1)/2       ix/2
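  ##
  ## For example (0-based): node ix = 1 has left child 1*2 + 1 = 3,
  ## right child 1*2 + 2 = 4 and parent (1-1)/2 = 0.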
  #
  # Rationale for divide and conquer.
  # Parallel pairing computation requires the following steps:
  #
  # Assuming we have N (public key, message, signature) triplets to verify
  # on P processors/threads,
  # we want B batches with B = P,
  # each processing W work items with W = N/B or N/B + 1.
  #
  # Step 0: Initialize a context per parallel batch.
  # Step 1: Compute partial pairings, W work items per thread.
  # Step 2: Merge the B partial pairings.
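  #
  # Illustrative numbers (not from this code): N = 120 triplets on P = 8 threads
  # gives B = 8 batches of W = 15 work items each; with N = 100, four batches
  # get 13 items and four get 12.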
  #
  # For step 2 we have 2 strategies.
  # Strategy A: a simple linear merge
  # ```
  # for i in 1 ..< B:
  #   contexts[0].merge(contexts[i])
  # ```
  # which requires B operations.
  # In that case we can get away with a simple parallel for loop for step 1
  # and a serial linear merge for step 2.
  #
  # Strategy B: a divide-and-conquer algorithm.
  # We binary split the merge until we hit the base case:
  # ```
  # contexts[i].merge(contexts[i+1])
  # ```
  #
  # As pairing merge (Fp12 multiplication) is costly
  # (~10000 CPU cycles on Skylake-X with ADCX/ADOX instructions)
  # and for Ethereum we would have at least 6 sets:
  # - block proposal signatures
  # - randao reveal signatures
  # - proposer slashing signatures
  # - attester slashing signatures
  # - attestation signatures
  # - validator exit signatures
  # (not counting deposit signatures, which may be invalid),
  # the merging would be 60k cycles if linear
  # or 10k * ceil(log2(6)) = 30k cycles if divide-and-conquer on 6+ cores.
  # Note that as the tree processing progresses, fewer threads are required
  # for full parallelism, so even with less than 6 cores the speedup should be significant.
  #
  # Hence we prefer strategy B. It would also be easier to port to a simple threadpool
  # which doesn't support parallel for loops but does support plain task spawning.
  #
  # That said, a pairing is about 3400k cycles so the optimization is only noticeable
  # when we do multi-block batches;
  # for example, batching 20 blocks would require 1200k cycles for a linear merge.
  #
  # Since we use divide-and-conquer for step 2, we might as well use it for step 1 as well,
  # which would also allow us to remove the synchronization barriers between steps 1 and 2:
  # - 1 for the master thread to check ctx.update(...) results and early return
  # - 1 to ensure that the contextPtr array is fully updated before starting the merge
  #
  # Note: Skylake-X is a very recent family with bigint instructions MULX/ADCX/ADOX;
  # multiply everything by 2~3 on a Raspberry Pi
  # and scale by core frequency.
  # 3000k cycles is 1ms at 3GHz.
  # Overflow check: numBatches and so batchID*2+2 are capped by number of threads
  template left(batchID: uint32): uint32 = 2*batchID+1
  template right(batchID: uint32): uint32 = 2*batchID+2
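  #
  # Worked example: with numBatches = 6 the merge tree is
  #        0
  #      /   \
  #     1     2
  #    / \   /
  #   3   4 5
  # left(2) = 5 is a valid batch but right(2) = 6 >= numBatches,
  # so batch 2 takes the "no right subtree" branch below.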
  let mid = (subsetStopEx+subsetStart) shr 1
  if subsetStopEx-subsetStart <= 1 or batchID.left >= numBatches:
    # Can't split or no need to split anymore
    # as there is more work than hardware threads/batches
    # and it's better to accumulate as much as possible in partial pairings
    # as merge cost is non-trivial (10k to 20k cycles).
    # So we want 1 merge per thread and no more.
    let ok = accumPairingLines(sigsets, contexts,
                               batchID, subsetStart, subsetStopEx)
    if not ok:
      return false
    return true
  elif batchID.right >= numBatches:
    # No need to split, there is no right subtree
    # We directly accumulate in the current batch context
    let ok = accumPairingLines(sigsets, contexts,
                               batchID, subsetStart, mid)
    if not ok:
      return false
    return true
  else:
    # Split in half, distribute the right and work on the left
    var leftOk{.exportc.}, rightOk{.exportc.} = false
    omp_task"shared(rightOk)":
      rightOk = dacPairing(sigsets, contexts, numBatches,
                           batchID.right(), mid, subsetStopEx)
    leftOk = dacPairing(sigsets, contexts, numBatches,
                        batchID.left(), subsetStart, mid)
    # Wait for all subtrees
    omp_taskwait()
    if not leftOk or not rightOk:
      return false
    # Merge left and right
    let mergeOk = contexts[batchID.left()].merge(contexts[batchID.right()])
    if not mergeOk:
      return false
    # Update our own pairing context - 3+MB copy
    contexts[batchID] = contexts[batchID.left()]
    return true

{.pop.}
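
# Hypothetical usage sketch (not part of the gist): how a caller might spawn
# the root of the divide-and-conquer tree. It assumes OpenMP wrapper templates
# `omp_parallel` and `omp_single` (analogous to the `omp_task`/`omp_taskwait`
# used above) and reuses the same SignatureSet/ContextMultiAggregateVerify
# types, so treat it as an illustration of the intended call pattern, not as
# the library's actual API.
proc batchVerifyParallelSketch[HashLen](
       sigsets: var openArray[SignatureSet[HashLen]],
       contexts: var openArray[ContextMultiAggregateVerify[DST]]
     ): bool =
  # One pre-initialized context per parallel batch (Step 0 in the rationale)
  let numBatches = uint32(contexts.len)
  var ok = false
  omp_parallel:                 # assumed wrapper over `#pragma omp parallel`
    omp_single:                 # assumed wrapper over `#pragma omp single`
      # Root call: batchID 0 covers the whole [0, sigsets.len) range;
      # the recursion spawns OpenMP tasks for the right subtrees.
      ok = dacPairing(
        cast[ptr UncheckedArray[SignatureSet[HashLen]]](sigsets[0].addr),
        cast[ptr UncheckedArray[ContextMultiAggregateVerify[DST]]](contexts[0].addr),
        numBatches, 0'u32, 0'u32, uint32(sigsets.len))
  # On success, contexts[0] holds the merged partial pairing;
  # the final verification step would follow here.
  ok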