func dacPairing[HashLen](
       sigsets: ptr UncheckedArray[SignatureSet[HashLen]],
       contexts: ptr UncheckedArray[ContextMultiAggregateVerify[DST]],
       numBatches: uint32,
       batchID: uint32,
       subsetStart: uint32,
       subsetStopEx: uint32
     ): bool =
  ## Distribute pairing computation using a recursive divide-and-conquer (DAC) algorithm
  ##
  ## Returns false if at least one partial pairing `update` failed.
  ## ⚠️ : We use 0-based indexing for the tree splitting
  ##
  ##                    root at 0      root at 1
  ##   Left child       ix*2 + 1       ix*2
  ##   Right child      ix*2 + 2       ix*2 + 1
  ##   Parent           (ix-1)/2       ix/2
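  ##
  ## For example (0-based): node ix = 1 has left child 1*2 + 1 = 3,
  ## right child 1*2 + 2 = 4 and parent (1-1)/2 = 0.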
  #
  # Rationale for divide and conquer.
  # Parallel pairing computation requires the following steps:
  #
  # Assuming we have N (public key, message, signature) triplets to verify
  # on P processors/threads,
  # we want B batches with B = P,
  # each processing W work items with W = N/B or N/B + 1.
  #
  # Step 0: Initialize a context per parallel batch.
  # Step 1: Compute partial pairings, W work items per thread.
  # Step 2: Merge the B partial pairings.
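  #
  # Illustrative numbers (not from this code): N = 120 triplets on P = 8 threads
  # gives B = 8 batches of W = 15 work items each; with N = 100, four batches
  # get 13 items and four get 12.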
  #
  # For step 2 we have 2 strategies.
  # Strategy A: a simple linear merge
  # ```
  # for i in 1 ..< B:
  #   contexts[0].merge(contexts[i])
  # ```
  # which requires B operations.
  # In that case we can get away with a simple parallel for loop for step 1
  # and a serial linear merge for step 2.
  #
  # Strategy B: a divide-and-conquer algorithm.
  # We binary split the merge until we hit the base case:
  # ```
  # contexts[i].merge(contexts[i+1])
  # ```
  #
  # As pairing merge (Fp12 multiplication) is costly
  # (~10000 CPU cycles on Skylake-X with ADCX/ADOX instructions)
  # and for Ethereum we would have at least 6 sets:
  # - block proposal signatures
  # - randao reveal signatures
  # - proposer slashing signatures
  # - attester slashing signatures
  # - attestation signatures
  # - validator exit signatures
  # (not counting deposit signatures, which may be invalid),
  # the merging would be 60k cycles if linear
  # or 10k * ceil(log2(6)) = 30k cycles if divide-and-conquer on 6+ cores.
  # Note that as the tree processing progresses, fewer threads are required
  # for full parallelism, so even with less than 6 cores the speedup should be significant.
  #
  # Hence we prefer strategy B. It would also be easier to port to a simple threadpool
  # which doesn't support parallel for loops but does support plain task spawning.
  #
  # That said, a pairing is about 3400k cycles so the optimization is only noticeable
  # when we do multi-block batches;
  # for example, batching 20 blocks would require 1200k cycles for a linear merge.
  #
  # Since we use divide-and-conquer for step 2, we might as well use it for step 1 as well,
  # which would also allow us to remove the synchronization barriers between steps 1 and 2:
  # - 1 for the master thread to check ctx.update(...) results and early return
  # - 1 to ensure that the contextPtr array is fully updated before starting the merge
  #
  # Note: Skylake-X is a very recent family with bigint instructions MULX/ADCX/ADOX;
  # multiply everything by 2~3 on a Raspberry Pi
  # and scale by core frequency.
  # 3000k cycles is 1ms at 3GHz.
  # Overflow check: numBatches and so batchID*2+2 are capped by number of threads
  template left(batchID: uint32): uint32 = 2*batchID+1
  template right(batchID: uint32): uint32 = 2*batchID+2
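  #
  # Worked example: with numBatches = 6 the merge tree is
  #        0
  #      /   \
  #     1     2
  #    / \   /
  #   3   4 5
  # left(2) = 5 is a valid batch but right(2) = 6 >= numBatches,
  # so batch 2 takes the "no right subtree" branch below.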
  let mid = (subsetStopEx+subsetStart) shr 1
  if subsetStopEx-subsetStart <= 1 or batchID.left >= numBatches:
    # Can't split or no need to split anymore
    # as there is more work than hardware threads/batches
    # and it's better to accumulate as much as possible in partial pairings
    # as merge cost is non-trivial (10k to 20k cycles).
    # So we want 1 merge per thread and no more.
    let ok = accumPairingLines(sigsets, contexts,
                               batchID, subsetStart, subsetStopEx)
    if not ok:
      return false
    return true
  elif batchID.right >= numBatches:
    # No need to split, there is no right subtree
    # We directly accumulate in the current batch context
    let ok = accumPairingLines(sigsets, contexts,
                               batchID, subsetStart, mid)
    if not ok:
      return false
    return true
  else:
    # Split in half, distribute the right and work on the left
    var leftOk{.exportc.}, rightOk{.exportc.} = false
    omp_task"shared(rightOk)":
      rightOk = dacPairing(sigsets, contexts, numBatches,
                           batchID.right(), mid, subsetStopEx)
    leftOk = dacPairing(sigsets, contexts, numBatches,
                        batchID.left(), subsetStart, mid)
    # Wait for all subtrees
    omp_taskwait()
    if not leftOk or not rightOk:
      return false
    # Merge left and right
    let mergeOk = contexts[batchID.left()].merge(contexts[batchID.right()])
    if not mergeOk:
      return false
    # Update our own pairing context - 3+MB copy
    contexts[batchID] = contexts[batchID.left()]
    return true

{.pop.}
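
# Hypothetical usage sketch (not part of the gist): how a caller might spawn
# the root of the divide-and-conquer tree. It assumes OpenMP wrapper templates
# `omp_parallel` and `omp_single` (analogous to the `omp_task`/`omp_taskwait`
# used above) and reuses the same SignatureSet/ContextMultiAggregateVerify
# types, so treat it as an illustration of the intended call pattern, not as
# the library's actual API.
proc batchVerifyParallelSketch[HashLen](
       sigsets: var openArray[SignatureSet[HashLen]],
       contexts: var openArray[ContextMultiAggregateVerify[DST]]
     ): bool =
  # One pre-initialized context per parallel batch (Step 0 in the rationale)
  let numBatches = uint32(contexts.len)
  var ok = false
  omp_parallel:                 # assumed wrapper over `#pragma omp parallel`
    omp_single:                 # assumed wrapper over `#pragma omp single`
      # Root call: batchID 0 covers the whole [0, sigsets.len) range;
      # the recursion spawns OpenMP tasks for the right subtrees.
      ok = dacPairing(
        cast[ptr UncheckedArray[SignatureSet[HashLen]]](sigsets[0].addr),
        cast[ptr UncheckedArray[ContextMultiAggregateVerify[DST]]](contexts[0].addr),
        numBatches, 0'u32, 0'u32, uint32(sigsets.len))
  # On success, contexts[0] holds the merged partial pairing;
  # the final verification step would follow here.
  ok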