Last active
August 16, 2022 23:15
-
-
Save rmcgibbo/7178576 to your computer and use it in GitHub Desktop.
Efficient MPI Parallel Reduction Without MPI_Reduce or MPI_Scan (only Send/Recv)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* An efficient MPI parallel reduction without MPI_Scan or MPI_Reduce. (i.e. | |
* only send/recv). | |
* | |
* TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION | |
* 0. You just DO WHAT THE FUCK YOU WANT TO. | |
*/ | |
#include <mpi.h> | |
#include <cstdio> | |
#include <vector> | |
static int fastlog2(uint32_t v); | |
int MPI_ManualReduce(int value) { | |
/* | |
* Reduces values on all processes to a single value on rank 0. | |
* | |
* This function does the same thing as the function MPI_Reduce using | |
* only MPI_Send and MPI_Recv. As shown, it operates with additions on | |
* integers, so you could trivially use MPI_Reduce, but for operations | |
* on variable size structs for which you cannot define an MPI_Datatype, | |
* you can still use this method, by modifying it to use your op | |
* and your datastructure. | |
*/ | |
int recvbuffer; | |
int tag = 0; | |
const int size = MPI::COMM_WORLD.Get_size(); | |
const int rank = MPI::COMM_WORLD.Get_rank(); | |
const int lastpower = 1 << fastlog2(size); | |
// each of the ranks greater than the last power of 2 less than size | |
// need to downshift their data, since the binary tree reduction below | |
// only works when N is a power of two. | |
for (int i = lastpower; i < size; i++) | |
if (rank == i) | |
MPI::COMM_WORLD.Send(&value, 1, MPI_INT, i-lastpower, tag); | |
for (int i = 0; i < size-lastpower; i++) | |
if (rank == i) { | |
MPI::COMM_WORLD.Recv(&recvbuffer, 1, MPI_INT, i+lastpower, tag); | |
value += recvbuffer; // your operation | |
} | |
for (int d = 0; d < fastlog2(lastpower); d++) | |
for (int k = 0; k < lastpower; k += 1 << (d + 1)) { | |
const int receiver = k; | |
const int sender = k + (1 << d); | |
if (rank == receiver) { | |
MPI::COMM_WORLD.Recv(&recvbuffer, 1, MPI_INT, sender, tag); | |
value += recvbuffer; // your operation | |
} | |
else if (rank == sender) | |
MPI::COMM_WORLD.Send(&value, 1, MPI_INT, receiver, tag); | |
} | |
return value; | |
} | |
static int fastlog2(uint32_t v) { | |
// http://graphics.stanford.edu/~seander/bithacks.html | |
int r; | |
static const int MultiplyDeBruijnBitPosition[32] = | |
{ | |
0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, | |
8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 | |
}; | |
v |= v >> 1; // first round down to one less than a power of 2 | |
v |= v >> 2; | |
v |= v >> 4; | |
v |= v >> 8; | |
v |= v >> 16; | |
r = MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27]; | |
return r; | |
} | |
int main(int argc, char* argv[]) { | |
MPI::Init(argc, argv); | |
const int size = MPI::COMM_WORLD.Get_size(); | |
const int rank = MPI::COMM_WORLD.Get_rank(); | |
std::vector<int> data(size); | |
for (int i = 0; i < size; i++) | |
data[i] = i; | |
// each rank only gets one entry, and | |
// they need to sum them by sending messages | |
int result = MPI_ManualReduce(data[rank]); | |
MPI::COMM_WORLD.Barrier(); | |
// Compute the correct result | |
int sum = 0; | |
for (int i = 0; i < size; i++) | |
sum += data[i]; | |
if (rank == 0) { | |
printf("MPI Result = %d\n", result); | |
printf("Correct Result = %d\n", sum); | |
} | |
MPI::Finalize(); | |
} |
Author
rmcgibbo
commented
Oct 27, 2013
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment