Blocked Floyd–Warshall algorithm implemented with CUDA
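The kernels in the file below implement the standard three-phase blocked (tiled) Floyd–Warshall scheme: for each pivot block on the diagonal, phase 1 relaxes the pivot block itself, phase 2 relaxes the blocks in the pivot's row and column, and phase 3 relaxes all remaining blocks using the freshly updated pivot-row and pivot-column tiles. For orientation, here is a minimal single-threaded host sketch of the same recurrence; the function and variable names are illustrative, it is not part of the gist, and it ignores the file's block-major memory layout.

    // Single-threaded reference for the three-phase blocked Floyd-Warshall recurrence.
    // `dist` is an n x n adjacency matrix in row-major order, B is the blocking factor.
    #include <algorithm>
    #include <vector>

    void blockedFloydWarshallReference(std::vector<int> &dist, int n, int B)
    {
        // Relax dist[i][j] over intermediate vertices k taken from the pivot block.
        auto relaxTile = [&](int pivot, int rowBlk, int colBlk) {
            int kBegin = pivot * B, kEnd = std::min(n, kBegin + B);
            int iBegin = rowBlk * B, iEnd = std::min(n, iBegin + B);
            int jBegin = colBlk * B, jEnd = std::min(n, jBegin + B);
            for (int k = kBegin; k < kEnd; k++)       // k outermost: required for the in-tile dependency
                for (int i = iBegin; i < iEnd; i++)
                    for (int j = jBegin; j < jEnd; j++)
                        dist[i * n + j] = std::min(dist[i * n + j], dist[i * n + k] + dist[k * n + j]);
        };
        int blkNum = (n + B - 1) / B;
        for (int pivot = 0; pivot < blkNum; pivot++)
        {
            relaxTile(pivot, pivot, pivot);           // phase 1: the pivot (diagonal) block
            for (int b = 0; b < blkNum; b++)
                if (b != pivot)
                {
                    relaxTile(pivot, pivot, b);       // phase 2: pivot row
                    relaxTile(pivot, b, pivot);       // phase 2: pivot column
                }
            for (int r = 0; r < blkNum; r++)
                for (int c = 0; c < blkNum; c++)
                    if (r != pivot && c != pivot)
                        relaxTile(pivot, r, c);       // phase 3: everything else
        }
    }

The gist file itself follows.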
#include <cstdio>
#include <cstdlib>
#include <cstring> // for memset
#include <cassert>
#include <cuda_runtime.h>
#ifdef DUAL_GPU
#include <omp.h>
#endif

#ifndef NDEBUG
#define CHKERR(call) assert((call) == cudaSuccess) // NDEBUG is not defined, so the assert is actually evaluated
#else
#define CHKERR(call) (call)
#endif

#ifdef DUAL_GPU
#define GPU_NUM 2
#define STREAM_NUM 4
#else
#define GPU_NUM 1
#define STREAM_NUM 1
#endif

inline void fastRead(FILE *f, int &x)
{
    char c;
    x = 0;
    do c = fgetc(f); while (c < '0' || c > '9');
    do x = x * 10 + c - '0', c = fgetc(f); while (c >= '0' && c <= '9');
}
/** Part of the graph
 * This class is only a memory address converter; it does not own the matrix storage.
 */
class Block
{
public:
    __host__ __device__ Block(int *_addr, int _height, int _width)
        : addr(_addr), height(_height), width(_width) {}

    __host__ __device__ int &operator()(int i, int j)
    {
        assert(i >= 0 && i < height);
        assert(j >= 0 && j < width);
        return *(addr + i * width + j);
    }

    int count() const
    {
        return height * width;
    }

    int *const addr;
    const int height, width;
};
/** A full graph
 * This class is only a memory address converter; it does not own the matrix storage.
 */
class Graph
{
public:
    Graph()
        : addr(NULL), n(0), blkFactor(0), blkNum(0) {}
    Graph(int *_addr, int _n, int _blkFactor)
        : addr(_addr), n(_n), blkFactor(_blkFactor), blkNum(n / blkFactor + (n % blkFactor > 0)) {}

    __host__ __device__ Block operator()(int i, int j)
    {
        assert(i >= 0 && i < blkNum);
        assert(j >= 0 && j < blkNum);
        // DO NOT delete these two assertions to allow open-interval (past-the-end) use:
        // this function does not guarantee correctness when the indices are out of range
        int height = min(blkFactor, n - i * blkFactor);
        int width = min(blkFactor, n - j * blkFactor);
        return Block(addr + (i * n + j * height) * blkFactor, height, width);
    }

    __host__ __device__ int &edge(int i, int j)
    {
        assert(i >= 0 && i < n);
        assert(j >= 0 && j < n);
        return (*this)(i / blkFactor, j / blkFactor)(i % blkFactor, j % blkFactor);
    }

    void copyToDevice(const Graph &device, cudaStream_t stream)
    {
        CHKERR(cudaMemcpyAsync(device.addr, addr, n * n * sizeof(int), cudaMemcpyHostToDevice, stream));
    }

    void copyToHost(const Graph &host)
    {
        CHKERR(cudaMemcpy(host.addr, addr, n * n * sizeof(int), cudaMemcpyDeviceToHost));
    }

private:
    int *addr; // cannot be const because this object should be copyable

public:
    int n, blkFactor, blkNum;
};
extern __shared__ int sharedMem[];

// Let each thread process up to 4 elements: evaluate `expr` for id = begin, begin + span,
// begin + 2 * span and begin + 3 * span, stopping as soon as id reaches `bound`
#define quad(id, begin, span, bound, expr) \
    if ((id = (begin)) < bound) { \
        expr; \
        if ((id = (begin) + (span)) < bound) { \
            expr; \
            if ((id = (begin) + 2 * (span)) < bound) { \
                expr; \
                if ((id = (begin) + 3 * (span)) < bound) { \
                    expr; \
                } \
            } \
        } \
    }
// Phase 1: run Floyd-Warshall inside the pivot (diagonal) block, entirely in shared memory
__global__ void phase1Kernel(Graph graph, int pivotId)
{
    Block globalBlk(graph(pivotId, pivotId));
    Block sharedBlk(sharedMem, globalBlk.height, globalBlk.width);
    const int tid = threadIdx.x, span = blockDim.x, bound = sharedBlk.height * sharedBlk.width;
    int id; // placeholder
    quad(id, tid, span, bound, *(sharedBlk.addr + id) = *(globalBlk.addr + id));
    __syncthreads();
    assert(sharedBlk.width == sharedBlk.height);
#pragma unroll 64
    for (int k = 0; k < sharedBlk.width; k++)
    {
        int i, j;
        quad(id, tid, span, bound,
            i = id / sharedBlk.width; j = id % sharedBlk.width; // can't use a comma here because of the macro
            sharedBlk(i, j) = min(sharedBlk(i, j), sharedBlk(i, k) + sharedBlk(k, j))
        );
        __syncthreads();
    }
    quad(id, tid, span, bound, *(globalBlk.addr + id) = *(sharedBlk.addr + id));
}
// Phase 2: update the blocks in the pivot row and pivot column, using the already-updated pivot block
__global__ void phase2Kernel(Graph graph, int pivotId, int offset)
{
    Block globalPivotBlk(graph(pivotId, pivotId));
    int column = blockIdx.x; // 0 = row, 1 = column
    assert(column == 0 || column == 1);
    int vassalId = blockIdx.y + (blockIdx.y >= pivotId);
    Block globalVassalBlk = column ? graph(vassalId, pivotId) : graph(pivotId, vassalId);
    Block sharedPivotBlk(sharedMem, globalPivotBlk.height, globalPivotBlk.width);
    Block sharedVassalBlk(sharedMem + sharedPivotBlk.width * sharedPivotBlk.height, globalVassalBlk.height, globalVassalBlk.width);
    const int tid = threadIdx.x, span = blockDim.x;
    int id; // placeholder
    quad(id, tid, span, sharedPivotBlk.height * sharedPivotBlk.width, *(sharedPivotBlk.addr + id) = *(globalPivotBlk.addr + id));
    quad(id, tid, span, sharedVassalBlk.height * sharedVassalBlk.width, *(sharedVassalBlk.addr + id) = *(globalVassalBlk.addr + id));
    __syncthreads();
    assert(sharedPivotBlk.width == sharedPivotBlk.height);
#pragma unroll 64
    for (int k = 0; k < sharedPivotBlk.width; k++)
    {
        int i, j;
        quad(id, tid, span, sharedVassalBlk.height * sharedVassalBlk.width,
            i = id / sharedVassalBlk.width; j = id % sharedVassalBlk.width;
            sharedVassalBlk(i, j) = min(
                sharedVassalBlk(i, j),
                column ? sharedVassalBlk(i, k) + sharedPivotBlk(k, j) : sharedPivotBlk(i, k) + sharedVassalBlk(k, j)
            );
        );
        __syncthreads();
    }
    quad(id, tid, span, sharedVassalBlk.height * sharedVassalBlk.width, *(globalVassalBlk.addr + id) = *(sharedVassalBlk.addr + id));
}
// Phase 3: update all remaining blocks, using the updated pivot-row and pivot-column blocks
__global__ void phase3Kernel(Graph graph, int pivotId, int offset)
{
    const int rowId = blockIdx.x + offset + (blockIdx.x + offset >= pivotId);
    Block globalLRBlk = graph(rowId, pivotId);
    Block sharedLRBlk(sharedMem, globalLRBlk.height, globalLRBlk.width);
    const int tid = threadIdx.x, span = blockDim.x;
    int id; // placeholder
    quad(id, tid, span, sharedLRBlk.height * sharedLRBlk.width, *(sharedLRBlk.addr + id) = *(globalLRBlk.addr + id));
    for (int colId = !pivotId; colId < graph.blkNum; colId++, colId += (colId == pivotId))
    {
        Block globalUDBlk = graph(pivotId, colId);
        Block sharedUDBlk(sharedMem + sharedLRBlk.width * sharedLRBlk.height, globalUDBlk.height, globalUDBlk.width);
        quad(id, tid, span, sharedUDBlk.height * sharedUDBlk.width, *(sharedUDBlk.addr + id) = *(globalUDBlk.addr + id));
        __syncthreads();
        Block globalVassalBlk = graph(rowId, colId);
#pragma unroll 4
        for (int iter = 0; iter < 4 && (id = tid + iter * span) < globalVassalBlk.height * globalVassalBlk.width; iter++)
        {
            // can't use the quad macro here, because the body contains a #pragma
            const int i = id / globalVassalBlk.width, j = id % globalVassalBlk.width;
            int res = *(globalVassalBlk.addr + id);
#pragma unroll 64
            for (int k = 0; k < sharedLRBlk.width; k++)
                res = min(res, sharedLRBlk(i, k) + sharedUDBlk(k, j));
            *(globalVassalBlk.addr + id) = res;
        }
        __syncthreads(); // prevent the next iteration's assignment to shared memory from coming too soon
    }
}
#ifdef DUAL_GPU
inline void phase3TaskCopy(int srcDev, int dstDev, Graph graphs[], cudaStream_t stream, int blkId, int beginTask, int endTask)
{
    int n = graphs[0].n, blkFactor = graphs[0].blkFactor;
    int beginId = beginTask + (beginTask >= blkId), endId = endTask + (endTask - 1 >= blkId);
    assert(endId > beginId);
    if (blkId >= beginId && blkId < endId) // skip the pivot block-row: don't interfere with the other stream on the same device
    {
        CHKERR(cudaMemcpyPeerAsync(
            graphs[dstDev](beginId, 0).addr,
            dstDev,
            graphs[srcDev](beginId, 0).addr,
            srcDev,
            (blkId - beginId) * blkFactor * n * sizeof(int),
            stream
        ));
        CHKERR(cudaMemcpyPeerAsync(
            graphs[dstDev](blkId + 1, 0).addr,
            dstDev,
            graphs[srcDev](blkId + 1, 0).addr,
            srcDev,
            (min(n, endId * blkFactor) - (blkId + 1) * blkFactor) * n * sizeof(int),
            stream
        ));
    } else
        CHKERR(cudaMemcpyPeerAsync(
            graphs[dstDev](beginId, 0).addr,
            dstDev,
            graphs[srcDev](beginId, 0).addr,
            srcDev,
            (min(n, endId * blkFactor) - beginId * blkFactor) * n * sizeof(int),
            stream
        ));
}

template <void(*kernel)(Graph,int,int), void(*taskCopy)(int,int,Graph*,cudaStream_t,int,int,int)>
void dispatch(int taskNum, int threadsNum, int sharedSize, Graph graphs[], cudaStream_t streams[], int blkId)
{
    const int DYNAMIC_CHUNK_SIZE = 14 * 1; // 14 = number of SMs on the target GPU
#pragma omp parallel for num_threads(2) schedule(dynamic, 1)
    for (int st = 0; st < taskNum; st += DYNAMIC_CHUNK_SIZE)
    {
        int tid = omp_get_thread_num();
        assert(tid == 0 || tid == 1);
        CHKERR(cudaSetDevice(tid));
        int en = min(taskNum, st + DYNAMIC_CHUNK_SIZE);
        kernel<<<en - st, threadsNum, sharedSize, streams[tid]>>>(graphs[tid], blkId, st);
        CHKERR(cudaStreamSynchronize(streams[tid]));
        taskCopy(tid, !tid, graphs, streams[tid + 2], blkId, st, en);
    }
    for (int i = 0; i < GPU_NUM; i++)
    {
        CHKERR(cudaSetDevice(i));
        CHKERR(cudaDeviceSynchronize());
    }
}
#endif // DUAL_GPU
#define div4(x) (((x) >> 2) + bool((x) & 3)) // ceil(x / 4): each thread handles up to 4 elements

void stage(Graph graphs[], cudaStream_t streams[], int blkId)
{
    int pivotCnt = graphs[0](blkId, blkId).count(), maxCnt = graphs[0].blkFactor * graphs[0].blkFactor;
    int pivotSize = pivotCnt * sizeof(int), maxSize = maxCnt * sizeof(int);
    for (int i = 0; i < GPU_NUM; i++)
    {
        CHKERR(cudaSetDevice(i));
        phase1Kernel<<<1, div4(pivotCnt), pivotSize, streams[i]>>>(graphs[i], blkId);
        if (graphs[0].blkNum > 1)
        {
            phase2Kernel<<<dim3(2, graphs[0].blkNum - 1), div4(maxCnt), pivotSize + maxSize, streams[i]>>>(graphs[i], blkId, 0);
#ifndef DUAL_GPU
            phase3Kernel<<<graphs[0].blkNum - 1, div4(maxCnt), 2 * maxSize, streams[i]>>>(graphs[i], blkId, 0);
#endif
        }
    }
#ifdef DUAL_GPU
    if (graphs[0].blkNum > 1)
        dispatch<phase3Kernel, phase3TaskCopy>(graphs[0].blkNum - 1, div4(maxCnt), 2 * maxSize, graphs, streams, blkId);
#endif
}
int main(const int argc, const char *const *argv)
{
    if (argc != 4)
    {
        puts("Usage: ./<executable name> <input file> <output file> <blocking factor>");
        return 1;
    }
    const char *inName = argv[1], *outName = argv[2];
    const int blkFactor = atoi(argv[3]);

    static char inBuf[1 << 20];
    FILE *inFile = fopen(inName, "r");
    setvbuf(inFile, inBuf, _IOFBF, 1 << 20);
    int n, m;
    fastRead(inFile, n), fastRead(inFile, m);

#ifdef DUAL_GPU
    {
        cudaError_t err;
        CHKERR(cudaSetDevice(0));
        err = cudaDeviceEnablePeerAccess(1, 0), assert(err == cudaSuccess || err == cudaErrorPeerAccessAlreadyEnabled);
        CHKERR(cudaSetDevice(1));
        err = cudaDeviceEnablePeerAccess(0, 0), assert(err == cudaSuccess || err == cudaErrorPeerAccessAlreadyEnabled);
        CHKERR(cudaSetDevice(0));
    }
#endif

    int *hostAddr;
    CHKERR(cudaMallocHost((void**)&hostAddr, n * n * sizeof(int)));
    Graph hostGraph(hostAddr, n, blkFactor);
    int *deviceAddrs[GPU_NUM];
    Graph deviceGraphs[GPU_NUM];
    for (int i = 0; i < GPU_NUM; i++)
    {
        CHKERR(cudaSetDevice(i));
        CHKERR(cudaMalloc((void**)(deviceAddrs + i), n * n * sizeof(int)));
        deviceGraphs[i] = Graph(deviceAddrs[i], n, blkFactor);
    }
    cudaStream_t streams[STREAM_NUM]; // the default stream doesn't work correctly with cudaMemcpyPeerAsync
    for (int i = 0; i < STREAM_NUM; i++)
    {
        CHKERR(cudaSetDevice(i & 1));
        CHKERR(cudaStreamCreate(streams + i));
    }
    CHKERR(cudaSetDevice(0));

    memset(hostAddr, 0x01, n * n * sizeof(int)); // 0x01010101 serves as "infinity"
    for (int i = 0; i < m; i++)
    {
        int x, y, w;
        fastRead(inFile, x), fastRead(inFile, y), fastRead(inFile, w);
        hostGraph.edge(x - 1, y - 1) = w;
    }
    fclose(inFile);

    for (int i = 0; i < GPU_NUM; i++)
    {
        CHKERR(cudaSetDevice(i));
        hostGraph.copyToDevice(deviceGraphs[i], streams[i]);
    }
    for (int i = 0; i < deviceGraphs[0].blkNum; i++)
        stage(deviceGraphs, streams, i);
    CHKERR(cudaSetDevice(0));
    deviceGraphs[0].copyToHost(hostGraph);

    FILE *outFile = fopen(outName, "w");
    for (int i = 0; i < n; i++)
    {
        for (int j = 0; j < n; j++)
        {
            int res = i == j ? 0 : hostGraph.edge(i, j);
            if (res == 0x01010101)
                fprintf(outFile, "%s ", "INF");
            else
                fprintf(outFile, "%d ", res);
        }
        fputc('\n', outFile);
    }
    fclose(outFile);

    CHKERR(cudaFreeHost(hostAddr));
    for (int i = 0; i < GPU_NUM; i++)
    {
        CHKERR(cudaSetDevice(i));
        CHKERR(cudaFree(deviceAddrs[i]));
    }
    for (int i = 0; i < STREAM_NUM; i++)
    {
        CHKERR(cudaSetDevice(i & 1));
        CHKERR(cudaStreamDestroy(streams[i]));
    }
    CHKERR(cudaSetDevice(0));
    return 0;
}
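Build note (assumptions: a CUDA toolkit with nvcc, and the file saved under an arbitrary name such as fw.cu, which is illustrative): the single-GPU version should compile with plain nvcc; the dual-GPU path additionally needs -DDUAL_GPU and OpenMP support passed to the host compiler (e.g. -Xcompiler -fopenmp), since it uses omp.h and an OpenMP parallel for to drive the two devices. The program is then run as its usage message states: ./<executable name> <input file> <output file> <blocking factor>.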