Created
February 2, 2018 23:08
-
-
Save CraigglesO/bd1c6413e435fef891d8f35f51f61f4e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// A C++ program for Bellman-Ford's single source | |
// shortest path algorithm. | |
// nvcc -std=c++11 BellmanFord.cu -D_MWAITXINTRIN_H_INCLUDED -D__STRICT_ANSI__ | |
#include <stdio.h> | |
#include <climits> | |
#include <random> | |
#include <chrono> | |
#include <ctime> | |
using namespace std; | |
using namespace std::chrono; | |
// a structure to represent a weighted edge in graph | |
struct Edge { | |
int src, dest, weight; | |
}; | |
// A utility function used to print the solution | |
void printArr(int dist[], int n) { | |
printf("Vertex \t\t Distance from Source\n"); | |
for (int i = 0; i < n; ++i) | |
printf("%d \t\t %d\n", i, dist[i]); | |
} | |
void BellmanFord_CPU(int V, int E, struct Edge* edges, int *dist, int src) { | |
// Step 1: Initialize distances from src to all other vertices | |
// as INFINITE | |
for (int i = 0; i < V; i++) | |
dist[i] = INT_MAX; | |
dist[src] = 0; | |
// Step 2: Relax all edges |V| - 1 times. A simple shortest | |
// path from src to any other vertex can have at-most |V| - 1 | |
// edges | |
for (int i = 1; i <= V-1; i++) { | |
for (int j = 0; j < E; j++) { | |
int u = edges[j].src; | |
int v = edges[j].dest; | |
int weight = edges[j].weight; | |
if (dist[u] != INT_MAX && dist[u] + weight < dist[v]) | |
dist[v] = dist[u] + weight; | |
} | |
} | |
return; | |
} | |
// The main function that finds shortest distances from src to | |
// all other vertices using Bellman-Ford algorithm. The function | |
// also detects negative weight cycle | |
__global__ void BellmanFord(int V, int E, struct Edge* edges, int *dist, int src) { | |
int index = blockIdx.x * blockDim.x + threadIdx.x; | |
int stride = blockDim.x * gridDim.x; | |
// Step 1: Initialize distances from src to all other vertices | |
// as INFINITE. While a little wasteful, V will always be smaller then E | |
for (int i = index; i < V; i += stride) | |
dist[i] = INT_MAX; | |
dist[src] = 0; | |
// Step 2: Relax all edges |V| - 1 times. A simple shortest | |
// path from src to any other vertex can have at-most |V| - 1 | |
// edges | |
for (int i = 1; i <= V - 1; i++) { | |
for (int j = index; j < E; j += stride) { | |
int u = edges[j].src; | |
int v = edges[j].dest; | |
int weight = edges[j].weight; | |
// NOTE: The issue here is that two threads might try to write to the | |
// same memory allocation (at the same time). To avoid this we should use atomics. | |
if (dist[u] != INT_MAX && dist[u] + weight < dist[v]) | |
atomicMin(&dist[v], dist[u] + weight); | |
} | |
__syncthreads(); | |
} | |
return; | |
} | |
// Driver program to test above functions | |
int main() { | |
cudaEvent_t start, stop; | |
float time; | |
cudaEventCreate(&start); | |
cudaEventCreate(&stop); | |
/* Let us create the graph given in above example */ | |
int V = 10000; // Number of vertices in graph | |
int E = 35000; // Number of edges in graph | |
int *dist_CPU, *dist; | |
Edge *edges; | |
edges = (Edge*)malloc(E*sizeof(Edge)); | |
dist_CPU = (int*)malloc(V*sizeof(int)); | |
dist = (int*)malloc(V*sizeof(int)); | |
Edge *d_edges; | |
int *d_dist; | |
// allocate memory | |
cudaMalloc(&d_edges, E * sizeof(Edge)); | |
cudaMalloc(&d_dist, V * sizeof(int)); | |
// ensure at least 1 connection | |
edges[0].src = 0; | |
edges[0].dest = 2; | |
edges[0].weight = 1; | |
for (int j = 1; j < E; j++) { | |
edges[j].src = rand()%(V) + 0; | |
edges[j].dest = rand()%(V) + 0; | |
edges[j].weight = rand()%(50 - 1 + 1) + 1; | |
// printf("j: %d - src: %d - dest: %d - weight: %d\n", j, edges[j].src, edges[j].dest, edges[j].weight); | |
} | |
/** CPU VERSION START **/ | |
high_resolution_clock::time_point cpu_start = high_resolution_clock::now(); | |
BellmanFord_CPU(V, E, edges, dist_CPU, 0); | |
high_resolution_clock::time_point cpu_stop = high_resolution_clock::now(); | |
long cpu_duration = duration_cast<milliseconds>( cpu_stop - cpu_start ).count(); | |
printf("CPU time: %ld\n", cpu_duration); | |
/** CPU VERSION END **/ | |
// setup threads | |
int blockSize = 256; | |
int numBlocks = (E + blockSize - 1) / blockSize; | |
// copy over to device | |
cudaMemcpy(d_edges, edges, E*sizeof(Edge), cudaMemcpyHostToDevice); | |
cudaMemcpy(d_dist, dist, V*sizeof(int), cudaMemcpyHostToDevice); | |
cudaEventRecord(start, 0); | |
BellmanFord<<<numBlocks, blockSize>>>(V, E, d_edges, d_dist, 0); | |
cudaEventRecord(stop, 0); | |
cudaEventSynchronize(stop); | |
// get result back to host | |
cudaMemcpy(dist, d_dist, V*sizeof(int), cudaMemcpyDeviceToHost); | |
// COMPARE: | |
// printArr(dist, V); | |
// printArr(dist_CPU, V); | |
int x = 0; | |
for (int i = 0; i < V; i++) { | |
if (dist[i] != dist_CPU[i]) { | |
// printf("dist[%i]: %i ", i, dist[i]); | |
// printf("dist_CPU[%i]: %i\n", i, dist_CPU[i]); | |
x++; | |
} | |
} | |
printf("number of differences: %i\n", x); | |
// cleanup | |
cudaFree(d_edges); | |
cudaFree(d_dist); | |
free(dist_CPU); | |
free(dist); | |
free(edges); | |
cudaEventElapsedTime(&time, start, stop); | |
cudaEventDestroy(start); | |
cudaEventDestroy(stop); | |
printf("Time for the kernel: %f ms\n", time); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment