Instantly share code, notes, and snippets.
Created
January 9, 2013 02:31
-
Star
0
(0)
You must be signed in to star a gist -
Fork
0
(0)
You must be signed in to fork a gist
-
-
Save vrootic/4490054 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <time.h> | |
| #define N 1024 | |
| #define TILE_WIDTH 8 | |
/*
 * Tiled matrix multiplication kernel: ans = front * end.
 *
 * All three matrices are n x n, stored row-major in global memory.
 *
 * Launch requirements:
 *   - block: TILE_WIDTH x TILE_WIDTH threads (the shared tiles are indexed
 *            directly with threadIdx, so any other block shape is invalid);
 *   - grid:  at least ceil(n / TILE_WIDTH) blocks in both x and y.
 *   - shared memory: two static TILE_WIDTH x TILE_WIDTH float tiles.
 *
 * n does not have to be a multiple of TILE_WIDTH: out-of-range tile entries
 * are loaded as zero and out-of-range threads do not store a result.
 *
 * front: left operand  (device pointer, n*n floats)
 * end:   right operand (device pointer, n*n floats)
 * ans:   output        (device pointer, n*n floats)
 * n:     matrix dimension
 */
__global__
void vecMutKernel (float* front, float* end, float* ans, int n)
{
    __shared__ float ds_M[TILE_WIDTH][TILE_WIDTH];
    __shared__ float ds_N[TILE_WIDTH][TILE_WIDTH];

    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Output element owned by this thread.
    int row = by * TILE_WIDTH + ty;
    int col = bx * TILE_WIDTH + tx;

    float pvalue = 0.0f;

    // Round up so a trailing partial tile is still processed.
    int numTiles = (n + TILE_WIDTH - 1) / TILE_WIDTH;

    for (int k = 0; k < numTiles; ++k) {
        int mCol = k * TILE_WIDTH + tx;   // column of `front` loaded by this thread
        int nRow = k * TILE_WIDTH + ty;   // row of `end` loaded by this thread

        // Guarded loads: entries outside the matrix contribute 0 to the sum.
        ds_M[ty][tx] = (row < n && mCol < n) ? front[(size_t)row * n + mCol] : 0.0f;
        ds_N[ty][tx] = (nRow < n && col < n) ? end[(size_t)nRow * n + col] : 0.0f;

        // Every thread must finish loading before anyone reads the tiles.
        __syncthreads();

        // Partial dot product over this tile; the inner index i walks the
        // tile's shared dimension for both operands.
        for (int i = 0; i < TILE_WIDTH; ++i)
            pvalue += ds_M[ty][i] * ds_N[i][tx];

        // Do not overwrite the tiles while other threads are still reading.
        __syncthreads();
    }

    // The barriers above are reached by all threads; only the store is guarded.
    if (row < n && col < n)
        ans[(size_t)row * n + col] = pvalue;
}
/*
 * Reports a failed CUDA runtime call on stderr.
 * Returns true when `status` is cudaSuccess, false otherwise.
 */
static bool vecMutCheck (cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Host wrapper: multiplies two n x n row-major matrices on the GPU.
 *
 * front, end: host input matrices (n*n floats each)
 * ans:        host output matrix  (n*n floats); only written on success
 * n:          matrix dimension; n <= 0 is a no-op
 *
 * Copies the inputs to the device, launches vecMutKernel on a
 * ceil(n / TILE_WIDTH)^2 grid of TILE_WIDTH x TILE_WIDTH blocks, prints the
 * kernel wall time as measured with clock(), and copies the result back.
 * Any CUDA failure is reported on stderr and the remaining steps are
 * skipped; device buffers are always released.
 */
void vecMut (float* front, float* end, float* ans, int n){
    if (n <= 0)
        return;

    // size_t: n * n * sizeof(float) overflows int for n >= 23171.
    const size_t size = (size_t)n * n * sizeof(float);
    float *front_d = NULL, *end_d = NULL, *ans_d = NULL;

    bool ok = vecMutCheck(cudaMalloc((void **) &front_d, size), "cudaMalloc(front)")
           && vecMutCheck(cudaMalloc((void **) &end_d, size), "cudaMalloc(end)")
           && vecMutCheck(cudaMalloc((void **) &ans_d, size), "cudaMalloc(ans)")
           && vecMutCheck(cudaMemcpy(front_d, front, size, cudaMemcpyHostToDevice),
                          "cudaMemcpy(front, host to device)")
           && vecMutCheck(cudaMemcpy(end_d, end, size, cudaMemcpyHostToDevice),
                          "cudaMemcpy(end, host to device)");

    if (ok) {
        // Size the launch from the parameter n, not a global constant, so the
        // grid always matches the buffers allocated above.
        dim3 DimGrid((n-1)/TILE_WIDTH+1, (n-1)/TILE_WIDTH+1, 1);
        dim3 DimBlock(TILE_WIDTH, TILE_WIDTH, 1);

        clock_t start_t = clock();
        vecMutKernel<<<DimGrid, DimBlock>>>(front_d, end_d, ans_d, n);
        // Launch-configuration errors surface via cudaGetLastError();
        // in-kernel faults surface at the synchronizing call.
        ok = vecMutCheck(cudaGetLastError(), "kernel launch")
          && vecMutCheck(cudaDeviceSynchronize(), "kernel execution");
        clock_t end_t = clock();

        if (ok) {
            double diff = (double)(end_t - start_t);
            printf("CUDA use %.3f s \n", diff/CLOCKS_PER_SEC);
            ok = vecMutCheck(cudaMemcpy(ans, ans_d, size, cudaMemcpyDeviceToHost),
                             "cudaMemcpy(ans, device to host)");
        }
    }

    // cudaFree(NULL) is a no-op, so this is safe on every path.
    cudaFree(front_d);
    cudaFree(end_d);
    cudaFree(ans_d);
}
/*
 * Fills the n x n row-major matrix A with pseudo-random values.
 *
 * Each element is rand() divided by RAND_MAX in float arithmetic, drawn in
 * row-major order. The sequence depends on the current rand() seed.
 *
 * A: destination buffer holding at least n*n floats
 * n: matrix dimension; nothing is written when n <= 0
 */
void vecRandomize(float* A, int n)
{
    for (int r = 0; r < n; ++r) {
        float* rowPtr = A + r * n;
        int c = 0;
        while (c < n) {
            rowPtr[c] = (float) rand() / RAND_MAX;
            ++c;
        }
    }
}
/*
 * CPU reference matrix multiplication: ans = front * end.
 *
 * All matrices are n x n, row-major. Each dot product is accumulated in
 * double precision and narrowed to float when stored.
 *
 * front: left operand  (n*n floats)
 * end:   right operand (n*n floats)
 * ans:   output        (n*n floats)
 * n:     matrix dimension; nothing is written when n <= 0
 */
void vecMutLoop (float* front, float* end, float* ans, int n){
    for (int r = 0; r < n; ++r) {
        const float* lhsRow = front + r * n;
        float* outRow = ans + r * n;
        for (int c = 0; c < n; ++c) {
            double acc = 0;
            int t = 0;
            while (t < n) {
                // Widen both factors before multiplying, as a double product.
                double lhs = lhsRow[t];
                double rhs = end[t * n + c];
                acc += lhs * rhs;
                ++t;
            }
            outRow[c] = acc;
        }
    }
}
/*
 * Driver: builds two random N x N matrices and multiplies them on the GPU
 * through vecMut(), which prints the kernel time itself.
 *
 * Compile with -DVERIFY_WITH_CPU to additionally run the CPU reference
 * (vecMutLoop), print its time, and report every element where the GPU
 * result differs from the reference by more than a small tolerance.
 *
 * Returns 0 on completion, 1 if the host buffers cannot be allocated.
 */
int main (void)
{
    float* A = (float*) malloc(N * N * sizeof(float));
    float* B = (float*) malloc(N * N * sizeof(float));
    float* C = (float*) malloc(N * N * sizeof(float));
    float* Cloop = (float*) malloc(N * N * sizeof(float));

    // free(NULL) is a no-op, so a partial allocation can be released as-is.
    if (A == NULL || B == NULL || C == NULL || Cloop == NULL) {
        fprintf(stderr, "Failed to allocate %d x %d host matrices\n", N, N);
        free(A);
        free(B);
        free(C);
        free(Cloop);
        return 1;
    }

    int size=N;

    // vecRandomize writes every element, so no separate zero-fill is needed.
    vecRandomize(A, N);
    vecRandomize(B, N);
    printf("After Randomize...\n");

    /* Debug dump of the inputs:
    for(int i=0; i<size*size; ++i){
        printf("A[%d] = %f\n", i, A[i]);
    }
    for(int i=0; i<size*size; ++i){
        printf("B[%d] = %f\n", i, B[i]);
    }*/

    printf("Processing GPU computing...\n");
    vecMut(A, B, C, size);
    printf("finish.\n");

#ifdef VERIFY_WITH_CPU
    printf("Processing CPU computing...\n");
    clock_t c_start = clock();
    vecMutLoop(A, B, Cloop, size);
    clock_t c_end = clock();
    printf("finish.\n");

    // verify the results
    // Host and device accumulate in different precision and order, so
    // compare with a tolerance rather than for bit-exact equality.
    int mismatches = 0;
    for (int i=0; i<N*N; i++) {
        float err = Cloop[i] - C[i];
        if (err < 0.0f) err = -err;
        float ref = Cloop[i] < 0.0f ? -Cloop[i] : Cloop[i];
        if (err > 1e-3f + 1e-4f * ref) {
            printf("%d CPU Result: Cloop[%d]=%f, CUDA Result: C[%d]=%f\n", i, i ,Cloop[i], i, C[i]);
            ++mismatches;
        }
    }
    double diff_c = (double)(c_end - c_start);
    printf("CPU : %.3f s, mismatches : %d\n", diff_c/CLOCKS_PER_SEC, mismatches);
#endif

    free(A);
    free(B);
    free(C);
    free(Cloop);
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment