Skip to content

Instantly share code, notes, and snippets.

@vrootic
Created January 9, 2013 02:31
Show Gist options
  • Select an option

  • Save vrootic/4490054 to your computer and use it in GitHub Desktop.

Select an option

Save vrootic/4490054 to your computer and use it in GitHub Desktop.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define N 1024
#define TILE_WIDTH 8
/*
 * Tiled matrix multiplication kernel: ans = front * end.
 *
 * All three matrices are n x n, stored row-major in device memory.
 *
 * Launch requirements:
 *   - blockDim must be (TILE_WIDTH, TILE_WIDTH, 1); the shared tiles are
 *     indexed directly with threadIdx.
 *   - gridDim.x and gridDim.y must each be at least ceil(n / TILE_WIDTH).
 *   - Uses 2 * TILE_WIDTH * TILE_WIDTH floats of static shared memory.
 *
 * n does not have to be a multiple of TILE_WIDTH: out-of-range tile cells
 * are filled with zero and out-of-range threads do not write a result.
 */
__global__
void vecMutKernel (float* front, float* end, float* ans, int n)
{
    __shared__ float ds_M[TILE_WIDTH][TILE_WIDTH];
    __shared__ float ds_N[TILE_WIDTH][TILE_WIDTH];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int row = blockIdx.y * TILE_WIDTH + ty;
    const int col = blockIdx.x * TILE_WIDTH + tx;

    float pvalue = 0.0f;

    /* Ceil-divide so a partial last tile is still processed. */
    const int numTiles = (n + TILE_WIDTH - 1) / TILE_WIDTH;

    for (int t = 0; t < numTiles; ++t) {
        const int mCol = t * TILE_WIDTH + tx;   /* column of front loaded by this thread */
        const int nRow = t * TILE_WIDTH + ty;   /* row of end loaded by this thread */

        /* Pad with zeros outside the matrix so the inner product is unaffected. */
        ds_M[ty][tx] = (row < n && mCol < n) ? front[(size_t) row * n + mCol] : 0.0f;
        ds_N[ty][tx] = (nRow < n && col < n) ? end[(size_t) nRow * n + col] : 0.0f;

        /* Every thread must finish loading before any thread reads the tiles. */
        __syncthreads();

        /* The inner index i walks the shared dimension of the two tiles. */
        for (int i = 0; i < TILE_WIDTH; ++i)
            pvalue += ds_M[ty][i] * ds_N[i][tx];

        /* Do not overwrite the tiles while other threads are still reading. */
        __syncthreads();
    }

    /* The guard sits after the loop so all threads reach every barrier. */
    if (row < n && col < n)
        ans[(size_t) row * n + col] = pvalue;
}
/* Print a readable message and terminate if a CUDA runtime call failed. */
static void vecMutCheck (cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Multiply two n x n row-major host matrices on the GPU: ans = front * end.
 *
 * front, end : host input matrices, n * n floats each.
 * ans        : host output buffer, n * n floats.
 * n          : matrix dimension; nothing is done when n <= 0.
 *
 * Copies the inputs to the device, launches vecMutKernel, prints the kernel
 * time measured with CUDA events, and copies the result back. Any CUDA
 * failure is reported on stderr and terminates the process.
 */
void vecMut (float* front, float* end, float* ans, int n){
    if (n <= 0)
        return;

    /* size_t avoids int overflow of n * n * sizeof(float) for large n. */
    const size_t size = (size_t) n * n * sizeof(float);
    float *front_d = NULL, *end_d = NULL, *ans_d = NULL;

    vecMutCheck(cudaMalloc((void **) &front_d, size), "cudaMalloc(front)");
    vecMutCheck(cudaMalloc((void **) &end_d, size), "cudaMalloc(end)");
    vecMutCheck(cudaMalloc((void **) &ans_d, size), "cudaMalloc(ans)");

    vecMutCheck(cudaMemcpy(front_d, front, size, cudaMemcpyHostToDevice), "copy front to device");
    vecMutCheck(cudaMemcpy(end_d, end, size, cudaMemcpyHostToDevice), "copy end to device");

    /* Size the grid from the runtime dimension n, not the compile-time N. */
    dim3 DimGrid((n - 1) / TILE_WIDTH + 1, (n - 1) / TILE_WIDTH + 1, 1);
    dim3 DimBlock(TILE_WIDTH, TILE_WIDTH, 1);

    /* CUDA events time the kernel on the device; clock() only counts host CPU time. */
    cudaEvent_t startEvt, stopEvt;
    vecMutCheck(cudaEventCreate(&startEvt), "cudaEventCreate(start)");
    vecMutCheck(cudaEventCreate(&stopEvt), "cudaEventCreate(stop)");

    vecMutCheck(cudaEventRecord(startEvt, 0), "cudaEventRecord(start)");
    vecMutKernel<<<DimGrid, DimBlock>>>(front_d, end_d, ans_d, n);
    /* Launch-configuration errors are only visible through cudaGetLastError. */
    vecMutCheck(cudaGetLastError(), "kernel launch");
    vecMutCheck(cudaEventRecord(stopEvt, 0), "cudaEventRecord(stop)");
    /* Faults raised while the kernel runs surface at this synchronization. */
    vecMutCheck(cudaEventSynchronize(stopEvt), "kernel execution");

    float elapsedMs = 0.0f;
    vecMutCheck(cudaEventElapsedTime(&elapsedMs, startEvt, stopEvt), "cudaEventElapsedTime");
    printf("CUDA use %.3f s \n", elapsedMs / 1000.0);

    vecMutCheck(cudaMemcpy(ans, ans_d, size, cudaMemcpyDeviceToHost), "copy result to host");

    cudaEventDestroy(startEvt);
    cudaEventDestroy(stopEvt);
    cudaFree(front_d);
    cudaFree(end_d);
    cudaFree(ans_d);
}
/*
 * Fill an n x n row-major matrix with pseudo-random values.
 *
 * Each element is set to rand() divided by RAND_MAX, so values lie in
 * [0, 1]. Elements are written in row-major order, one rand() call each.
 * The generator is not seeded here.
 */
void vecRandomize(float* A, int n)
{
    int row = 0;
    while (row < n) {
        float* rowStart = A + row * n;
        int col = 0;
        while (col < n) {
            rowStart[col] = (float) rand() / RAND_MAX;
            ++col;
        }
        ++row;
    }
}
/*
 * CPU matrix multiplication: ans = front * end.
 *
 * All matrices are n x n, row-major. Each dot product is accumulated in
 * double precision and narrowed to float when stored.
 */
void vecMutLoop (float* front, float* end, float* ans, int n){
    for (int r = 0; r < n; ++r) {
        const float* lhsRow = front + r * n;
        float* outRow = ans + r * n;
        for (int c = 0; c < n; ++c) {
            double acc = 0;
            int k = 0;
            while (k < n) {
                const double lhs = lhsRow[k];
                const double rhs = end[k * n + c];
                acc += lhs * rhs;
                ++k;
            }
            outRow[c] = acc;
        }
    }
}
/*
 * Demo driver: builds two random N x N matrices and multiplies them on the
 * GPU through vecMut, which prints the measured kernel time.
 *
 * Returns 0 on success and EXIT_FAILURE if a host buffer cannot be allocated.
 */
int main (void)
{
    const int size = N;
    const size_t bytes = (size_t) N * N * sizeof(float);

    float* A = (float*) malloc(bytes);
    float* B = (float*) malloc(bytes);
    float* C = (float*) malloc(bytes);

    /* Stop before any buffer is dereferenced if an allocation failed. */
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        free(A);
        free(B);
        free(C);
        return EXIT_FAILURE;
    }

    /* vecRandomize writes every element, so no prior zero-fill is needed. */
    vecRandomize(A, size);
    vecRandomize(B, size);
    printf("After Randomize...\n");

    printf("Processing GPU computing...\n");
    vecMut(A, B, C, size);
    printf("finish.\n");

    /*
     * Optional check (disabled): allocate a fourth N x N buffer, fill it with
     * vecMutLoop(A, B, Cloop, size) and compare it against C element by
     * element using a tolerance, since host and device float results are not
     * expected to match bit for bit.
     */

    free(A);
    free(B);
    free(C);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment