Skip to content

Instantly share code, notes, and snippets.

@PirosB3
Created October 30, 2013 13:44
Show Gist options
  • Save PirosB3/7232924 to your computer and use it in GitHub Desktop.
Save PirosB3/7232924 to your computer and use it in GitHub Desktop.
Calculate weight matrix in CUDA
/*
 * Scales every weight by the input value of its row:
 *
 *     output_matrix[r * n_outputs + c] = inputs[r] * weight_matrix[r * n_outputs + c]
 *
 * for 0 <= r < n_inputs and 0 <= c < n_outputs. No summation over inputs is
 * performed here; each thread writes exactly one product.
 *
 * Launch layout: 2D, with x covering the output (column) index and y covering
 * the input (row) index. Any grid/block shape whose total extent covers
 * n_outputs x n_inputs is valid; surplus threads exit without touching memory.
 *
 * inputs        - n_inputs values (device pointer).
 * weight_matrix - n_inputs * n_outputs values, indexed row-major as above (device pointer).
 * n_inputs      - number of rows.
 * n_outputs     - number of columns.
 * output_matrix - n_inputs * n_outputs values, same indexing as weight_matrix (device pointer).
 */
__global__ void calculateOutputForLayer(float *inputs, float *weight_matrix, int n_inputs, int n_outputs, float *output_matrix) {
    // Global coordinates; with a single block these reduce to threadIdx.x / threadIdx.y.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard the tail: a launch larger than the matrix must not read or write out of bounds.
    if (row >= n_inputs || col >= n_outputs) {
        return;
    }

    const int idx = n_outputs * row + col;
    output_matrix[idx] = inputs[row] * weight_matrix[idx];
}
#include <cstdio>

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Demo driver: uploads a 4-element input vector and a 4x2 weight matrix,
 * launches calculateOutputForLayer with one thread per matrix element, and
 * prints the 8 resulting products, one per line.
 *
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main() {
    // Build inputs and weights
    const int n_inputs = 4;
    const int n_outputs = 2;
    const int n_elements = n_inputs * n_outputs;
    const float inputs[n_inputs] = {1, 2, 3, 4};
    const float weights[n_elements] = {2, 4, 2, 2, 2, 7, 2, 2};

    // Byte counts are taken from the arrays themselves so they cannot drift
    // out of sync with n_inputs / n_outputs.
    const size_t input_bytes = sizeof(inputs);
    const size_t matrix_bytes = sizeof(weights);

    // Host-side result buffer; small and fixed-size, so no heap allocation is needed.
    float output[n_elements];

    // Device buffers start as NULL so the cleanup below is safe on every path
    // (cudaFree on a null pointer is a no-op).
    float *d_output = NULL;
    float *d_inputs = NULL;
    float *d_weights = NULL;

    // Allocate device memory and copy the layer data over; stop at the first failure.
    bool ok = cudaOk(cudaMalloc((void**)&d_output, matrix_bytes), "cudaMalloc(d_output)")
           && cudaOk(cudaMalloc((void**)&d_inputs, input_bytes), "cudaMalloc(d_inputs)")
           && cudaOk(cudaMemcpy(d_inputs, inputs, input_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_inputs)")
           && cudaOk(cudaMalloc((void**)&d_weights, matrix_bytes), "cudaMalloc(d_weights)")
           && cudaOk(cudaMemcpy(d_weights, weights, matrix_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d_weights)");

    if (ok) {
        // Run compute kernel: x indexes outputs (columns), y indexes inputs (rows).
        dim3 block(n_outputs, n_inputs);
        calculateOutputForLayer<<<1, block>>>(d_inputs, d_weights, n_inputs, n_outputs, d_output);
        // A kernel launch returns nothing; launch-configuration errors surface here.
        ok = cudaOk(cudaGetLastError(), "calculateOutputForLayer launch");
    }

    if (ok) {
        // Copy back output. This blocking copy also waits for the kernel, so
        // any error raised during kernel execution is reported by this call.
        ok = cudaOk(cudaMemcpy(output, d_output, matrix_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(output)");
    }

    if (ok) {
        for (int i = 0; i < n_elements; i++) {
            printf("%f\n", output[i]);
        }
    }

    // Release device resources on both the success and the failure path.
    cudaFree(d_weights);
    cudaFree(d_inputs);
    cudaFree(d_output);
    cudaDeviceReset();
    return ok ? 0 : 1;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment