Skip to content

Instantly share code, notes, and snippets.

@moretea
Created December 9, 2013 20:19
Show Gist options
  • Select an option

  • Save moretea/7880085 to your computer and use it in GitHub Desktop.

Select an option

Save moretea/7880085 to your computer and use it in GitHub Desktop.
#ifdef MATMUL_ACC
/*
 * Runs `times` multiplication passes over len x len float matrices on an
 * OpenACC device, using explicitly managed device memory.
 *
 * Each pass computes, for every (y, x):
 *     c[y][x] = sum over j of a[x][j] * b[j][y]
 * After every pass except the last, the device buffers behind `a` and `c`
 * are exchanged, so the output of one pass is the left operand of the next.
 * `b` is the same in every pass.
 *
 * NOTE(review): with this index order each pass stores the transpose of the
 * conventional product a*b. It is kept exactly as originally written; confirm
 * that this is what callers expect.
 *
 * Parameters:
 *   a_ptr  host pointer to len*len floats, read only (initial left operand)
 *   b_ptr  host pointer to len*len floats, read only (right operand)
 *   c_ptr  host pointer to len*len floats, receives the result of the last pass
 *   len    matrix dimension
 *   times  number of passes
 *
 * If len <= 0 or times <= 0, or if a device allocation fails, the function
 * returns without touching c_ptr.
 */
void matmul(float *a_ptr, float *b_ptr, float *c_ptr, int len, int times) {
    /* No pass would run, so there is no result to copy out; a non-positive
     * len would also make the array types used below invalid. */
    if (len <= 0 || times <= 0) {
        return;
    }

    size_t bytes = sizeof(float) * len * len;
    float *a_dev = acc_malloc(bytes);
    float *b_dev = acc_malloc(bytes);
    float *c_dev = acc_malloc(bytes);

    if (!a_dev || !b_dev || !c_dev) {
        /* Release whatever was obtained before giving up. */
        if (a_dev) { acc_free(a_dev); }
        if (b_dev) { acc_free(b_dev); }
        if (c_dev) { acc_free(c_dev); }
        return;
    }

    acc_memcpy_to_device(a_dev, a_ptr, bytes);
    acc_memcpy_to_device(b_dev, b_ptr, bytes);

    /* k counts the passes still to run, including the current one. */
    for (int k = times; k > 0; k--) {
        #pragma acc parallel deviceptr(a_dev, b_dev, c_dev)
        {
            /* View the flat device buffers as len x len matrices. */
            float (*a)[len][len] = (float (*)[len][len]) a_dev;
            float (*b)[len][len] = (float (*)[len][len]) b_dev;
            float (*c)[len][len] = (float (*)[len][len]) c_dev;

            #pragma acc loop gang
            for (int y = 0; y < len; y++) {
                for (int x = 0; x < len; x++) {
                    float result = 0;
                    #pragma acc loop worker reduction(+:result)
                    for (int j = 0; j < len; j++) {
                        result += (*a)[x][j] * (*b)[j][y];
                    }
                    (*c)[y][x] = result;
                }
            }
        } /* end of parallel region */

        /* Feed this pass's output into the next pass. The swap is skipped
         * after the final pass so that c_dev always ends up pointing at the
         * newest result. */
        if (k > 1) {
            float *tmp = a_dev;
            a_dev = c_dev;
            c_dev = tmp;
        }
    }

    /* c_dev holds the final result for every value of times, so it can be
     * copied out directly; no extra on-device exchange of a and c is needed
     * (doing one would hand back the previous operand instead). */
    acc_memcpy_from_device(c_ptr, c_dev, bytes);

    acc_free(a_dev);
    acc_free(b_dev);
    acc_free(c_dev);
}
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment