Skip to content

Instantly share code, notes, and snippets.

@dmikushin
Last active January 27, 2025 12:38
Show Gist options
  • Save dmikushin/2c34ae5cc6c567110f5a776dd333592f to your computer and use it in GitHub Desktop.
Save dmikushin/2c34ae5cc6c567110f5a776dd333592f to your computer and use it in GitHub Desktop.
OpenMP target offload with fallback to OpenMP multithreading

This sample demonstrates how to conditionally disable the "target" portion of OpenMP directives, so that execution falls back to CPU OpenMP multithreading.

This approach follows the SC17 tutorial by Jeff Larkin.

> make
LIBRARY_PATH=/usr/lib/llvm-19/lib clang-19 -gline-tables-only -O3 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target=nvptx64-nvidia-cuda --offload-target=sm_86 -fuse-ld=lld offload-fallback.c -o offload-fallback
> make test_gpu
USE_GPU=1 LD_LIBRARY_PATH=/usr/lib/llvm-19/lib ./offload-fallback
result[0] = 0.000000
result[1] = 4.000000
result[2] = 8.000000
result[3] = 12.000000
result[4] = 16.000000
result[5] = 20.000000
result[6] = 24.000000
result[7] = 28.000000
result[8] = 32.000000
result[9] = 36.000000
> make test_cpu
USE_GPU=0 LD_LIBRARY_PATH=/usr/lib/llvm-19/lib ./offload-fallback
result[0] = 0.000000
result[1] = 4.000000
result[2] = 8.000000
result[3] = 12.000000
result[4] = 16.000000
result[5] = 20.000000
result[6] = 24.000000
result[7] = 28.000000
result[8] = 32.000000
result[9] = 36.000000

The latest Clang 19 is used above. The same approach works with Clang 17; however, there is a bug concerning OpenMP loops inside `if constexpr (...)` branches.

# Clang/LLVM major version used both for the compiler binary name and for
# locating the versioned LLVM library directory.
LLVM_VERSION=19

# For LLVM 19 the compiler is invoked with LIBRARY_PATH pointing at the
# versioned LLVM lib directory; other versions use the plain compiler name.
ifeq ($(LLVM_VERSION),19)
CC=LIBRARY_PATH=/usr/lib/llvm-$(LLVM_VERSION)/lib clang-$(LLVM_VERSION)
else
CC=clang-$(LLVM_VERSION)
endif

# None of these targets produce a file of the same name, so they must always
# run even if such a file happens to exist.
.PHONY: all clean test_gpu test_cpu

all: offload-fallback

# Build the sample with OpenMP offload enabled for the NVPTX target.
offload-fallback: offload-fallback.c
	$(CC) -gline-tables-only -O3 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target=nvptx64-nvidia-cuda --offload-target=sm_86 -fuse-ld=lld $< -o $@

clean:
	rm -rf offload-fallback

# Run with USE_GPU=1 (target offload requested).
test_gpu: offload-fallback
	USE_GPU=1 LD_LIBRARY_PATH=/usr/lib/llvm-$(LLVM_VERSION)/lib ./$<

# Run with USE_GPU=0 (fallback to host execution).
test_cpu: offload-fallback
	USE_GPU=0 LD_LIBRARY_PATH=/usr/lib/llvm-$(LLVM_VERSION)/lib ./$<
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
/* Number of elements in each of the vectors allocated and processed by main(). */
#define N 1000
/* Non-zero requests target offload in saxpy() via its `if(target:...)` clause;
 * main() overwrites this from the USE_GPU environment variable. */
int use_gpu = 1;
/*
 * Compute result[i] = a * x[i] + y[i] for every i in [0, n).
 *
 * The loop carries an OpenMP `target teams distribute parallel for` directive
 * whose `if(target: use_gpu)` clause makes the offload conditional on the
 * global use_gpu flag; when the flag is zero the loop falls back to host
 * execution.
 *
 * a       scalar multiplier
 * x, y    input vectors of at least n elements (read only, mapped "to")
 * result  output vector of at least n elements (mapped "from")
 * n       number of elements to process; values <= 0 make the call a no-op
 */
void saxpy(float a, const float *x, const float *y, float *result, int n) {
    /* Nothing to compute, and a negative length would not be a valid array
     * section in the map clauses below. */
    if (n <= 0) {
        return;
    }

#pragma omp target teams distribute parallel for \
    map(to: x[0:n], y[0:n]) map(from: result[0:n]) if(target: use_gpu)
    for (int i = 0; i < n; i++) {
        result[i] = a * x[i] + y[i];
    }
}
int main() {
char* cuse_gpu = getenv("USE_GPU");
if (cuse_gpu && atoi(cuse_gpu))
use_gpu = 1;
else
use_gpu = 0;
float a = 2.0f;
float x[N], y[N], result[N];
// Initialize arrays
for (int i = 0; i < N; i++) {
x[i] = i * 1.0f;
y[i] = i * 2.0f;
}
// Perform SAXPY
saxpy(a, x, y, result, N);
// Print some results for verification
for (int i = 0; i < 10; i++) {
printf("result[%d] = %f\n", i, result[i]);
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment