Create working directory:
mkdir -p ~/forge/openmp4
cd ~/forge/openmp4
Clone LLVM & Compiler Runtime & Clang sources:
git clone http://llvm.org/git/llvm.git
cd ~/forge/openmp4/llvm/tools
git clone http://llvm.org/git/clang.git
cd ~/forge/openmp4/llvm/projects
git clone http://llvm.org/git/compiler-rt.git
Build & install Clang:
cd ~/forge/openmp4/llvm
mkdir build
cd build/
cmake -DCMAKE_INSTALL_PREFIX=~/forge/openmp4/llvm/install -DCMAKE_BUILD_TYPE=Release ..
make -j48
make install
export PATH=~/forge/openmp4/llvm/install/bin/:$PATH
export LD_LIBRARY_PATH=~/forge/openmp4/llvm/install/lib:$LD_LIBRARY_PATH
Clone, build & install OpenMP runtime:
cd ~/forge/openmp4
git clone http://llvm.org/git/openmp.git
cd openmp/runtime/
mkdir build
cd build/
export PATH=~/forge/openmp4/llvm/build/bin/:$PATH
cmake -DCMAKE_INSTALL_PREFIX=~/forge/openmp4/llvm/install -DCMAKE_BUILD_TYPE=Release ..
make -j48
make install
Clone, build & install OpenMP target backends (the nvptx backend will support compute capabilities 3.0 and 3.5, i.e. sm_30 and sm_35):
cd ~/forge/openmp4
git clone https://github.com/clang-omp/libomptarget.git
cd libomptarget
mkdir build
cd build
cmake -DCMAKE_INSTALL_PREFIX=~/forge/openmp4/llvm/install -DOMPTARGET_NVPTX_SM=30,35 -DCMAKE_BUILD_TYPE=Release ..
make -j48
cp -rf lib/libomptarget* ~/forge/openmp4/llvm/install/lib/
Check out the sample program:
$ cd ~/forge/openmp4
$ cat example.c
#include <stdio.h>
#include <stdlib.h>

/*
 * OpenMP offloading sample: fills two arrays of n doubles with pseudo-random
 * values, computes y[i] += a * x[i] inside an OpenMP target region, and
 * prints the minimum, maximum and average of the resulting y.
 *
 * Usage: example <n>   (n must be a positive integer)
 *
 * Returns 0 on success or when only the usage text is printed, and 1 when
 * <n> is not positive or the host arrays cannot be allocated.
 */
int main(int argc, char* argv[])
{
    if (argc != 2)
    {
        printf("Usage: %s <n>\n", argv[0]);
        return 0;
    }

    int n = atoi(argv[1]);

    /* y[0] is read and the sum is divided by n below, so n must be >= 1. */
    if (n <= 0)
    {
        fprintf(stderr, "Error: <n> must be a positive integer, got \"%s\"\n", argv[1]);
        return 1;
    }

    /* malloc/free are declared by <stdlib.h>; the non-standard <malloc.h> is not needed. */
    double* x = (double*)malloc(sizeof(double) * (size_t)n);
    double* y = (double*)malloc(sizeof(double) * (size_t)n);
    if (x == NULL || y == NULL)
    {
        fprintf(stderr, "Error: cannot allocate two arrays of %d doubles\n", n);
        free(x); /* free(NULL) is a no-op, so both calls are safe */
        free(y);
        return 1;
    }

    /* Scale factor that maps rand() results into [0, 1]. */
    double idrandmax = 1.0 / RAND_MAX;

    /* The order of the rand() calls (a first, then x[i], y[i] pairs) is kept
       so that the generated data set stays the same from run to run. */
    double a = idrandmax * rand();
    for (int i = 0; i < n; i++)
    {
        x[i] = idrandmax * rand();
        y[i] = idrandmax * rand();
    }

    /* Map both arrays to the device for the duration of the enclosed block
       and copy them back to the host when it ends. */
    #pragma omp target data map(tofrom: x[0:n],y[0:n])
    {
        /* "parallel for" rather than a bare "for": a worksharing loop that is
           not enclosed in a parallel region is executed by a single thread. */
        #pragma omp target
        #pragma omp parallel for
        for (int i = 0; i < n; i++)
            y[i] += a * x[i];
    }

    /* Reduce the result on the host. */
    double avg = 0.0, min = y[0], max = y[0];
    for (int i = 0; i < n; i++)
    {
        avg += y[i];
        if (y[i] > max) max = y[i];
        if (y[i] < min) min = y[i];
    }
    printf("min = %f, max = %f, avg = %f\n", min, max, avg / n);

    free(x);
    free(y);
    return 0;
}
$ cat makefile
# Builds the OpenMP offloading sample with the clang found on PATH.
# "all" and "clean" are not files, so they are declared phony.
.PHONY: all clean

all: example

# LIBRARY_PATH adds the lib/ directory located next to the clang binary to the
# link-time library search path (the directory the libomptarget libraries were
# copied into during installation). Recipe lines must start with a TAB.
example: example.c
	LIBRARY_PATH=$(shell dirname $(shell which clang))/../lib clang -fopenmp -omptargets=nvptx64sm_30-nvidia-linux -g -O3 -std=c99 $< -o $@

# Removes the built binary.
clean:
	rm -rf example
$ make
LIBRARY_PATH=~/forge/openmp4/llvm/install/bin/../lib clang -fopenmp -omptargets=nvptx64sm_30-nvidia-linux -g -O3 -std=c99 example.c -o example
ptxas warning : Too big maxrregcount value specified 64, will be ignored
$ nvprof ./example 1024
==29138== NVPROF is profiling process 29138, command: ./example 1024
min = 0.025126, max = 1.773771, avg = 0.922563
==29138== Profiling application: ./example 1024
==29138== Profiling result:
Time(%) Time Calls Avg Min Max Name
99.05% 3.3133ms 1 3.3133ms 3.3133ms 3.3133ms __omptgt__0_27a163e_801_
0.54% 18.047us 5 3.6090us 2.5600us 5.3430us [CUDA memcpy DtoH]
0.41% 13.568us 7 1.9380us 1.0880us 3.6160us [CUDA memcpy HtoD]
Verify the results against a simple serial version (built with gcc without -fopenmp, so the OpenMP pragmas are ignored):
$ gcc -std=c99 example.c -o example_cpu
$ ./example_cpu 1024
min = 0.025126, max = 1.773771, avg = 0.922563