Skip to content

Instantly share code, notes, and snippets.

@jefflarkin
Created February 21, 2013 15:07
Show Gist options
  • Save jefflarkin/5005302 to your computer and use it in GitHub Desktop.
Save jefflarkin/5005302 to your computer and use it in GitHub Desktop.
!> Dense matrix-multiply benchmark offloaded with OpenACC.
!!
!! Fills A and B on the device with A(i,j) = B(i,j) = i + j - 1 (1-based
!! indices), computes C = A * B, and validates every element of C against a
!! closed-form expression.  One line is printed:
!!
!!   rounded checksum, N*N, multiply time [s], total time [s]
!!
!! Each validated element contributes C(i,j)/expected(i,j), i.e. ~1.0, to the
!! checksum, so a correct run prints the same value in the first two columns.
program mm
  use, intrinsic :: iso_fortran_env, only: int64, real64
  use omp_lib, only: omp_get_wtime
  implicit none

  integer, parameter :: dp = real64

  ! 64-bit integers are required: tmp6 below is on the order of N**3 / 3,
  ! which exceeds the 32-bit range for N = 4096.
  integer(int64), parameter :: N = 4096
  integer(int64) :: i, j, k, tmp6, tmp2
  real(dp), dimension(N,N) :: A, B, C
  real(dp) :: tmp, chk
  real(dp) :: t0, t1   ! start / end of the whole run
  real(dp) :: t2, t3   ! start / end of the multiply kernel

  t0 = omp_get_wtime()

  ! "create": the matrices are allocated on the device for the lifetime of
  ! the data region; they are initialised there, so no host copy is requested.
  !$acc data create(A,B,C)

  ! --- Initialisation --------------------------------------------------
  ! i is the innermost loop so that consecutive iterations touch
  ! consecutive memory (Fortran arrays are column-major).
  !$acc kernels
  do j = 1, N
    do i = 1, N
      A(i,j) = real(i + j - 1, dp)
      B(i,j) = real(i + j - 1, dp)
    end do
  end do
  C(:,:) = 0.0_dp
  !$acc end kernels

  ! --- Multiply: C = A * B ---------------------------------------------
  t2 = omp_get_wtime()
  !$acc kernels
  do k = 1, N
    do j = 1, N
      do i = 1, N
        C(i,k) = C(i,k) + A(i,j) * B(j,k)
      end do
    end do
  end do
  !$acc end kernels
  !$acc wait
  t3 = omp_get_wtime()

  ! --- Validation ------------------------------------------------------
  ! With 0-based i, j, m the expected element is
  !   sum_{m=0}^{N-1} (i+1+m) * (j+1+m)
  !     = sum(m**2) + ((i+1) + (j+1)) * sum(m) + (i+1) * (j+1) * N
  ! where sum(m**2) = N(2N-1)(N-1)/6 and sum(m) = N(N-1)/2.
  tmp6 = (N * ((2 * N) - 1) * (N - 1)) / 6
  tmp2 = (N * (N - 1)) / 2
  chk = 0.0_dp
  !$acc parallel loop reduction(+:chk)
  do j = 0, N-1
    do i = 0, N-1
      tmp = tmp6 &
          + (((i + 1.0_dp) + (j + 1.0_dp)) * tmp2) &
          + (((i + 1.0_dp) * (j + 1.0_dp)) * N)
      chk = chk + (C(i+1,j+1) / tmp)
    end do
  end do
  !$acc end parallel loop
  !$acc end data
  t1 = omp_get_wtime()

  ! Round rather than truncate: chk is a floating-point sum of N*N terms each
  ! ~1.0, so it may land an ulp below N*N; int() would then print N*N - 1 and
  ! make a correct run look like a failure.
  write(*,'(I8," ",I8," ",F10.6," ",F10.6)') nint(chk, kind=int64), (N*N), (t3-t2), (t1-t0)
end program mm
pgf90 -Minfo=all -fast -acc -ta=nvidia -I /opt/nvidia/cudatoolkit/default/include/ -o mmF90_acc mm1_acc.F90
mm:
9, Generating create(c(:,:))
Generating create(b(:,:))
Generating create(a(:,:))
10, Generating present_or_create(c(:,:))
Generating present_or_create(b(:,:))
Generating present_or_create(a(:,:))
Generating compute capability 1.3 binary
Generating compute capability 2.0 binary
11, Loop is parallelizable
12, Loop is parallelizable
Accelerator kernel generated
11, !$acc loop gang, vector(128) ! blockidx%x threadidx%x
12, !$acc loop gang ! blockidx%y
CC 1.3 : 13 registers; 40 shared, 8 constant, 0 local memory bytes
CC 2.0 : 19 registers; 0 shared, 56 constant, 0 local memory bytes
23, Generating present_or_create(c(:,:))
Generating present_or_create(a(:,:))
Generating present_or_create(b(:,:))
Generating compute capability 1.3 binary
Generating compute capability 2.0 binary
24, Loop is parallelizable
25, Loop carried dependence of 'c' prevents parallelization
Loop carried backward dependence of 'c' prevents vectorization
26, Loop is parallelizable
Accelerator kernel generated
24, !$acc loop gang, vector(4) ! blockidx%y threadidx%y
26, !$acc loop gang, vector(64) ! blockidx%x threadidx%x
CC 1.3 : 19 registers; 40 shared, 12 constant, 0 local memory bytes
CC 2.0 : 26 registers; 0 shared, 56 constant, 0 local memory bytes
38, Accelerator kernel generated
38, CC 1.3 : 30 registers; 32 shared, 32 constant, 0 local memory bytes
CC 2.0 : 30 registers; 0 shared, 80 constant, 0 local memory bytes
39, !$acc loop gang ! blockidx%x
40, !$acc loop vector(256) ! threadidx%x
38, Generating present_or_create(c(:,:))
Generating compute capability 1.3 binary
Generating compute capability 2.0 binary
40, Loop is parallelizable
$ aprun ./mmF90_acc
16777216 16777216 2.280605 3.387837
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment