Created
February 21, 2013 15:07
-
-
Save jefflarkin/5005302 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
!> Dense matrix-multiply benchmark (C = A * B) offloaded with OpenACC directives.
!!
!! Two N x N matrices are filled with A(r,c) = B(r,c) = (r-1) + (c-1) + 1,
!! their product is accumulated into C, and every element of C is divided by
!! the closed-form value it should have; the quotients are summed into chk,
!! so chk should equal N*N when the product is right.
!!
!! Output (one line): int(chk), N*N, seconds spent in the multiply region
!! (t3 - t2), and seconds for the whole data region (t1 - t0).
program mm
  use omp_lib, only: omp_get_wtime
  implicit none

  ! Portable kind parameters instead of the literal kind number 8.
  integer, parameter :: i8 = selected_int_kind(18)
  integer, parameter :: dp = kind(1.0d0)

  ! Matrix order.
  integer(i8), parameter :: N = 4096

  integer(i8) :: i, j, k
  ! sum_sq  = 0^2 + 1^2 + ... + (N-1)^2
  ! sum_lin = 0 + 1 + ... + (N-1)
  integer(i8) :: sum_sq, sum_lin
  real(dp), dimension(N,N) :: A, B, C
  real(dp) :: expected, chk, t0, t1, t2, t3

  t0 = omp_get_wtime()

  ! A, B and C are created on the device for the whole computation; no
  ! update/copy directive moves them between host and device.
  !$acc data create(A,B,C)

  ! Initialisation. The inner loop runs over the first index so that
  ! consecutive iterations touch contiguous (column-major) elements.
  !$acc kernels
  do j = 0, N-1
    do i = 0, N-1
      A(i+1,j+1) = i + j + 1.0_dp
      B(i+1,j+1) = i + j + 1.0_dp
    end do
  end do
  C(:,:) = 0.0_dp
  !$acc end kernels

  t2 = omp_get_wtime()

  ! The multiply itself: C(i,k) = sum over j of A(i,j) * B(j,k).
  ! The j loop carries the accumulation into C(i,k).
  !$acc kernels
  do k = 1, N
    do j = 1, N
      do i = 1, N
        C(i,k) = C(i,k) + A(i,j) * B(j,k)
      end do
    end do
  end do
  !$acc end kernels
  !$acc wait

  t3 = omp_get_wtime()

  ! Closed-form pieces of the expected product. With zero-based r, c, m:
  !   sum_m (r+m+1)*(m+c+1)
  !     = sum_m m**2 + ((r+1)+(c+1)) * sum_m m + (r+1)*(c+1)*N
  sum_sq  = (N * ((2 * N) - 1) * (N - 1)) / 6
  sum_lin = (N * (N - 1)) / 2

  ! Sum C(r,c) / expected(r,c) over the whole matrix.
  chk = 0
  !$acc parallel loop reduction(+:chk)
  do j = 0, N-1
    do i = 0, N-1
      expected = sum_sq &
               + (((i + 1.0_dp) + (j + 1.0_dp)) * sum_lin) &
               + (((i + 1.0_dp) * (j + 1.0_dp)) * N)
      chk = chk + (C(i+1,j+1)/expected)
    end do
  end do
  !$acc end parallel loop

  !$acc end data

  t1 = omp_get_wtime()

  ! The first two fields should print the same number when the product is correct.
  write(*,'(I8," ",I8," ",F10.6," ",F10.6)') int(chk), (N*N), (t3-t2), (t1-t0)
end program mm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
pgf90 -Minfo=all -fast -acc -ta=nvidia -I /opt/nvidia/cudatoolkit/default/include/ -o mmF90_acc mm1_acc.F90
mm:
9, Generating create(c(:,:))
Generating create(b(:,:))
Generating create(a(:,:))
10, Generating present_or_create(c(:,:))
Generating present_or_create(b(:,:))
Generating present_or_create(a(:,:))
Generating compute capability 1.3 binary
Generating compute capability 2.0 binary
11, Loop is parallelizable
12, Loop is parallelizable
Accelerator kernel generated
11, !$acc loop gang, vector(128) ! blockidx%x threadidx%x
12, !$acc loop gang ! blockidx%y
CC 1.3 : 13 registers; 40 shared, 8 constant, 0 local memory bytes
CC 2.0 : 19 registers; 0 shared, 56 constant, 0 local memory bytes
23, Generating present_or_create(c(:,:))
Generating present_or_create(a(:,:))
Generating present_or_create(b(:,:))
Generating compute capability 1.3 binary
Generating compute capability 2.0 binary
24, Loop is parallelizable
25, Loop carried dependence of 'c' prevents parallelization
Loop carried backward dependence of 'c' prevents vectorization
26, Loop is parallelizable
Accelerator kernel generated
24, !$acc loop gang, vector(4) ! blockidx%y threadidx%y
26, !$acc loop gang, vector(64) ! blockidx%x threadidx%x
CC 1.3 : 19 registers; 40 shared, 12 constant, 0 local memory bytes
CC 2.0 : 26 registers; 0 shared, 56 constant, 0 local memory bytes
38, Accelerator kernel generated
38, CC 1.3 : 30 registers; 32 shared, 32 constant, 0 local memory bytes
CC 2.0 : 30 registers; 0 shared, 80 constant, 0 local memory bytes
39, !$acc loop gang ! blockidx%x
40, !$acc loop vector(256) ! threadidx%x
38, Generating present_or_create(c(:,:))
Generating compute capability 1.3 binary
Generating compute capability 2.0 binary
40, Loop is parallelizable
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ aprun ./mmF90_acc
16777216 16777216 2.280605 3.387837
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment