jefflarkin · August 10, 2023 15:34 · jdenny-ornl · Aug 10, 2023
diff --git a/00-intro.md b/00-intro.md
diff --git a/01-async-deallocation.f90 b/01-async-deallocation.f90
 !> Add 1.0 to every element of A by way of a device-resident scratch array B.
 !!
 !! This is a deliberately unsafe illustration: every OpenACC operation below
 !! is queued on async queue 1 and the routine returns without waiting on it.
 !!
 !! A : array updated in place; the present clause on the loop requires it to
 !!     already be on the device when this routine is called.
 !! N : number of elements in A (and in the local scratch array B).
 subroutine work(A, N)
  implicit none
  integer, intent(in) :: N
  real, dimension(N), intent(inout) :: A
  real, dimension(N) :: B   ! automatic array: its host storage ends at return
  integer :: i

  ! The device copy of B is created here, on queue 1.
  !$acc enter data create(B(:)) async(1)

  ! Fill B with ones.
  !$acc kernels async(1)
  B(:) = 1.0
  !$acc end kernels

  ! Both arrays must already be present; B was created by the enter data above.
  !$acc parallel loop present(A(1:N),B(1:N)) async(1)
  do i = 1, N
    A(i) = A(i) + B(i)
  end do

  !$acc exit data delete(B) async(1)

  ! No synchronization here, so B is immediately deallocated on the host
  ! and (presumably) removed from the present table, deallocating it on
  ! the device too. If the implementation tracks properly though, maybe
  ! the deallocation is delayed or they're using stream-ordered memory
  ! allocation and freeing.
  ! Waiting on queue 1 before returning would remove the hazard; the wait
  ! is left out on purpose because the hazard is what this example shows.
 end subroutine work
diff --git a/02-async-deallocation-commentary.md b/02-async-deallocation-commentary.md
diff --git a/03-async-with-stack-variables.md b/03-async-with-stack-variables.md
diff --git a/04-async-with-stack-variable.cpp b/04-async-with-stack-variable.cpp
 /**
  * Deliberately unsafe illustration of an asynchronous region that uses a
  * stack variable.
  *
  * The parallel loop is launched with `async` and the function returns
  * without any synchronization, so `filter` goes out of scope while the
  * region that names it in a data clause may still be pending.
  *
  * @param input  array of N doubles, named in the copy clause of the region
  * @param N      number of elements in input
  */
 void do_stufF_async(double *input, int N)
 {
   // Assume filter is too large to use firstprivate
   double filter[3] = { -1, 0, 1 };

   // The copyin clause must name the local array `filter`; the earlier
   // spelling `filer` referred to an identifier that does not exist.
   #pragma acc parallel loop copyin(filter[0:3]) copy(input[0:N]) async
   for ( int i = 0; i < N; i++ )
   {
     // apply filter
   }

   // no synchronization (intentional: this is the hazard being shown)
 } // filter no longer exists and stack address may be reused
diff --git a/05-async-with-stack-variable-discussion.md b/05-async-with-stack-variable-discussion.md
	!> Add 1.0 to every element of A by way of a device-resident scratch array B.
	!!
	!! This is a deliberately unsafe illustration: every OpenACC operation below
	!! is queued on async queue 1 and the routine returns without waiting on it.
	!!
	!! A : array updated in place; the present clause on the loop requires it to
	!!     already be on the device when this routine is called.
	!! N : number of elements in A (and in the local scratch array B).
	subroutine work(A, N)
	  implicit none
	  integer, intent(in) :: N
	  real, dimension(N), intent(inout) :: A
	  real, dimension(N) :: B   ! automatic array: its host storage ends at return
	  integer :: i

	  ! The device copy of B is created here, on queue 1.
	  !$acc enter data create(B(:)) async(1)

	  ! Fill B with ones.
	  !$acc kernels async(1)
	  B(:) = 1.0
	  !$acc end kernels

	  ! Both arrays must already be present; B was created by the enter data above.
	  !$acc parallel loop present(A(1:N),B(1:N)) async(1)
	  do i = 1, N
	    A(i) = A(i) + B(i)
	  end do

	  !$acc exit data delete(B) async(1)

	  ! No synchronization here, so B is immediately deallocated on the host
	  ! and (presumably) removed from the present table, deallocating it on
	  ! the device too. If the implementation tracks properly though, maybe
	  ! the deallocation is delayed or they're using stream-ordered memory
	  ! allocation and freeing.
	  ! Waiting on queue 1 before returning would remove the hazard; the wait
	  ! is left out on purpose because the hazard is what this example shows.
	end subroutine work
	/**
	 * Deliberately unsafe illustration of an asynchronous region that uses a
	 * stack variable.
	 *
	 * The parallel loop is launched with `async` and the function returns
	 * without any synchronization, so `filter` goes out of scope while the
	 * region that names it in a data clause may still be pending.
	 *
	 * @param input  array of N doubles, named in the copy clause of the region
	 * @param N      number of elements in input
	 */
	void do_stufF_async(double *input, int N)
	{
	  // Assume filter is too large to use firstprivate
	  double filter[3] = { -1, 0, 1 };

	  // The copyin clause must name the local array `filter`; the earlier
	  // spelling `filer` referred to an identifier that does not exist.
	  #pragma acc parallel loop copyin(filter[0:3]) copy(input[0:N]) async
	  for ( int i = 0; i < N; i++ )
	  {
	    // apply filter
	  }

	  // no synchronization (intentional: this is the hazard being shown)
	} // filter no longer exists and stack address may be reused