ivan-pi · March 4, 2025 22:18
diff --git a/abstraction_penalty.F90 b/abstraction_penalty.F90
 ! abstraction_penalty.F90 --
 !     A Fortran version of the Stepanov Abstraction Penalty Benchmark
 !
 ! The aim of this benchmark is to verify how efficiently
 ! compilers can deal with Fortran array semantics and data abstraction.
 !
 ! The structure of the benchmark is quite simple. It adds 2000
 ! doubles in an array 25000 times. Since arrays are first-class citizens
 ! in Fortran, it is expected that a compiler can simply create the
 ! an array descriptor without the creation of temporary array copies
 !
 ! 0 - uses a simple loop and assumed-size arrays (F77-style)
 ! 1-9 use assumed-shape arrays of doubles
 ! 10-13 use assumed-shape arrays of type(dd) - double wrapped in derived type
 ! 14-15 use assumed-shape arrays of type(ddd) - type(dd) wrapped in a derived type
 ! 16-21 use arrays of type(ddi) - a child type of type(dd)
 ! 22-23 use pointer arrays of class(dd)
 !
 ! Some of the variants use pointer arrays, some use component selection,
 ! and some use both. Note that within extended types, the parent is also
 ! a component.
 !
 ! By default, the overloaded sums are computing using a naive sum algorithm.
 ! This can be modified using preprocessor options:
 !
 !   -DUSE_INTRINSIC_SUM            use the built-in sum() function
 !   -DUSE_INTRINSIC_REDUCE         use the builtin reduce() function
 !   -DUSE_STRUCTURE_CONSTRUCTOR    use of the structure constructor
 !                                  for the addition result
 !
 ! The test result is only meaningful if the compiler didn't create a
 ! contiguous array copy. To check this use the flag
 !
 !   -DCHECK_ARRAY_TEMP             check if temporary arrays are used
 !
 ! When checking array temporaries it is recommended to cross-check with
 ! the built-in compiler options such as,
 !
 !   * -fcheck=array-temps (gfortran)
 !   * -check arg_temp_created (ifx/ifort)
 !
 module dd_mod

    use, intrinsic :: iso_c_binding, only: c_double
    implicit none

    integer, parameter :: dp = c_double

    ! Double wrapper
    type :: dd
        real(dp) :: val
    end type

    ! Double wrapper child with TBP
    type, extends(dd) :: ddi
    contains
        procedure :: get => get_ddi_val
    end type

    ! Double wrapper wrapper
    type :: ddd
        type(dd) :: val
    end type

    ! Double wrapper wrapper child with TBP
    type, extends(ddd) :: dddi
    contains
        procedure :: get => get_dddi_val
    end type

    interface operator(+)
        module procedure dd_add
        module procedure ddd_add
    end interface

 #if defined(_OPENMP) && _OPENMP >= 201811
 #define USE_OMP_USER_DEFINED_REDUCTIONS 1
 #else
 #define USE_OMP_USER_DEFINED_REDUCTIONS 0
 #endif

 #if USE_OMP_USER_DEFINED_REDUCTION
 !$omp declare reduction(+:dd,ddd:omp_out=omp_out+omp_in)
 #endif

    interface dsum
        module procedure dp_sum
        module procedure dd_sum
        module procedure ddd_sum
    end interface

 #if defined(USE_INTRINSIC_SUM) && defined(USE_INTRINSIC_REDUCE)
 #error "Use either sum or reduce, not both!"
 #endif

    interface operator(/=)
        module procedure dd_neq
        module procedure ddd_neq
    end interface

 contains

    subroutine sum_info
 #if USE_INTRINSIC_SUM
        print '(A)', "[info] using intrinsic sum"
 #elif USE_INTRINSIC_REDUCE
        print '(A)', "[info] using intrinsic reduce"
 #else
 #if USE_OMP_USER_DEFINED_REDUCTIONS
        print '(A)', "[info] using naive sum with OpenMP user-defined reduction"
 #else
        print '(A)', "[info] using naive sum"
 #endif
 #if USE_STRUCTURE_CONSTRUCTOR
        print '(A)', "[info] using structure constructor within addition functions"
 #endif
 #endif
    end subroutine

    pure function dp_add(a,b) result(c)
        real(dp), intent(in) :: a, b
        real(dp) :: c
        c = a + b
    end function

    pure function dp_sum(a) result(res)
        real(dp), intent(in) :: a(:)
        real(dp) :: res
 #if USE_INTRINSIC_SUM
        res = sum(a)
 #elif USE_INTRINSIC_REDUCE
        res = reduce(a,dp_add)
 #else
        integer :: i
        res = 0.0_dp
        !$omp simd reduction(+: res)
        do i = 1, size(a)
            res = res + a(i)
        end do
 #endif
    end function

    pure function dd_add(a,b) result(c)
        type(dd), intent(in) :: a, b
        type(dd) :: c
 #if USE_STRUCTURE_CONSTRUCTOR
        c = dd(a%val+b%val)
 #else
        c%val = a%val + b%val
 #endif
    end function

    pure function dd_sum(a) result(res)
        type(dd), intent(in) :: a(:)
        type(dd) :: res
 #if USE_INTRINSIC_SUM
 #if USE_STRUCTURE_CONSTRUCTOR
        res = dd(sum(a%val))
 #else
        res%val = sum(a%val)
 #endif
 #elif USE_INTRINSIC_REDUCE
        res = reduce(a,dd_add)
 #else
        integer :: i
        res = dd(0.0_dp)
 #if USE_OMP_USER_DEFINED_REDUCTIONS
        !$omp simd reduction(+: res)
 #endif
        do i = 1, size(a)
            res = res + a(i)
        end do
 #endif
    end function

    elemental function dd_neq(a,b) result(res)
        type(dd), intent(in) :: a
        real(dp), intent(in) :: b
        logical :: res
        res = a%val /= b
    end function

 ! --------

    pure function ddd_add(a,b) result(c)
        type(ddd), intent(in) :: a, b
        type(ddd) :: c
 #if USE_STRUCTURE_CONSTRUCTOR
        c = ddd(dd(a%val%val + b%val%val))
 #else
        c%val%val = a%val%val + b%val%val
 #endif
    end function

    pure function ddd_sum(a) result(res)
        type(ddd), intent(in) :: a(:)
        type(ddd) :: res
 #if USE_INTRINSIC_SUM
 #if USE_STRUCTURE_CONSTRUCTOR
        res = ddd(dd(sum(a%val%val)))
 #else
        res%val%val = sum(a%val%val)
 #endif
 #elif USE_INTRINSIC_REDUCE
        res = reduce(a,ddd_add)
 #else
        integer :: i
        res = ddd(dd(0.0_dp))
 #if USE_OMP_USER_DEFINED_REDUCTIONS
        !$omp simd reduction(+:res)
 #endif
        do i = 1, size(a)
            res = res + a(i)
        end do
 #endif
    end function

    elemental function ddd_neq(a,b) result(res)
        type(ddd), intent(in) :: a
        real(dp), intent(in) :: b
        logical :: res
        res = a%val%val /= b
    end function

 ! We add some trivial getter procedures to
 ! the child type, just to see if it changes anything

    pure function get_ddi_val(self) result(res)
        class(ddi), intent(in) :: self
        real(dp) :: res
        res = self%val
    end function
    pure function get_dddi_val(self) result(res)
        class(dddi), intent(in) :: self
        real(dp) :: res
        res = self%val%val
    end function

 end module

 program abstraction_penalty

    use, intrinsic :: iso_c_binding, only: c_loc, c_ptr, c_intptr_t
    use, intrinsic :: iso_fortran_env, only: ik => int64, &
        compiler_version, compiler_options

    use dd_mod

    implicit none

 #define NITER 25000
    integer :: niter = NITER
    character(32) :: str

    integer, parameter :: sz = 2000

    ! With 2000 elements, the size is roughly 16 KB
    ! and fits within the cache
    real(dp),  target :: data(sz)
    type(dd),  target :: dd_data(sz)
    type(ddd), target :: ddd_data(sz)
    type(ddi), target :: ddi_data(sz)
    type(dddi), target :: dddi_data(sz)

    real(dp),  pointer :: ptr(:)
    type(dd),  pointer :: dd_ptr(:)
    type(ddd), pointer :: ddd_ptr(:)
    type(ddi), pointer :: ddi_ptr(:)
    type(dddi), pointer :: dddi_ptr(:)

    class(dd), pointer :: dd_poly_ptr(:)
    class(ddd), pointer :: ddd_poly_ptr(:)

    interface test
        procedure :: test_dp
        procedure :: test_dd
        procedure :: test_ddd
    end interface

    integer, parameter :: TEST_SZ = 40
    real(dp) :: result_times(0:TEST_SZ-1)
    logical :: array_temp(0:TEST_SZ-1)
    integer :: k, i, itest = 0

    real(dp), parameter :: init_value = 3.0_dp
    type(c_ptr) :: data_loc

    ! Assume no array temporaries are used
    array_temp = .false.

    if (command_argument_count() > 0) then
        call get_command_argument(1,str)
        read(str,*) niter
    end if

    print '(A)', "[info] compiler: "//compiler_version()
    print '(A)', "[info] compiler options: "//compiler_options()

 #if CHECK_ARRAY_TEMP
    print '(A)', "[info] temporary array checking is ON"
 #endif

    ! Information about the summation procedure
    call sum_info

    print '(A,I0)', "[info] number of iterations: ", niter

    data = init_value

    ! Initialize via structure constructor (elemental)
    dd_data = dd(init_value)
    ddd_data = ddd(dd(init_value))
    ddi_data = ddi(init_value)
    dddi_data = dddi(dd(init_value))

 !    call test0(size(data),data)         ! Cache warm-up

    itest = 0
    call test0(size(data),data)         ! #0

 ! For temporary array checking we should
 ! pass a pointer to the first element

 #if CHECK_ARRAY_TEMP
 #define PTR(a) data_loc = c_loc(a)
 #else
 #define PTR(a)
 #endif

 #define PTR_AND_TEST(a1,array) \
    PTR(a1); \
    call test(array)

    PTR_AND_TEST(data(1),data) ! #1

 ! --- Component selection

    PTR_AND_TEST(dd_data(1),dd_data%val)        ! #2
    PTR_AND_TEST(ddd_data(1),ddd_data%val%val)   ! #3

 ! --- Pointers (4-6)

    ptr => data
    PTR_AND_TEST(ptr(1),ptr)
    ptr => dd_data%val
    PTR_AND_TEST(ptr(1),ptr)
    ptr => ddd_data%val%val
    PTR_AND_TEST(ptr(1),ptr)

 ! --- Pointers and component selection (7-9)

    dd_ptr => dd_data
    PTR_AND_TEST(dd_ptr(1),dd_ptr%val)
    dd_ptr => ddd_data%val
    PTR_AND_TEST(dd_ptr(1),dd_ptr%val)
    ddd_ptr => ddd_data
    PTR_AND_TEST(ddd_ptr(1),ddd_ptr%val%val)

 ! --- Single composition (10-13)

    PTR_AND_TEST(dd_data(1),dd_data)
    PTR_AND_TEST(ddd_data(1),ddd_data%val)

    dd_ptr => dd_data
    PTR_AND_TEST(dd_ptr(1),dd_ptr)
    dd_ptr => ddd_data%val
    PTR_AND_TEST(dd_ptr(1),dd_ptr)

 ! --- Double composition (14-15)

    PTR_AND_TEST(ddd_data(1),ddd_data)

    ddd_ptr => ddd_data
    PTR_AND_TEST(ddd_data(1),ddd_ptr)

 ! --- Child type of dd (16-18,19-21)

    PTR_AND_TEST(ddi_data(1),ddi_data%val)
    PTR_AND_TEST(ddi_data(1),ddi_data%dd%val)
    PTR_AND_TEST(ddi_data(1),ddi_data%dd)

    ddi_ptr => ddi_data
    PTR_AND_TEST(ddi_ptr(1),ddi_ptr%val)
    PTR_AND_TEST(ddi_ptr(1),ddi_ptr%dd%val)
    PTR_AND_TEST(ddi_ptr(1),ddi_ptr%dd)

 ! --- Polymorphic variables (22-23)

    do k = 1, 2
        if (k == 1) dd_poly_ptr => dd_data
        if (k == 2) dd_poly_ptr => ddi_data
        select type(dd_poly_ptr)
        type is (dd)
            PTR_AND_TEST(dd_poly_ptr(1),dd_poly_ptr)
        type is (ddi)
            PTR_AND_TEST(dd_poly_ptr(1),dd_poly_ptr%dd)
        class default
            error stop "[error] Unexpected type."
        end select
    end do

 ! --- Child type of ddd (24-27)

    PTR_AND_TEST(dddi_data(1),dddi_data%val%val)
    PTR_AND_TEST(dddi_data(1),dddi_data%ddd%val%val)

    dddi_ptr => dddi_data
    PTR_AND_TEST(dddi_ptr(1),dddi_ptr%val%val)
    PTR_AND_TEST(dddi_ptr(1),dddi_ptr%ddd%val%val)

 ! --- Polymorphic variables (28-29)

    do k = 1, 2
        if (k == 1) ddd_poly_ptr => ddd_data
        if (k == 2) ddd_poly_ptr => dddi_data
        select type(ddd_poly_ptr)
        type is (ddd)
            PTR_AND_TEST(ddd_poly_ptr(1),ddd_poly_ptr)
        type is (dddi)
            PTR_AND_TEST(ddd_poly_ptr(1),ddd_poly_ptr%ddd)
        class default
            error stop "[error] Unexpected type."
        end select
    end do

 ! *********************

 #if CHECK_ARRAY_TEMP
    if (any(array_temp)) then
        print '(A,I0,"/",I0,A)', &
            "[warn] potential use of temporary arrays in ", &
            count(array_temp), itest, " tests"
        print '(A,*(G0,:,", "))', "[warn] affected tests: ", &
            pack([integer :: (i,i=0,itest-1)],mask=array_temp(0:itest-1))
    end if
 #endif

    call summarize

 contains

    !> The reference case is the plain old Fortran case
    impure subroutine test0(n,array)
        integer, intent(in) :: n
        real(dp), intent(in) :: array(*)
        integer(ik) :: tstart, tend, clock_rate
        real(dp) :: res
        integer :: i, j
        call system_clock(tstart)
        do i = 1, niter
 #if USE_INTRINSIC_SUM
            res = sum(array(1:n))
 #elif USE_INTRINSIC_REDUCE
            res = reduce(array(1:n),dp_add)
 #else
            res = 0.0_dp
            do j = 1, n
                res = res + array(j)
            end do
 #endif
            if (res /= sz*init_value) then
                print '("test ",I0," failed")', itest
            end if
        end do
        call system_clock(tend,clock_rate)
        result_times(itest) = real(tend - tstart,dp)/real(clock_rate,dp)
        itest = itest + 1
    end subroutine

 #if CHECK_ARRAY_TEMP
    logical function check_loc(smth)
        type(*), intent(in), target :: smth(:)
        check_loc = transfer(data_loc,1_c_intptr_t) /= &
            transfer(c_loc(smth),1_c_intptr_t)
    end function
 #endif

    !> Check summation as an assumed shape array argument
    !>   N.b. some compilers create temporary arrays; in that
    !>   case the test is meaningless
    impure subroutine test_dp(array)
        real(dp), intent(in) :: array(:)
        integer(ik) :: tstart, tend, clock_rate
        real(dp) :: res
        integer :: i
 #if CHECK_ARRAY_TEMP
        TARGET :: array
        array_temp(itest) = check_loc(array)
 #endif
        call system_clock(tstart)
        do i = 1, niter
            res = sum(array)
            if (res /= sz*init_value) then
                print '("test ",I0," failed")', itest
            end if
        end do
        call system_clock(tend,clock_rate)
        result_times(itest) = real(tend - tstart,dp)/real(clock_rate,dp)
        itest = itest + 1
    end subroutine

    impure subroutine test_dd(array)
        type(dd), intent(in) :: array(:)
        integer(ik) :: tstart, tend, clock_rate
        type(dd) :: res
        integer :: i
 #if CHECK_ARRAY_TEMP
        TARGET :: array
        array_temp(itest) = check_loc(array)
 #endif
        call system_clock(tstart)
        do i = 1, niter
            res = dsum(array)
            if (res /= sz*init_value) then
                print '("test ",I0," failed")', itest
            end if
        end do
        call system_clock(tend,clock_rate)
        result_times(itest) = real(tend - tstart,dp)/real(clock_rate,dp)
        itest = itest + 1
    end subroutine

    impure subroutine test_ddd(array)
        type(ddd), intent(in) :: array(:)
        integer(ik) :: tstart, tend, clock_rate
        type(ddd) :: res
        integer :: i
 #if CHECK_ARRAY_TEMP
        TARGET :: array
        array_temp(itest) = check_loc(array)
 #endif
        call system_clock(tstart)
        do i = 1, niter
            res = dsum(array)
            if (res /= sz*init_value) then
                print '("test ",I0," failed")', itest
            end if
        end do
        call system_clock(tend,clock_rate)
        result_times(itest) = real(tend - tstart,dp)/real(clock_rate,dp)
        itest = itest + 1
    end subroutine

    subroutine summarize()

        real(dp) :: millions, total_time, gmean_times, gmean_rate, gmean_ratio
        real(dp), allocatable :: times(:)
        integer :: i, ntest

        if (itest > TEST_SZ-1) error stop "[error] exceeded result table size"

        write(*,'(/,4A12)') "test",   "absolute",   "additions",  "ratio with"
        write(*,'(4A12,/)') "number", "time (sec)", "per second", "test0"

        millions = real(sz,dp) * niter / 1.0e6_dp

        do i = 0, itest-1
 #if CHECK_ARRAY_TEMP
            if (array_temp(i)) cycle
 #endif
            write(*,'(I12, F12.4, ES12.3, F12.3)') &
                i, result_times(i), millions/result_times(i), &
                result_times(i)/result_times(0)
        end do

 #if CHECK_ARRAY_TEMP
        times = pack(result_times(0:itest-1), &
                    mask=.not.array_temp(0:itest-1))
 #else
        times = result_times(0:itest-1)
 #endif
        ntest = size(times)

        total_time = sum(times)
        gmean_times = sum(log(times))
        gmean_rate = sum(log(millions/times))
        gmean_ratio = sum(log(times/times(1)))

        write(*, '(A)') repeat('-',4*12)
        write(*, '(A12, F12.4, ES12.3, F12.2)') "mean", &
            exp(gmean_times/ntest), &
            exp(gmean_rate/ntest), &
            exp(gmean_ratio/ntest)

        write(*,'(/,"Total time: ", F0.2, " sec")') total_time
        write(*,'("Abstraction Penalty: ", F0.2, /)') exp(gmean_ratio/ntest)

    end subroutine

 end program
	! abstraction_penalty.F90 --
	! A Fortran version of the Stepanov Abstraction Penalty Benchmark
	!
	! The aim of this benchmark is to verify how efficiently
	! compilers can deal with Fortran array semantics and data abstraction.
	!
	! The structure of the benchmark is quite simple. It adds 2000
	! doubles in an array 25000 times. Since arrays are first-class citizens
	! in Fortran, it is expected that a compiler can simply create the
	! an array descriptor without the creation of temporary array copies
	!
	! 0 - uses a simple loop and assumed-size arrays (F77-style)
	! 1-9 use assumed-shape arrays of doubles
	! 10-13 use assumed-shape arrays of type(dd) - double wrapped in derived type
	! 14-15 use assumed-shape arrays of type(ddd) - type(dd) wrapped in a derived type
	! 16-21 use arrays of type(ddi) - a child type of type(dd)
	! 22-23 use pointer arrays of class(dd)
	!
	! Some of the variants use pointer arrays, some use component selection,
	! and some use both. Note that within extended types, the parent is also
	! a component.
	!
	! By default, the overloaded sums are computing using a naive sum algorithm.
	! This can be modified using preprocessor options:
	!
	! -DUSE_INTRINSIC_SUM use the built-in sum() function
	! -DUSE_INTRINSIC_REDUCE use the builtin reduce() function
	! -DUSE_STRUCTURE_CONSTRUCTOR use of the structure constructor
	! for the addition result
	!
	! The test result is only meaningful if the compiler didn't create a
	! contiguous array copy. To check this use the flag
	!
	! -DCHECK_ARRAY_TEMP check if temporary arrays are used
	!
	! When checking array temporaries it is recommended to cross-check with
	! the built-in compiler options such as,
	!
	! * -fcheck=array-temps (gfortran)
	! * -check arg_temp_created (ifx/ifort)
	!
	module dd_mod

	use, intrinsic :: iso_c_binding, only: c_double
	implicit none

	integer, parameter :: dp = c_double

	! Double wrapper
	type :: dd
	real(dp) :: val
	end type

	! Double wrapper child with TBP
	type, extends(dd) :: ddi
	contains
	procedure :: get => get_ddi_val
	end type

	! Double wrapper wrapper
	type :: ddd
	type(dd) :: val
	end type

	! Double wrapper wrapper child with TBP
	type, extends(ddd) :: dddi
	contains
	procedure :: get => get_dddi_val
	end type

	interface operator(+)
	module procedure dd_add
	module procedure ddd_add
	end interface

	#if defined(_OPENMP) && _OPENMP >= 201811
	#define USE_OMP_USER_DEFINED_REDUCTIONS 1
	#else
	#define USE_OMP_USER_DEFINED_REDUCTIONS 0
	#endif

	#if USE_OMP_USER_DEFINED_REDUCTION
	!$omp declare reduction(+:dd,ddd:omp_out=omp_out+omp_in)
	#endif

	interface dsum
	module procedure dp_sum
	module procedure dd_sum
	module procedure ddd_sum
	end interface

	#if defined(USE_INTRINSIC_SUM) && defined(USE_INTRINSIC_REDUCE)
	#error "Use either sum or reduce, not both!"
	#endif

	interface operator(/=)
	module procedure dd_neq
	module procedure ddd_neq
	end interface

	contains

	subroutine sum_info
	#if USE_INTRINSIC_SUM
	print '(A)', "[info] using intrinsic sum"
	#elif USE_INTRINSIC_REDUCE
	print '(A)', "[info] using intrinsic reduce"
	#else
	#if USE_OMP_USER_DEFINED_REDUCTIONS
	print '(A)', "[info] using naive sum with OpenMP user-defined reduction"
	#else
	print '(A)', "[info] using naive sum"
	#endif
	#if USE_STRUCTURE_CONSTRUCTOR
	print '(A)', "[info] using structure constructor within addition functions"
	#endif
	#endif
	end subroutine

	pure function dp_add(a,b) result(c)
	real(dp), intent(in) :: a, b
	real(dp) :: c
	c = a + b
	end function

	pure function dp_sum(a) result(res)
	real(dp), intent(in) :: a(:)
	real(dp) :: res
	#if USE_INTRINSIC_SUM
	res = sum(a)
	#elif USE_INTRINSIC_REDUCE
	res = reduce(a,dp_add)
	#else
	integer :: i
	res = 0.0_dp
	!$omp simd reduction(+: res)
	do i = 1, size(a)
	res = res + a(i)
	end do
	#endif
	end function

	pure function dd_add(a,b) result(c)
	type(dd), intent(in) :: a, b
	type(dd) :: c
	#if USE_STRUCTURE_CONSTRUCTOR
	c = dd(a%val+b%val)
	#else
	c%val = a%val + b%val
	#endif
	end function

	pure function dd_sum(a) result(res)
	type(dd), intent(in) :: a(:)
	type(dd) :: res
	#if USE_INTRINSIC_SUM
	#if USE_STRUCTURE_CONSTRUCTOR
	res = dd(sum(a%val))
	#else
	res%val = sum(a%val)
	#endif
	#elif USE_INTRINSIC_REDUCE
	res = reduce(a,dd_add)
	#else
	integer :: i
	res = dd(0.0_dp)
	#if USE_OMP_USER_DEFINED_REDUCTIONS
	!$omp simd reduction(+: res)
	#endif
	do i = 1, size(a)
	res = res + a(i)
	end do
	#endif
	end function

	elemental function dd_neq(a,b) result(res)
	type(dd), intent(in) :: a
	real(dp), intent(in) :: b
	logical :: res
	res = a%val /= b
	end function

	! --------

	pure function ddd_add(a,b) result(c)
	type(ddd), intent(in) :: a, b
	type(ddd) :: c
	#if USE_STRUCTURE_CONSTRUCTOR
	c = ddd(dd(a%val%val + b%val%val))
	#else
	c%val%val = a%val%val + b%val%val
	#endif
	end function

	pure function ddd_sum(a) result(res)
	type(ddd), intent(in) :: a(:)
	type(ddd) :: res
	#if USE_INTRINSIC_SUM
	#if USE_STRUCTURE_CONSTRUCTOR
	res = ddd(dd(sum(a%val%val)))
	#else
	res%val%val = sum(a%val%val)
	#endif
	#elif USE_INTRINSIC_REDUCE
	res = reduce(a,ddd_add)
	#else
	integer :: i
	res = ddd(dd(0.0_dp))
	#if USE_OMP_USER_DEFINED_REDUCTIONS
	!$omp simd reduction(+:res)
	#endif
	do i = 1, size(a)
	res = res + a(i)
	end do
	#endif
	end function

	elemental function ddd_neq(a,b) result(res)
	type(ddd), intent(in) :: a
	real(dp), intent(in) :: b
	logical :: res
	res = a%val%val /= b
	end function

	! We add some trivial getter procedures to
	! the child type, just to see if it changes anything

	pure function get_ddi_val(self) result(res)
	class(ddi), intent(in) :: self
	real(dp) :: res
	res = self%val
	end function
	pure function get_dddi_val(self) result(res)
	class(dddi), intent(in) :: self
	real(dp) :: res
	res = self%val%val
	end function

	end module

	program abstraction_penalty

	use, intrinsic :: iso_c_binding, only: c_loc, c_ptr, c_intptr_t
	use, intrinsic :: iso_fortran_env, only: ik => int64, &
	compiler_version, compiler_options

	use dd_mod

	implicit none

	#define NITER 25000
	integer :: niter = NITER
	character(32) :: str

	integer, parameter :: sz = 2000

	! With 2000 elements, the size is roughly 16 KB
	! and fits within the cache
	real(dp), target :: data(sz)
	type(dd), target :: dd_data(sz)
	type(ddd), target :: ddd_data(sz)
	type(ddi), target :: ddi_data(sz)
	type(dddi), target :: dddi_data(sz)

	real(dp), pointer :: ptr(:)
	type(dd), pointer :: dd_ptr(:)
	type(ddd), pointer :: ddd_ptr(:)
	type(ddi), pointer :: ddi_ptr(:)
	type(dddi), pointer :: dddi_ptr(:)

	class(dd), pointer :: dd_poly_ptr(:)
	class(ddd), pointer :: ddd_poly_ptr(:)

	interface test
	procedure :: test_dp
	procedure :: test_dd
	procedure :: test_ddd
	end interface

	integer, parameter :: TEST_SZ = 40
	real(dp) :: result_times(0:TEST_SZ-1)
	logical :: array_temp(0:TEST_SZ-1)
	integer :: k, i, itest = 0

	real(dp), parameter :: init_value = 3.0_dp
	type(c_ptr) :: data_loc

	! Assume no array temporaries are used
	array_temp = .false.

	if (command_argument_count() > 0) then
	call get_command_argument(1,str)
	read(str,*) niter
	end if

	print '(A)', "[info] compiler: "//compiler_version()
	print '(A)', "[info] compiler options: "//compiler_options()

	#if CHECK_ARRAY_TEMP
	print '(A)', "[info] temporary array checking is ON"
	#endif

	! Information about the summation procedure
	call sum_info

	print '(A,I0)', "[info] number of iterations: ", niter

	data = init_value

	! Initialize via structure constructor (elemental)
	dd_data = dd(init_value)
	ddd_data = ddd(dd(init_value))
	ddi_data = ddi(init_value)
	dddi_data = dddi(dd(init_value))

	! call test0(size(data),data) ! Cache warm-up

	itest = 0
	call test0(size(data),data) ! #0

	! For temporary array checking we should
	! pass a pointer to the first element

	#if CHECK_ARRAY_TEMP
	#define PTR(a) data_loc = c_loc(a)
	#else
	#define PTR(a)
	#endif

	#define PTR_AND_TEST(a1,array) \
	PTR(a1); \
	call test(array)

	PTR_AND_TEST(data(1),data) ! #1

	! --- Component selection

	PTR_AND_TEST(dd_data(1),dd_data%val) ! #2
	PTR_AND_TEST(ddd_data(1),ddd_data%val%val) ! #3

	! --- Pointers (4-6)

	ptr => data
	PTR_AND_TEST(ptr(1),ptr)
	ptr => dd_data%val
	PTR_AND_TEST(ptr(1),ptr)
	ptr => ddd_data%val%val
	PTR_AND_TEST(ptr(1),ptr)

	! --- Pointers and component selection (7-9)

	dd_ptr => dd_data
	PTR_AND_TEST(dd_ptr(1),dd_ptr%val)
	dd_ptr => ddd_data%val
	PTR_AND_TEST(dd_ptr(1),dd_ptr%val)
	ddd_ptr => ddd_data
	PTR_AND_TEST(ddd_ptr(1),ddd_ptr%val%val)

	! --- Single composition (10-13)

	PTR_AND_TEST(dd_data(1),dd_data)
	PTR_AND_TEST(ddd_data(1),ddd_data%val)

	dd_ptr => dd_data
	PTR_AND_TEST(dd_ptr(1),dd_ptr)
	dd_ptr => ddd_data%val
	PTR_AND_TEST(dd_ptr(1),dd_ptr)

	! --- Double composition (14-15)

	PTR_AND_TEST(ddd_data(1),ddd_data)

	ddd_ptr => ddd_data
	PTR_AND_TEST(ddd_data(1),ddd_ptr)

	! --- Child type of dd (16-18,19-21)

	PTR_AND_TEST(ddi_data(1),ddi_data%val)
	PTR_AND_TEST(ddi_data(1),ddi_data%dd%val)
	PTR_AND_TEST(ddi_data(1),ddi_data%dd)

	ddi_ptr => ddi_data
	PTR_AND_TEST(ddi_ptr(1),ddi_ptr%val)
	PTR_AND_TEST(ddi_ptr(1),ddi_ptr%dd%val)
	PTR_AND_TEST(ddi_ptr(1),ddi_ptr%dd)

	! --- Polymorphic variables (22-23)

	do k = 1, 2
	if (k == 1) dd_poly_ptr => dd_data
	if (k == 2) dd_poly_ptr => ddi_data
	select type(dd_poly_ptr)
	type is (dd)
	PTR_AND_TEST(dd_poly_ptr(1),dd_poly_ptr)
	type is (ddi)
	PTR_AND_TEST(dd_poly_ptr(1),dd_poly_ptr%dd)
	class default
	error stop "[error] Unexpected type."
	end select
	end do

	! --- Child type of ddd (24-27)

	PTR_AND_TEST(dddi_data(1),dddi_data%val%val)
	PTR_AND_TEST(dddi_data(1),dddi_data%ddd%val%val)

	dddi_ptr => dddi_data
	PTR_AND_TEST(dddi_ptr(1),dddi_ptr%val%val)
	PTR_AND_TEST(dddi_ptr(1),dddi_ptr%ddd%val%val)

	! --- Polymorphic variables (28-29)

	do k = 1, 2
	if (k == 1) ddd_poly_ptr => ddd_data
	if (k == 2) ddd_poly_ptr => dddi_data
	select type(ddd_poly_ptr)
	type is (ddd)
	PTR_AND_TEST(ddd_poly_ptr(1),ddd_poly_ptr)
	type is (dddi)
	PTR_AND_TEST(ddd_poly_ptr(1),ddd_poly_ptr%ddd)
	class default
	error stop "[error] Unexpected type."
	end select
	end do

	! *********************

	#if CHECK_ARRAY_TEMP
	if (any(array_temp)) then
	print '(A,I0,"/",I0,A)', &
	"[warn] potential use of temporary arrays in ", &
	count(array_temp), itest, " tests"
	print '(A,*(G0,:,", "))', "[warn] affected tests: ", &
	pack([integer :: (i,i=0,itest-1)],mask=array_temp(0:itest-1))
	end if
	#endif

	call summarize

	contains

	!> The reference case is the plain old Fortran case
	impure subroutine test0(n,array)
	integer, intent(in) :: n
	real(dp), intent(in) :: array(*)
	integer(ik) :: tstart, tend, clock_rate
	real(dp) :: res
	integer :: i, j
	call system_clock(tstart)
	do i = 1, niter
	#if USE_INTRINSIC_SUM
	res = sum(array(1:n))
	#elif USE_INTRINSIC_REDUCE
	res = reduce(array(1:n),dp_add)
	#else
	res = 0.0_dp
	do j = 1, n
	res = res + array(j)
	end do
	#endif
	if (res /= sz*init_value) then
	print '("test ",I0," failed")', itest
	end if
	end do
	call system_clock(tend,clock_rate)
	result_times(itest) = real(tend - tstart,dp)/real(clock_rate,dp)
	itest = itest + 1
	end subroutine

	#if CHECK_ARRAY_TEMP
	logical function check_loc(smth)
	type(*), intent(in), target :: smth(:)
	check_loc = transfer(data_loc,1_c_intptr_t) /= &
	transfer(c_loc(smth),1_c_intptr_t)
	end function
	#endif

	!> Check summation as an assumed shape array argument
	!> N.b. some compilers create temporary arrays; in that
	!> case the test is meaningless
	impure subroutine test_dp(array)
	real(dp), intent(in) :: array(:)
	integer(ik) :: tstart, tend, clock_rate
	real(dp) :: res
	integer :: i
	#if CHECK_ARRAY_TEMP
	TARGET :: array
	array_temp(itest) = check_loc(array)
	#endif
	call system_clock(tstart)
	do i = 1, niter
	res = sum(array)
	if (res /= sz*init_value) then
	print '("test ",I0," failed")', itest
	end if
	end do
	call system_clock(tend,clock_rate)
	result_times(itest) = real(tend - tstart,dp)/real(clock_rate,dp)
	itest = itest + 1
	end subroutine

	impure subroutine test_dd(array)
	type(dd), intent(in) :: array(:)
	integer(ik) :: tstart, tend, clock_rate
	type(dd) :: res
	integer :: i
	#if CHECK_ARRAY_TEMP
	TARGET :: array
	array_temp(itest) = check_loc(array)
	#endif
	call system_clock(tstart)
	do i = 1, niter
	res = dsum(array)
	if (res /= sz*init_value) then
	print '("test ",I0," failed")', itest
	end if
	end do
	call system_clock(tend,clock_rate)
	result_times(itest) = real(tend - tstart,dp)/real(clock_rate,dp)
	itest = itest + 1
	end subroutine

	impure subroutine test_ddd(array)
	type(ddd), intent(in) :: array(:)
	integer(ik) :: tstart, tend, clock_rate
	type(ddd) :: res
	integer :: i
	#if CHECK_ARRAY_TEMP
	TARGET :: array
	array_temp(itest) = check_loc(array)
	#endif
	call system_clock(tstart)
	do i = 1, niter
	res = dsum(array)
	if (res /= sz*init_value) then
	print '("test ",I0," failed")', itest
	end if
	end do
	call system_clock(tend,clock_rate)
	result_times(itest) = real(tend - tstart,dp)/real(clock_rate,dp)
	itest = itest + 1
	end subroutine

	subroutine summarize()

	real(dp) :: millions, total_time, gmean_times, gmean_rate, gmean_ratio
	real(dp), allocatable :: times(:)
	integer :: i, ntest

	if (itest > TEST_SZ-1) error stop "[error] exceeded result table size"

	write(*,'(/,4A12)') "test", "absolute", "additions", "ratio with"
	write(*,'(4A12,/)') "number", "time (sec)", "per second", "test0"

	millions = real(sz,dp) * niter / 1.0e6_dp

	do i = 0, itest-1
	#if CHECK_ARRAY_TEMP
	if (array_temp(i)) cycle
	#endif
	write(*,'(I12, F12.4, ES12.3, F12.3)') &
	i, result_times(i), millions/result_times(i), &
	result_times(i)/result_times(0)
	end do

	#if CHECK_ARRAY_TEMP
	times = pack(result_times(0:itest-1), &
	mask=.not.array_temp(0:itest-1))
	#else
	times = result_times(0:itest-1)
	#endif
	ntest = size(times)

	total_time = sum(times)
	gmean_times = sum(log(times))
	gmean_rate = sum(log(millions/times))
	gmean_ratio = sum(log(times/times(1)))

	write(, '(A)') repeat('-',412)
	write(*, '(A12, F12.4, ES12.3, F12.2)') "mean", &
	exp(gmean_times/ntest), &
	exp(gmean_rate/ntest), &
	exp(gmean_ratio/ntest)

	write(*,'(/,"Total time: ", F0.2, " sec")') total_time
	write(*,'("Abstraction Penalty: ", F0.2, /)') exp(gmean_ratio/ntest)

	end subroutine

	end program