Perform B <- alpha * A^T + B and B <- A^T in Fortran in a sub-optimal way with OpenMP parallelization. This implementation is 3-10 times faster than a naive implementation built on BLAS-1 routines. A minimal driver sketch follows the code below.
!>
!> @brief Perform B <- alpha * A^T + B
!>
!> @note OpenMP parallelization is used for faster matrix transposing.
!>       The speed-up is about 3-10 times in comparison
!>       with the original implementation via BLAS-1 routines.
!>
!> @param[in]    n      size of the square matrices
!> @param[in]    alpha  scaling factor
!> @param[in]    mat    n-by-n matrix A
!> @param[inout] newmat n-by-n updated matrix B
!>
      subroutine tradd(n,alpha,mat,newmat)
        integer, intent(in) :: n
        real(8), intent(in) :: alpha
        real(8), intent(in) :: mat(n,n)
        real(8), intent(inout) :: newmat(n,n)
        !
        integer :: i
        !
        if (n <= 768) then
          ! small matrices: update row i of B from column i of A
C$OMP PARALLEL DO DEFAULT(NONE)
C$OMP& SHARED(alpha, mat, newmat, n) PRIVATE(i)
C$OMP& schedule(guided, 16) if(n > 32)
          do i = 1, n
            newmat(i,:) = alpha * mat(:,i) + newmat(i,:)
          end do
C$OMP END PARALLEL DO
        else
          ! large matrices: cache-blocked transpose through a
          ! thread-private block_size-by-block_size buffer
          block
            integer, parameter :: block_size = 128
            real(8) :: tmp(block_size, block_size)
            integer :: j, k, l, imax, jmax, iel, jel
C$OMP PARALLEL DO DEFAULT(NONE)
C$OMP& SHARED(alpha, mat, newmat, n)
C$OMP& PRIVATE(i, j, k, l, imax, jmax, iel, jel, tmp)
C$OMP& schedule(guided, 8) collapse(2)
            do i = 1, n, block_size
              do j = 1, n, block_size
                ! clip the block at the matrix boundary
                imax = min(i + block_size - 1, n)
                jmax = min(j + block_size - 1, n)
                iel = imax - i + 1
                jel = jmax - j + 1
                tmp(1:iel,1:jel) = mat(i:imax,j:jmax)
                do k = 1, iel
                  do l = 1, jel
                    newmat(j + l - 1, i + k - 1) =
     $                alpha * tmp(k,l) + newmat(j + l - 1, i + k - 1)
                  end do
                end do
              end do
            end do
C$OMP END PARALLEL DO
          end block
        end if
      end subroutine tradd
!>
!> @brief Perform B <- A^T
!>
!> @note OpenMP parallelization is used for faster matrix transposing.
!>       The speed-up is about 3-10 times in comparison
!>       with the original implementation via BLAS-1 routines.
!>
!> @param[in]    n      size of the square matrices
!> @param[in]    mat    n-by-n matrix A
!> @param[inout] newmat n-by-n matrix B
!>
      subroutine tr(n,mat,newmat)
        integer, intent(in) :: n
        real(8), intent(in) :: mat(n,n)
        real(8), intent(inout) :: newmat(n,n)
        !
        integer :: i
        !
        if (n <= 1024) then
#ifdef MKL
          ! use MKL's optimized out-of-place transpose when available
          call mkl_domatcopy('C', 'T', n, n, 1.0_8, mat, n, newmat, n)
#else
          ! small matrices: copy column i of A into row i of B
C$OMP PARALLEL DO DEFAULT(NONE)
C$OMP& SHARED(newmat, mat, n) PRIVATE(i)
C$OMP& schedule(guided, 16) if(n > 32)
          do i = 1, n
            newmat(i,:) = mat(:,i)
          end do
C$OMP END PARALLEL DO
#endif
        else
          ! large matrices: cache-blocked transpose through a
          ! thread-private block_size-by-block_size buffer
          block
            integer, parameter :: block_size = 128
            real(8) :: tmp(block_size, block_size)
            integer :: j, k, l, imax, jmax, iel, jel
C$OMP PARALLEL DO DEFAULT(NONE)
C$OMP& SHARED(newmat, mat, n)
C$OMP& PRIVATE(i, j, k, l, imax, jmax, iel, jel, tmp)
C$OMP& schedule(guided, 8) collapse(2)
            do i = 1, n, block_size
              do j = 1, n, block_size
                ! clip the block at the matrix boundary
                imax = min(i + block_size - 1, n)
                jmax = min(j + block_size - 1, n)
                iel = imax - i + 1
                jel = jmax - j + 1
                tmp(1:iel,1:jel) = mat(i:imax,j:jmax)
                do k = 1, iel
                  do l = 1, jel
                    newmat(j + l - 1, i + k - 1) = tmp(k,l)
                  end do
                end do
              end do
            end do
C$OMP END PARALLEL DO
          end block
        end if
      end subroutine tr
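For reference, a minimal fixed-form driver along these lines can sanity-check both routines against the transpose intrinsic. The program name, the problem size n = 2000, and the single-file layout are illustrative assumptions, not part of the gist; something like gfortran -cpp -fopenmp -ffixed-form should build it together with the routines above.

      program test_tr
        implicit none
        ! n = 2000 exceeds both thresholds (768 and 1024),
        ! so the cache-blocked branches are exercised
        integer, parameter :: n = 2000
        real(8), parameter :: alpha = 0.5_8
        real(8), allocatable :: a(:,:), b(:,:), ref(:,:)
        allocate(a(n,n), b(n,n), ref(n,n))
        call random_number(a)
        call random_number(b)
        ! capture the reference before tradd updates b in place
        ref = alpha * transpose(a) + b
        call tradd(n, alpha, a, b)
        print *, 'tradd max error:', maxval(abs(b - ref))
        call tr(n, a, b)
        print *, 'tr    max error:', maxval(abs(b - transpose(a)))
      end program test_tr

Both reported errors should be zero, since each element undergoes the same multiply-add in the blocked path as in the intrinsic-based reference.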