Skip to content

Instantly share code, notes, and snippets.

@foxtran
Created October 9, 2024 16:40
Show Gist options
  • Save foxtran/ee0724f7f04dccda54f3a0b94d057a8f to your computer and use it in GitHub Desktop.
Save foxtran/ee0724f7f04dccda54f3a0b94d057a8f to your computer and use it in GitHub Desktop.
Perform B <- alpha * A^T + B and B <- A^T in Fortran in a sub-optimal way with OpenMP parallelization. The given implementation is 3-10 times faster than a naive implementation based on BLAS-1 routines.
!>
!> @brief Perform B <- alpha * A^T + B
!>
!> @note OpenMP parallelization is used for faster matrix transposing.
!> The speed-up is about 3-10 times in comparison
!> with original implementation via BLAS-1 routines.
!>
!> @note For n <= 768 the update is done one transposed row at a time;
!> for larger n a cache-blocked transpose with 128x128 tiles is used,
!> staging each tile of A in a thread-private buffer before the
!> transposed accumulation into B.
!>
!> @param[in] n size of square matrix
!> @param[in] alpha scaling factor
!> @param[in] mat n-by-n matrix A
!> @param[inout] newmat n-by-n updated matrix B
!>
!> @warning mat and newmat must not overlap: mat is intent(in) while
!> newmat is modified, so aliasing them violates Fortran argument rules.
!>
!> @author Igor S. Gerasimov
!>
subroutine tradd(n,alpha,mat,newmat)
implicit none
integer, intent(in) :: n
real(8), intent(in) :: alpha
real(8), intent(in) :: mat(n,n)
real(8), intent(inout) :: newmat(n,n)
!
integer :: i
!
if (n <= 768) then
! Row-at-a-time path; the if(n > 32) clause keeps tiny matrices
! serial so thread start-up cost is not paid where it cannot pay off.
C$OMP PARALLEL DO DEFAULT(NONE)
C$OMP& SHARED(alpha, mat, newmat, n) PRIVATE(i)
C$OMP& schedule(guided, 16) if(n > 32)
do i = 1, n
newmat(i,:) = alpha * mat(:,i) + newmat(i,:)
end do
C$OMP END PARALLEL DO
else
block
! Cache-blocked path: tmp is a thread-private tile buffer so the
! read of A is contiguous and the scattered writes hit only B.
integer, parameter :: block_size = 128
real(8) :: tmp(block_size, block_size)
integer :: j, k, l, imax, jmax, iel, jel
C$OMP PARALLEL DO DEFAULT(NONE)
C$OMP& SHARED(alpha, mat, newmat, n)
C$OMP& PRIVATE(i, j, k, l, imax, jmax, iel, jel, tmp)
C$OMP& schedule(guided, 8) collapse(2)
do i = 1, n, block_size
do j = 1, n, block_size
! Clamp the tile at the matrix edge; the last tile may be partial.
imax = min(i + block_size - 1, n)
jmax = min(j + block_size - 1, n)
iel = imax - i + 1
jel = jmax - j + 1
tmp(1:iel,1:jel) = mat(i:imax,j:jmax)
! Distinct (i,j) tiles update disjoint regions of B, so the
! collapsed parallel loop is race-free.
do k = 1, iel
do l = 1, jel
newmat(j + l - 1, i + k - 1) =
$ alpha * tmp(k,l) + newmat(j + l - 1, i + k - 1)
end do
end do
end do
end do
C$OMP END PARALLEL DO
end block
end if
end subroutine tradd
!>
!> @brief Perform B <- A^T
!>
!> @note OpenMP parallelization is used for faster matrix transposing.
!> The speed-up is about 3-10 times in comparison
!> with original implementation via BLAS-1 routines.
!>
!> @note When compiled with -DMKL, mkl_domatcopy is used, but only on
!> the n <= 1024 branch; larger matrices always take the hand-written
!> cache-blocked transpose with 128x128 tiles.
!>
!> @param[in] n size of square matrix
!> @param[in] mat n-by-n matrix A
!> @param[inout] newmat n-by-n matrix B, fully overwritten on exit
!>
!> @warning mat and newmat must not overlap: mat is intent(in) while
!> newmat is modified, so aliasing them violates Fortran argument rules.
!>
!> @author Igor S. Gerasimov
!>
subroutine tr(n,mat,newmat)
implicit none
integer, intent(in) :: n
real(8), intent(in) :: mat(n,n)
real(8), intent(inout) :: newmat(n,n)
!
integer :: i
!
if (n <= 1024) then
#ifdef MKL
call mkl_domatcopy('C', 'T', n, n, 1.0_8, mat, n, newmat, n)
#else
! Row-at-a-time path; the if(n > 32) clause keeps tiny matrices
! serial so thread start-up cost is not paid where it cannot pay off.
C$OMP PARALLEL DO DEFAULT(NONE)
C$OMP& SHARED(newmat, mat, n) PRIVATE(i)
C$OMP& schedule(guided, 16) if(n > 32)
do i = 1, n
newmat(i,:) = mat(:,i)
end do
C$OMP END PARALLEL DO
#endif
else
block
! Cache-blocked path: tmp is a thread-private tile buffer so the
! read of A is contiguous and the scattered writes hit only B.
integer, parameter :: block_size = 128
real(8) :: tmp(block_size, block_size)
integer :: j, k, l, imax, jmax, iel, jel
C$OMP PARALLEL DO DEFAULT(NONE)
C$OMP& SHARED(newmat, mat, n)
C$OMP& PRIVATE(i, j, k, l, imax, jmax, iel, jel, tmp)
C$OMP& schedule(guided, 8) collapse(2)
do i = 1, n, block_size
do j = 1, n, block_size
! Clamp the tile at the matrix edge; the last tile may be partial.
imax = min(i + block_size - 1, n)
jmax = min(j + block_size - 1, n)
iel = imax - i + 1
jel = jmax - j + 1
tmp(1:iel,1:jel) = mat(i:imax,j:jmax)
! Distinct (i,j) tiles write disjoint regions of B, so the
! collapsed parallel loop is race-free.
do k = 1, iel
do l = 1, jel
newmat(j + l - 1, i + k - 1) = tmp(k,l)
end do
end do
end do
end do
C$OMP END PARALLEL DO
end block
end if
end subroutine tr
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment