ehermes · August 10, 2015 18:33
diff --git a/d3_fort.f90 b/d3_fort.f90
 module d3_fort

   use d3params, only: sp, dp, qp, Bohr, Hartree, k1, k2, k3, alp, max_elem, &
      max_cn, numcn, cntab, r2r4a, rcova, r0

   implicit none

   private
   public d3_calc

   contains

      subroutine d3_calc(natoms, atomnumber, cell, xyz, rcutin, rcutcn, s6, s18, &
            rs6, rs18, alp6, alp8, pbc, bj, threebody, energy, forces, stress)

         use d3params, only: initialize_c6, cross_prod, outer_prod

         implicit none
         !f2py threadsafe

         integer, intent(in) :: natoms, atomnumber(natoms)

         real(dp), intent(in) :: cell(3, 3), xyz(3, natoms), rcutin, rcutcn
         real(dp), intent(in) :: s6, s18, rs6, rs18, alp6, alp8

         logical, intent(in) :: pbc(3), bj, threebody
         
         integer :: a, b, c, i, j, k, ii, jj, kk
         integer :: na, nb, nc, ncna, ncnb
         integer :: elemab, elemac, elembc
         integer :: images_2b(3), images_3b(3)
         integer :: nthreads, myid, start, start_3b
         integer, external :: OMP_GET_NUM_THREADS, OMP_GET_THREAD_NUM

         real(dp) :: rcut
         real(dp) :: tvec(3), tvec_3b(3), rcut2, rcutcn2, rcovab
         real(dp) :: cn(natoms)
         real(dp) :: cnexp, cnab, dcnab(3)
         real(dp) :: lij, lijsum, dlij(3, natoms), slij(3, 3)
         real(dp) :: xyza(3), xyzb(3), xyzc(3), xyzab(3), xyzac(3), xyzbc(3)
         real(dp) :: uxyzab(3), uxyzac(3), uxyzbc(3)
         real(dp) :: self, lattlen(3), V
         real(dp) :: c8, c9, e6, e8, e9, f6ab(3), f8ab(3)
         real(dp) :: a1, a2, dmp
         real(dp) :: rab, rab2, rab3, rab6, rab8
         real(dp) :: rac, rac2, rac3
         real(dp) :: rbc, rbc2, rbc3
         real(dp) :: dmp6, dmp8, rav6, rav8
         real(dp) :: dmp9, ddmp9, dadmp9(3), dbdmp9(3), dcdmp9(3)
         real(dp) :: fa(3), fb(3), fc(3)
         real(dp) :: r9, dar9(3), dbr9(3), dcr9(3)
         real(dp) :: fa9(3), fb9(3), fc9(3)
         real(dp) :: angles, daangles(3), dbangles(3), dcangles(3)
         real(dp) :: rav9, darav9(3), dbrav9(3), dcrav9(3)
         real(dp) :: c6ab, c6ac, c6bc
         real(dp) :: alph, beta, gamm
         real(dp) :: daalph(3), dabeta(3), dagamm(3)
         real(dp) :: dbalph(3), dbbeta(3), dbgamm(3)
         real(dp) :: dcalph(3), dcbeta(3), dcgamm(3)
         real(dp) :: sc6ab(3, 3), sc6ac(3, 3), sc6bc(3, 3), sc8(3, 3)
         real(dp) :: sc9(3, 3)
         real(dp) :: dc6ab(3, natoms), dc6ac(3, natoms), dc6bc(3, natoms)
         real(dp) :: dc8(3, natoms), dc9(3, natoms)
         real(dp) :: sc_tmp(3, 3), c6abij
         real(dp) :: c6(natoms, natoms), dc6(3, natoms, natoms, natoms)
         real(dp) :: sc6(3, 3, natoms, natoms)
         real(dp) :: r0ab, r0ac, r0bc

         logical :: central

         real(dp), allocatable :: dcn(:, :, :), scn(:, :, :)
         real(dp), allocatable :: c6_ref(:,  :, :)
         real(qp), allocatable :: cn_t(:, :), dcn_t(:, :, :, :)
         real(qp), allocatable :: scn_t(:, :, :, :), e_t(:), f_t(:, :, :)
         real(qp), allocatable :: s_t(:, :, :)

         real(dp), intent(out) :: energy, forces(3, natoms), stress(3, 3)

         if (rcutcn > rcutin) then
            rcut = rcutcn
         else
            rcut = rcutin
         endif

         if (bj) then
            a1 = rs6
            a2 = rs18 * Bohr
         endif

         rcut2 = rcut**2
         rcutcn2 = rcutcn**2

         do i = 1, 3
         lattlen(i) = sqrt(dot_product(cell(:, i), cell(:, i)))
         enddo

         V = dot_product(cell(:, 1), cross_prod(cell(:, 2), cell(:, 3)))

         ! Number of atoms within 2body cutoff
         images_2b = ceiling(rcut * product(lattlen) / (V * lattlen))
         do i = 1, 3
         if (.not. pbc(i)) images_2b(i) = 0
         enddo
         
         ! Number of atoms within 3body cutoff
         images_3b = ceiling(rcutcn * product(lattlen) / (V * lattlen))
         do i = 1, 3
         if (.not. pbc(i)) images_3b(i) = 0
         enddo

         ! Calculate number of 2body and 3body interactions, and calculate
         ! cn & dcn
         allocate(dcn(3, natoms, natoms))
         allocate(scn(3, 3, natoms))
         cn = 0
         dcn = 0
         scn = 0

 !$OMP PARALLEL default(private) shared(cell, images_3b) &
 !$OMP shared(atomnumber, rcutcn2, natoms, xyz) &
 !$OMP shared(cn_t, dcn_t, scn_t)
         nthreads = OMP_GET_NUM_THREADS()
         myid = OMP_GET_THREAD_NUM() + 1
         if (myid == 1) then
            allocate(cn_t(natoms, nthreads))
            allocate(dcn_t(3, natoms, natoms, nthreads))
            allocate(scn_t(3, 3, natoms, nthreads))
            cn_t = 0
            dcn_t = 0
            scn_t = 0
         endif

 !$OMP BARRIER
 !$OMP DO schedule(dynamic, natoms) collapse(4)
         do a = 1, natoms
         do i = -images_3b(1), images_3b(1)
         do j = -images_3b(2), images_3b(2)
         do k = -images_3b(3), images_3b(3)

         xyza = xyz(:, a)
         na = atomnumber(a)

         if ((i == 0) .and. (j == 0) .and. (k == 0)) then
            central = .true.
            tvec = 0
            start = a + 1
         else
            central = .false.
            tvec = matmul((/i, j, k/), cell)
            start = 1
         endif

         do b = start, natoms

         xyzb = xyz(:, b) + tvec
         nb = atomnumber(b)
         
         xyzab = xyzb - xyza

         rab2 = dot_product(xyzab, xyzab)
         
         if (rab2 < rcutcn2) then
            rab = sqrt(rab2)
            uxyzab = xyzab / rab
            rcovab = rcova(na) + rcova(nb)
            cnexp = exp(-k1 * (rcovab / rab - 1.0_dp))
            cnab = 1.0_dp / (1.0_dp + cnexp)
            dcnab = cnexp * k1 * rcovab * cnab**2 * uxyzab / rab2
            cn_t(a, myid) = cn_t(a, myid) + cnab
            if (central) cn_t(b, myid) = cn_t(b, myid) + cnab
            if (b > a) then
               dcn_t(:, b, a, myid) = dcn_t(:, b, a, myid) - dcnab
               dcn_t(:, a, a, myid) = dcn_t(:, a, a, myid) + dcnab
               dcn_t(:, a, b, myid) = dcn_t(:, a, b, myid) + dcnab
               dcn_t(:, b, b, myid) = dcn_t(:, b, b, myid) - dcnab
            endif
            sc_tmp = spread(xyzab, 1, 3) * spread(dcnab, 2, 3)
            scn_t(:, :, a, myid) = scn_t(:, :, a, myid) + sc_tmp
            if (central) scn_t(:, :, b, myid) = scn_t(:, :, b, myid) + sc_tmp
         endif
         enddo
         enddo
         enddo
         enddo
         enddo
 !$OMP END DO
 !$OMP END PARALLEL

         cn = sum(cn_t, 2)
         dcn = sum(dcn_t, 4)
         scn = sum(scn_t, 4)
         deallocate(cn_t)
         deallocate(dcn_t)
         deallocate(scn_t)

         allocate(c6_ref(max_cn, max_cn, (max_elem + 1) * max_elem / 2))
         call initialize_c6(c6_ref)

 !$OMP PARALLEL default(private) shared(natoms, atomnumber, numcn, cn, cntab) &
 !$OMP shared(dcn, scn, c6_ref, c6, dc6, sc6)
 !$OMP DO schedule(dynamic, natoms) collapse(2)
         do a = 1, natoms
         do b = 1, natoms
         na = atomnumber(a)
         nb = atomnumber(b)
         ncna = numcn(na)
         ncnb = numcn(nb)

         if (na < nb) then
            elemab = max_elem * (na - 1) - (na * (na - 1))/2 + nb
         else
            elemab = max_elem * (nb - 1) - (nb * (nb - 1))/2 + na
         endif

         if ((ncna == 1) .and. (ncnb == 1)) then
            c6(b, a) = c6_ref(1, 1, elemab)
            dc6(:, :, b, a) = 0
            sc6(:, :, b, a) = 0
         else
            lijsum = 0
            dlij = 0
            slij = 0
            c6ab = 0
            dc6ab = 0
            sc6ab = 0
            do i = 1, ncna
            do j = 1, ncnb
            if (na < nb) then
               c6abij = c6_ref(i, j, elemab)
            else
               c6abij = c6_ref(j, i, elemab)
            endif
            lij = exp(-k3 * ((cn(a) - cntab(i, na))**2 + (cn(b) - cntab(j, nb))**2))
            dlij = dlij - 2.0_dp * k3 * lij &
               * ((cn(a) - cntab(i, na)) * dcn(:, :, a) &
               + (cn(b) - cntab(j, nb)) * dcn(:, :, b))
            slij = slij - 2.0_dp * k3 * lij &
               * ((cn(a) - cntab(i, na)) * scn(:, :, a) &
               + (cn(b) - cntab(j, nb)) * scn(:, :, b))
            lijsum = lijsum + lij
            c6ab = c6ab + c6abij * lij
            dc6ab = dc6ab - c6abij &
               * 2.0_dp * k3 * lij &
               * ((cn(a) - cntab(i, na)) * dcn(:, :, a) &
               + (cn(b) - cntab(j, nb)) * dcn(:, :, b))
            sc6ab = sc6ab - c6abij &
               * 2.0_dp * k3 * lij &
               * ((cn(a) - cntab(i, na)) * scn(:, :, a) &
               + (cn(b) - cntab(j, nb)) * scn(:, :, b))
            enddo
            enddo

            c6ab = c6ab / lijsum
            c6(b, a) = c6ab
            dc6(:, :, b, a) = (dc6ab - c6ab * dlij) / lijsum
            sc6(:, :, b, a) = (sc6ab - c6ab * slij) / lijsum
         endif
         enddo
         enddo
 !$OMP END DO
 !$OMP END PARALLEL

         deallocate(c6_ref)
         deallocate(dcn)
         deallocate(scn)

         energy = 0
         forces = 0
         stress = 0

 !$OMP PARALLEL default(private) shared(natoms, xyz, atomnumber, cell) &
 !$OMP shared(rcut2, rcutcn2, cn, cntab, c6, dc6, sc6) &
 !$OMP shared(r2r4a, a1, a2, rs6, alp6, rs18, alp8, energy, forces, stress) &
 !$OMP shared(bj, threebody, s6, s18, images_2b, images_3b) &
 !$OMP shared(e_t, f_t, s_t)
         nthreads = OMP_GET_NUM_THREADS()
         myid = OMP_GET_THREAD_NUM() + 1
         if (myid == 1) then
            allocate(e_t(nthreads))
            allocate(f_t(3, natoms, nthreads))
            allocate(s_t(3, 3, nthreads))
         endif

 !$OMP BARRIER
         e_t(myid) = 0
         f_t(:, :, myid) = 0
         s_t(:, :, myid) = 0

 !$OMP DO schedule(dynamic,natoms) collapse(4)
         do a = 1, natoms
         do i = -images_2b(1), images_2b(1)
         do j = -images_2b(2), images_2b(2)
         do k = -images_2b(3), images_2b(3)

         xyza = xyz(:, a)
         na = atomnumber(a)

         if ((i == 0) .and. (j == 0) .and. (k == 0)) then
            tvec = 0
            start = a + 1
         else
            tvec = matmul((/i, j, k/), cell)
            start = a
         endif

         do b = start, natoms
         nb = atomnumber(b)
         xyzb = xyz(:, b) + tvec

         xyzab = xyzb - xyza
         rab2 = dot_product(xyzab, xyzab)
         if (rab2 > rcut2) cycle

         rab = sqrt(rab2)
         rab2 = rab**2
         rab6 = rab2**3
         rab8 = rab6 * rab2
         uxyzab = xyzab / rab

         if (a == b) then
            self = 0.5_dp
         else
            self = 1.0_dp
         endif

         if (na < nb) then
            elemab = max_elem * (na - 1) - (na * (na - 1))/2 + nb
         else
            elemab = max_elem * (nb - 1) - (nb * (nb - 1))/2 + na
         endif
         c6ab = c6(b, a)
         dc6ab = dc6(:, :, b, a)
         sc6ab = sc6(:, :, b, a)
         r0ab = r0(elemab)

         e6 = 0
         e8 = 0

         c8 = 3.0_dp * c6ab * r2r4a(na) * r2r4a(nb)
         dc8 = 3.0_dp * dc6ab * r2r4a(na) * r2r4a(nb)
         sc8 = 3.0_dp * sc6ab * r2r4a(na) * r2r4a(nb)

         if (bj) then
            dmp = a1 * sqrt(3.0_dp * r2r4a(na) * r2r4a(nb)) + a2
            e6 = -1.0_dp / (rab6 + dmp**6)
            f6ab = -6.0_dp * e6**2 * rab2**2 * xyzab

            e8 = -1.0_dp / (rab8 + dmp**8)
            f8ab = -8.0_dp * e8**2 * rab6 * xyzab
         else
            rav6 = (rs6 * r0ab / rab)**alp6
            dmp6 = 1.0_dp / (1.0_dp + 6.0_dp * rav6)
            e6 = -dmp6 / rab6
            f6ab = 6.0_dp * (xyzab * e6 / rab2 &
               + dmp6**2 * xyzab * alp6 * rav6 / rab8)

            rav8 = (rs18 * r0ab / rab)**alp8
            dmp8 = 1.0_dp / (1.0_dp + 6.0_dp * rav8)
            e8 = -dmp8 / rab8
            f8ab = 8.0_dp * (xyzab * e8 / rab2 &
               + dmp8**2 * xyzab * alp8 * rav8 / (rab2 * rab8))
         endif

         e6 = e6 * s6 * self
         e8 = e8 * s18 * self

         f6ab = c6ab * f6ab * s6 * self
         f8ab = c8 * f8ab * s18 * self

         e_t(myid) = e_t(myid) + c6ab * e6 + c8 * e8

         if (a /= b) then
            f_t(:, a, myid) = f_t(:, a, myid) - f6ab - f8ab
            f_t(:, b, myid) = f_t(:, b, myid) + f6ab + f8ab
         endif
         f_t(:, :, myid) = f_t(:, :, myid) - dc6ab * e6 - dc8 * e8

         s_t(:, :, myid) = s_t(:, :, myid) + sc6ab * e6 + sc8 * e8 &
            + outer_prod(3, xyzab, f6ab + f8ab)
         
         if (threebody .and. (rab2 < rcutcn2)) then
            do ii = -images_3b(1), images_3b(1)
            do jj = -images_3b(2), images_3b(2)
            do kk = -images_3b(3), images_3b(3)

            if ((ii == 0) .and. (jj == 0) .and. (kk == 0)) then
               central = .true.
               tvec_3b = 0
            else
               central = .false.
               tvec_3b = matmul((/ii, jj, kk/), cell)
            endif

            if ((i == ii) .and. (j == jj) .and. (k == kk)) then
               start_3b = b + 1
            else
               start_3b = b
            endif


            do c = start_3b, natoms
            if ((central) .and. (c == a)) cycle
            xyzc = xyz(:, c) + tvec_3b
            xyzac = xyzc - xyza
            rac2 = dot_product(xyzac, xyzac)
            if (rac2 > rcutcn2) cycle
            xyzbc = xyzc - xyzb
            rbc2 = dot_product(xyzbc, xyzbc)
            if (rbc2 > rcutcn2) cycle

            rab3 = rab2 * rab
            rac = sqrt(rac2)
            rac3 = rac2 * rac
            rbc = sqrt(rbc2)
            rbc3 = rbc2 * rbc

            nc = atomnumber(c)
   
            if (na < nc) then
               elemac = max_elem * (na - 1) - na * (na - 1)/2 + nc
            else
               elemac = max_elem * (nc - 1) - nc * (nc - 1)/2 + na
            endif
   
            if (nb < nc) then
               elembc = max_elem * (nb - 1) - nb * (nb - 1)/2 + nc
            else
               elembc = max_elem * (nc - 1) - nc * (nc - 1)/2 + nb
            endif

            r0ac = r0(elemac)
            r0bc = r0(elembc)

            c6ac = c6(c, a)
            dc6ac = dc6(:, :, c, a)
            sc6ac = sc6(:, :, c, a)

            c6bc = c6(c, b)
            dc6bc = dc6(:, :, c, b)
            sc6bc = sc6(:, :, c, b)

            if ((a == b) .or. (a == c) .or. (b == c)) then
               if ((a == b) .and. (a == c)) then
                  self = 1.0_dp / 6.0_dp
               else
                  self = 1.0_dp / 2.0_dp
               endif
            else
               self = 1.0_dp
            endif

            alph = dot_product(xyzab, xyzac) / (rab * rac)
            beta = -dot_product(xyzab, xyzbc) / (rab * rbc)
            gamm = dot_product(xyzac, xyzbc) / (rac * rbc)
   
            ! Gradient of alph, beta, gamm. Very complicated...
            ! Figured this all out using Mathematica and defining
            ! alph = dot_product(xyzab,xyzac)/(rab * rac), etc.
            daalph = alph * (xyzac / rac2 + xyzab / rab2) &
               - (xyzac + xyzab) / (rab * rac)
            dabeta = xyzbc / (rab * rbc) + xyzab * beta / rab2
            dagamm = -xyzbc / (rac * rbc) + xyzac * gamm / rac2
   
            dbalph = xyzac / (rab * rac) - xyzab * alph / rab2
            dbbeta = beta * (xyzbc / rbc2 - xyzab / rab2) &
               + (xyzab - xyzbc) / (rab * rbc)
            dbgamm = -xyzac / (rac * rbc) + xyzbc * gamm / rbc2
   
            dcalph = xyzab / (rab * rac) - xyzac * alph / rac2
            dcbeta = -xyzab / (rab * rbc) - xyzbc * beta / rbc2
            dcgamm = -gamm * (xyzac / rac2 + xyzbc / rbc2) &
               + (xyzac + xyzbc) / (rac * rbc)

            ! I have no idea what 'rav' stands for, but that's what Grimme
            ! called this variable.  Cube root of the product of the
            ! ratios of r0ab/rab, times 4/3 for some reason. I don't know.
            rav9 = (4.0_dp/3.0_dp) * (r0ab * r0ac * r0bc &
               / (rab * rbc * rac))**(1.0_dp/3.0_dp)
            darav9 = (rav9/3.0_dp) * (xyzab / rab2 + xyzac / rac2)
            dbrav9 = (rav9/3.0_dp) * (-xyzab / rab2 + xyzbc / rbc2)
            dcrav9 = -(rav9/3.0_dp) * (xyzac / rac2 + xyzbc / rbc2)
   
            ! Three-body term *always* uses "zero" damping, even if
            ! we are using the BJ version of DFT-D3
            dmp9 = 1.0_dp/(1.0_dp + 6.0_dp * rav9**alp8)
            ddmp9 = -6.0_dp * alp8 * rav9**(alp8-1) * dmp9**2
   
            ! Three-body depends on "average" r^9
            r9 = 1.0_dp / (rab3 * rac3 * rbc3)
            dar9 = 3.0_dp * r9 * (xyzab / rab2 + xyzac / rac2)
            dbr9 = 3.0_dp * r9 * (-xyzab / rab2 + xyzbc / rbc2)
            dcr9 = -3.0_dp * r9 * (xyzac / rac2 + xyzbc / rbc2)
   
            ! Angle term of the three body energy, and its gradient
            angles = 3.0_dp * alph * beta * gamm + 1.0_dp
            daangles = 3.0_dp * (daalph * beta * gamm &
               + alph * dabeta * gamm &
               + alph * beta * dagamm)
            dbangles = 3.0_dp * (dbalph * beta * gamm &
               + alph * dbbeta * gamm &
               + alph * beta * dbgamm)
            dcangles = 3.0_dp * (dcalph * beta * gamm &
               + alph * dcbeta * gamm &
               + alph * beta * dcgamm)
   
            ! Damping derivatives
            dadmp9 = ddmp9 * darav9
            dbdmp9 = ddmp9 * dbrav9
            dcdmp9 = ddmp9 * dcrav9
   
            ! Three-body energy
            e9 = -angles * dmp9 * r9 * self

            c9 = -sqrt(c6ab * c6ac * c6bc / Hartree)

            ! Forces
            fa9 = c9 * (-daangles * dmp9 * r9 &
               - angles * dadmp9 * r9 &
               - angles * dmp9 * dar9) * self
   
            fb9 = c9 * (-dbangles * dmp9 * r9 &
               - angles * dbdmp9 * r9 &
               - angles * dmp9 * dbr9) * self
   
            fc9 = c9 * (-dcangles * dmp9 * r9 &
               - angles * dcdmp9 * r9 &
               - angles * dmp9 * dcr9) * self


            dc9 = (dc6ab * c6ac * c6bc &
               + c6ab * dc6ac * c6bc &
               + c6ab * c6ac * dc6bc) &
               / (2 * Hartree * c9)

            sc9 = (sc6ab * c6ac * c6bc &
               + c6ab * sc6ac * c6bc &
               + c6ab * c6ac * sc6bc) &
               / (2 * Hartree * c9)

            e_t(myid) = e_t(myid) + c9 * e9

            f_t(:, a, myid) = f_t(:, a, myid) - fa9
            f_t(:, b, myid) = f_t(:, b, myid) - fb9
            f_t(:, c, myid) = f_t(:, c, myid) - fc9
            f_t(:, :, myid) = f_t(:, :, myid) - dc9 * e9

            s_t(:, :, myid) = s_t(:, :, myid) + sc9 * e9 &
               - spread(xyza, 1, 3) * spread(fa9, 2, 3) &
               - spread(xyzb, 1, 3) * spread(fb9, 2, 3) &
               - spread(xyzc, 1, 3) * spread(fc9, 2, 3)
            enddo
            enddo
            enddo
            enddo
         endif
         enddo
         enddo
         enddo
         enddo
         enddo
 !$OMP END DO
 !$OMP END PARALLEL
         energy = sum(e_t)
         forces = sum(f_t, 3)
         stress = sum(s_t, 3)
         deallocate(e_t)
         deallocate(f_t)
         deallocate(s_t)

         stress = -(stress + transpose(stress)) / (2.0_dp * V)
      end subroutine d3_calc
 end module d3_fort
	module d3_fort

	use d3params, only: sp, dp, qp, Bohr, Hartree, k1, k2, k3, alp, max_elem, &
	max_cn, numcn, cntab, r2r4a, rcova, r0

	implicit none

	private
	public d3_calc

	contains

	subroutine d3_calc(natoms, atomnumber, cell, xyz, rcutin, rcutcn, s6, s18, &
	rs6, rs18, alp6, alp8, pbc, bj, threebody, energy, forces, stress)

	use d3params, only: initialize_c6, cross_prod, outer_prod

	implicit none
	!f2py threadsafe

	integer, intent(in) :: natoms, atomnumber(natoms)

	real(dp), intent(in) :: cell(3, 3), xyz(3, natoms), rcutin, rcutcn
	real(dp), intent(in) :: s6, s18, rs6, rs18, alp6, alp8

	logical, intent(in) :: pbc(3), bj, threebody

	integer :: a, b, c, i, j, k, ii, jj, kk
	integer :: na, nb, nc, ncna, ncnb
	integer :: elemab, elemac, elembc
	integer :: images_2b(3), images_3b(3)
	integer :: nthreads, myid, start, start_3b
	integer, external :: OMP_GET_NUM_THREADS, OMP_GET_THREAD_NUM

	real(dp) :: rcut
	real(dp) :: tvec(3), tvec_3b(3), rcut2, rcutcn2, rcovab
	real(dp) :: cn(natoms)
	real(dp) :: cnexp, cnab, dcnab(3)
	real(dp) :: lij, lijsum, dlij(3, natoms), slij(3, 3)
	real(dp) :: xyza(3), xyzb(3), xyzc(3), xyzab(3), xyzac(3), xyzbc(3)
	real(dp) :: uxyzab(3), uxyzac(3), uxyzbc(3)
	real(dp) :: self, lattlen(3), V
	real(dp) :: c8, c9, e6, e8, e9, f6ab(3), f8ab(3)
	real(dp) :: a1, a2, dmp
	real(dp) :: rab, rab2, rab3, rab6, rab8
	real(dp) :: rac, rac2, rac3
	real(dp) :: rbc, rbc2, rbc3
	real(dp) :: dmp6, dmp8, rav6, rav8
	real(dp) :: dmp9, ddmp9, dadmp9(3), dbdmp9(3), dcdmp9(3)
	real(dp) :: fa(3), fb(3), fc(3)
	real(dp) :: r9, dar9(3), dbr9(3), dcr9(3)
	real(dp) :: fa9(3), fb9(3), fc9(3)
	real(dp) :: angles, daangles(3), dbangles(3), dcangles(3)
	real(dp) :: rav9, darav9(3), dbrav9(3), dcrav9(3)
	real(dp) :: c6ab, c6ac, c6bc
	real(dp) :: alph, beta, gamm
	real(dp) :: daalph(3), dabeta(3), dagamm(3)
	real(dp) :: dbalph(3), dbbeta(3), dbgamm(3)
	real(dp) :: dcalph(3), dcbeta(3), dcgamm(3)
	real(dp) :: sc6ab(3, 3), sc6ac(3, 3), sc6bc(3, 3), sc8(3, 3)
	real(dp) :: sc9(3, 3)
	real(dp) :: dc6ab(3, natoms), dc6ac(3, natoms), dc6bc(3, natoms)
	real(dp) :: dc8(3, natoms), dc9(3, natoms)
	real(dp) :: sc_tmp(3, 3), c6abij
	real(dp) :: c6(natoms, natoms), dc6(3, natoms, natoms, natoms)
	real(dp) :: sc6(3, 3, natoms, natoms)
	real(dp) :: r0ab, r0ac, r0bc

	logical :: central

	real(dp), allocatable :: dcn(:, :, :), scn(:, :, :)
	real(dp), allocatable :: c6_ref(:, :, :)
	real(qp), allocatable :: cn_t(:, :), dcn_t(:, :, :, :)
	real(qp), allocatable :: scn_t(:, :, :, :), e_t(:), f_t(:, :, :)
	real(qp), allocatable :: s_t(:, :, :)

	real(dp), intent(out) :: energy, forces(3, natoms), stress(3, 3)

	if (rcutcn > rcutin) then
	rcut = rcutcn
	else
	rcut = rcutin
	endif

	if (bj) then
	a1 = rs6
	a2 = rs18 * Bohr
	endif

	rcut2 = rcut**2
	rcutcn2 = rcutcn**2

	do i = 1, 3
	lattlen(i) = sqrt(dot_product(cell(:, i), cell(:, i)))
	enddo

	V = dot_product(cell(:, 1), cross_prod(cell(:, 2), cell(:, 3)))

	! Number of atoms within 2body cutoff
	images_2b = ceiling(rcut * product(lattlen) / (V * lattlen))
	do i = 1, 3
	if (.not. pbc(i)) images_2b(i) = 0
	enddo

	! Number of atoms within 3body cutoff
	images_3b = ceiling(rcutcn * product(lattlen) / (V * lattlen))
	do i = 1, 3
	if (.not. pbc(i)) images_3b(i) = 0
	enddo

	! Calculate number of 2body and 3body interactions, and calculate
	! cn & dcn
	allocate(dcn(3, natoms, natoms))
	allocate(scn(3, 3, natoms))
	cn = 0
	dcn = 0
	scn = 0

	!$OMP PARALLEL default(private) shared(cell, images_3b) &
	!$OMP shared(atomnumber, rcutcn2, natoms, xyz) &
	!$OMP shared(cn_t, dcn_t, scn_t)
	nthreads = OMP_GET_NUM_THREADS()
	myid = OMP_GET_THREAD_NUM() + 1
	if (myid == 1) then
	allocate(cn_t(natoms, nthreads))
	allocate(dcn_t(3, natoms, natoms, nthreads))
	allocate(scn_t(3, 3, natoms, nthreads))
	cn_t = 0
	dcn_t = 0
	scn_t = 0
	endif

	!$OMP BARRIER
	!$OMP DO schedule(dynamic, natoms) collapse(4)
	do a = 1, natoms
	do i = -images_3b(1), images_3b(1)
	do j = -images_3b(2), images_3b(2)
	do k = -images_3b(3), images_3b(3)

	xyza = xyz(:, a)
	na = atomnumber(a)

	if ((i == 0) .and. (j == 0) .and. (k == 0)) then
	central = .true.
	tvec = 0
	start = a + 1
	else
	central = .false.
	tvec = matmul((/i, j, k/), cell)
	start = 1
	endif

	do b = start, natoms

	xyzb = xyz(:, b) + tvec
	nb = atomnumber(b)

	xyzab = xyzb - xyza

	rab2 = dot_product(xyzab, xyzab)

	if (rab2 < rcutcn2) then
	rab = sqrt(rab2)
	uxyzab = xyzab / rab
	rcovab = rcova(na) + rcova(nb)
	cnexp = exp(-k1 * (rcovab / rab - 1.0_dp))
	cnab = 1.0_dp / (1.0_dp + cnexp)
	dcnab = cnexp * k1 * rcovab * cnab*2 uxyzab / rab2
	cn_t(a, myid) = cn_t(a, myid) + cnab
	if (central) cn_t(b, myid) = cn_t(b, myid) + cnab
	if (b > a) then
	dcn_t(:, b, a, myid) = dcn_t(:, b, a, myid) - dcnab
	dcn_t(:, a, a, myid) = dcn_t(:, a, a, myid) + dcnab
	dcn_t(:, a, b, myid) = dcn_t(:, a, b, myid) + dcnab
	dcn_t(:, b, b, myid) = dcn_t(:, b, b, myid) - dcnab
	endif
	sc_tmp = spread(xyzab, 1, 3) * spread(dcnab, 2, 3)
	scn_t(:, :, a, myid) = scn_t(:, :, a, myid) + sc_tmp
	if (central) scn_t(:, :, b, myid) = scn_t(:, :, b, myid) + sc_tmp
	endif
	enddo
	enddo
	enddo
	enddo
	enddo
	!$OMP END DO
	!$OMP END PARALLEL

	cn = sum(cn_t, 2)
	dcn = sum(dcn_t, 4)
	scn = sum(scn_t, 4)
	deallocate(cn_t)
	deallocate(dcn_t)
	deallocate(scn_t)

	allocate(c6_ref(max_cn, max_cn, (max_elem + 1) * max_elem / 2))
	call initialize_c6(c6_ref)

	!$OMP PARALLEL default(private) shared(natoms, atomnumber, numcn, cn, cntab) &
	!$OMP shared(dcn, scn, c6_ref, c6, dc6, sc6)
	!$OMP DO schedule(dynamic, natoms) collapse(2)
	do a = 1, natoms
	do b = 1, natoms
	na = atomnumber(a)
	nb = atomnumber(b)
	ncna = numcn(na)
	ncnb = numcn(nb)

	if (na < nb) then
	elemab = max_elem * (na - 1) - (na * (na - 1))/2 + nb
	else
	elemab = max_elem * (nb - 1) - (nb * (nb - 1))/2 + na
	endif

	if ((ncna == 1) .and. (ncnb == 1)) then
	c6(b, a) = c6_ref(1, 1, elemab)
	dc6(:, :, b, a) = 0
	sc6(:, :, b, a) = 0
	else
	lijsum = 0
	dlij = 0
	slij = 0
	c6ab = 0
	dc6ab = 0
	sc6ab = 0
	do i = 1, ncna
	do j = 1, ncnb
	if (na < nb) then
	c6abij = c6_ref(i, j, elemab)
	else
	c6abij = c6_ref(j, i, elemab)
	endif
	lij = exp(-k3 * ((cn(a) - cntab(i, na))2 + (cn(b) - cntab(j, nb))2))
	dlij = dlij - 2.0_dp * k3 * lij &
	* ((cn(a) - cntab(i, na)) * dcn(:, :, a) &
	+ (cn(b) - cntab(j, nb)) * dcn(:, :, b))
	slij = slij - 2.0_dp * k3 * lij &
	* ((cn(a) - cntab(i, na)) * scn(:, :, a) &
	+ (cn(b) - cntab(j, nb)) * scn(:, :, b))
	lijsum = lijsum + lij
	c6ab = c6ab + c6abij * lij
	dc6ab = dc6ab - c6abij &
	* 2.0_dp * k3 * lij &
	* ((cn(a) - cntab(i, na)) * dcn(:, :, a) &
	+ (cn(b) - cntab(j, nb)) * dcn(:, :, b))
	sc6ab = sc6ab - c6abij &
	* 2.0_dp * k3 * lij &
	* ((cn(a) - cntab(i, na)) * scn(:, :, a) &
	+ (cn(b) - cntab(j, nb)) * scn(:, :, b))
	enddo
	enddo

	c6ab = c6ab / lijsum
	c6(b, a) = c6ab
	dc6(:, :, b, a) = (dc6ab - c6ab * dlij) / lijsum
	sc6(:, :, b, a) = (sc6ab - c6ab * slij) / lijsum
	endif
	enddo
	enddo
	!$OMP END DO
	!$OMP END PARALLEL

	deallocate(c6_ref)
	deallocate(dcn)
	deallocate(scn)

	energy = 0
	forces = 0
	stress = 0

	!$OMP PARALLEL default(private) shared(natoms, xyz, atomnumber, cell) &
	!$OMP shared(rcut2, rcutcn2, cn, cntab, c6, dc6, sc6) &
	!$OMP shared(r2r4a, a1, a2, rs6, alp6, rs18, alp8, energy, forces, stress) &
	!$OMP shared(bj, threebody, s6, s18, images_2b, images_3b) &
	!$OMP shared(e_t, f_t, s_t)
	nthreads = OMP_GET_NUM_THREADS()
	myid = OMP_GET_THREAD_NUM() + 1
	if (myid == 1) then
	allocate(e_t(nthreads))
	allocate(f_t(3, natoms, nthreads))
	allocate(s_t(3, 3, nthreads))
	endif

	!$OMP BARRIER
	e_t(myid) = 0
	f_t(:, :, myid) = 0
	s_t(:, :, myid) = 0

	!$OMP DO schedule(dynamic,natoms) collapse(4)
	do a = 1, natoms
	do i = -images_2b(1), images_2b(1)
	do j = -images_2b(2), images_2b(2)
	do k = -images_2b(3), images_2b(3)

	xyza = xyz(:, a)
	na = atomnumber(a)

	if ((i == 0) .and. (j == 0) .and. (k == 0)) then
	tvec = 0
	start = a + 1
	else
	tvec = matmul((/i, j, k/), cell)
	start = a
	endif

	do b = start, natoms
	nb = atomnumber(b)
	xyzb = xyz(:, b) + tvec

	xyzab = xyzb - xyza
	rab2 = dot_product(xyzab, xyzab)
	if (rab2 > rcut2) cycle

	rab = sqrt(rab2)
	rab2 = rab**2
	rab6 = rab2**3
	rab8 = rab6 * rab2
	uxyzab = xyzab / rab

	if (a == b) then
	self = 0.5_dp
	else
	self = 1.0_dp
	endif

	if (na < nb) then
	elemab = max_elem * (na - 1) - (na * (na - 1))/2 + nb
	else
	elemab = max_elem * (nb - 1) - (nb * (nb - 1))/2 + na
	endif
	c6ab = c6(b, a)
	dc6ab = dc6(:, :, b, a)
	sc6ab = sc6(:, :, b, a)
	r0ab = r0(elemab)

	e6 = 0
	e8 = 0

	c8 = 3.0_dp * c6ab * r2r4a(na) * r2r4a(nb)
	dc8 = 3.0_dp * dc6ab * r2r4a(na) * r2r4a(nb)
	sc8 = 3.0_dp * sc6ab * r2r4a(na) * r2r4a(nb)

	if (bj) then
	dmp = a1 * sqrt(3.0_dp * r2r4a(na) * r2r4a(nb)) + a2
	e6 = -1.0_dp / (rab6 + dmp**6)
	f6ab = -6.0_dp * e6*2 rab2*2 xyzab

	e8 = -1.0_dp / (rab8 + dmp**8)
	f8ab = -8.0_dp * e8*2 rab6 * xyzab
	else
	rav6 = (rs6 * r0ab / rab)**alp6
	dmp6 = 1.0_dp / (1.0_dp + 6.0_dp * rav6)
	e6 = -dmp6 / rab6
	f6ab = 6.0_dp * (xyzab * e6 / rab2 &
	+ dmp6*2 xyzab * alp6 * rav6 / rab8)

	rav8 = (rs18 * r0ab / rab)**alp8
	dmp8 = 1.0_dp / (1.0_dp + 6.0_dp * rav8)
	e8 = -dmp8 / rab8
	f8ab = 8.0_dp * (xyzab * e8 / rab2 &
	+ dmp8*2 xyzab * alp8 * rav8 / (rab2 * rab8))
	endif

	e6 = e6 * s6 * self
	e8 = e8 * s18 * self

	f6ab = c6ab * f6ab * s6 * self
	f8ab = c8 * f8ab * s18 * self

	e_t(myid) = e_t(myid) + c6ab * e6 + c8 * e8

	if (a /= b) then
	f_t(:, a, myid) = f_t(:, a, myid) - f6ab - f8ab
	f_t(:, b, myid) = f_t(:, b, myid) + f6ab + f8ab
	endif
	f_t(:, :, myid) = f_t(:, :, myid) - dc6ab * e6 - dc8 * e8

	s_t(:, :, myid) = s_t(:, :, myid) + sc6ab * e6 + sc8 * e8 &
	+ outer_prod(3, xyzab, f6ab + f8ab)

	if (threebody .and. (rab2 < rcutcn2)) then
	do ii = -images_3b(1), images_3b(1)
	do jj = -images_3b(2), images_3b(2)
	do kk = -images_3b(3), images_3b(3)

	if ((ii == 0) .and. (jj == 0) .and. (kk == 0)) then
	central = .true.
	tvec_3b = 0
	else
	central = .false.
	tvec_3b = matmul((/ii, jj, kk/), cell)
	endif

	if ((i == ii) .and. (j == jj) .and. (k == kk)) then
	start_3b = b + 1
	else
	start_3b = b
	endif


	do c = start_3b, natoms
	if ((central) .and. (c == a)) cycle
	xyzc = xyz(:, c) + tvec_3b
	xyzac = xyzc - xyza
	rac2 = dot_product(xyzac, xyzac)
	if (rac2 > rcutcn2) cycle
	xyzbc = xyzc - xyzb
	rbc2 = dot_product(xyzbc, xyzbc)
	if (rbc2 > rcutcn2) cycle

	rab3 = rab2 * rab
	rac = sqrt(rac2)
	rac3 = rac2 * rac
	rbc = sqrt(rbc2)
	rbc3 = rbc2 * rbc

	nc = atomnumber(c)

	if (na < nc) then
	elemac = max_elem * (na - 1) - na * (na - 1)/2 + nc
	else
	elemac = max_elem * (nc - 1) - nc * (nc - 1)/2 + na
	endif

	if (nb < nc) then
	elembc = max_elem * (nb - 1) - nb * (nb - 1)/2 + nc
	else
	elembc = max_elem * (nc - 1) - nc * (nc - 1)/2 + nb
	endif

	r0ac = r0(elemac)
	r0bc = r0(elembc)

	c6ac = c6(c, a)
	dc6ac = dc6(:, :, c, a)
	sc6ac = sc6(:, :, c, a)

	c6bc = c6(c, b)
	dc6bc = dc6(:, :, c, b)
	sc6bc = sc6(:, :, c, b)

	if ((a == b) .or. (a == c) .or. (b == c)) then
	if ((a == b) .and. (a == c)) then
	self = 1.0_dp / 6.0_dp
	else
	self = 1.0_dp / 2.0_dp
	endif
	else
	self = 1.0_dp
	endif

	alph = dot_product(xyzab, xyzac) / (rab * rac)
	beta = -dot_product(xyzab, xyzbc) / (rab * rbc)
	gamm = dot_product(xyzac, xyzbc) / (rac * rbc)

	! Gradient of alph, beta, gamm. Very complicated...
	! Figured this all out using Mathematica and defining
	! alph = dot_product(xyzab,xyzac)/(rab * rac), etc.
	daalph = alph * (xyzac / rac2 + xyzab / rab2) &
	- (xyzac + xyzab) / (rab * rac)
	dabeta = xyzbc / (rab * rbc) + xyzab * beta / rab2
	dagamm = -xyzbc / (rac * rbc) + xyzac * gamm / rac2

	dbalph = xyzac / (rab * rac) - xyzab * alph / rab2
	dbbeta = beta * (xyzbc / rbc2 - xyzab / rab2) &
	+ (xyzab - xyzbc) / (rab * rbc)
	dbgamm = -xyzac / (rac * rbc) + xyzbc * gamm / rbc2

	dcalph = xyzab / (rab * rac) - xyzac * alph / rac2
	dcbeta = -xyzab / (rab * rbc) - xyzbc * beta / rbc2
	dcgamm = -gamm * (xyzac / rac2 + xyzbc / rbc2) &
	+ (xyzac + xyzbc) / (rac * rbc)

	! I have no idea what 'rav' stands for, but that's what Grimme
	! called this variable. Cube root of the product of the
	! ratios of r0ab/rab, times 4/3 for some reason. I don't know.
	rav9 = (4.0_dp/3.0_dp) * (r0ab * r0ac * r0bc &
	/ (rab * rbc * rac))**(1.0_dp/3.0_dp)
	darav9 = (rav9/3.0_dp) * (xyzab / rab2 + xyzac / rac2)
	dbrav9 = (rav9/3.0_dp) * (-xyzab / rab2 + xyzbc / rbc2)
	dcrav9 = -(rav9/3.0_dp) * (xyzac / rac2 + xyzbc / rbc2)

	! Three-body term always uses "zero" damping, even if
	! we are using the BJ version of DFT-D3
	dmp9 = 1.0_dp/(1.0_dp + 6.0_dp * rav9**alp8)
	ddmp9 = -6.0_dp * alp8 * rav9*(alp8-1) dmp9**2

	! Three-body depends on "average" r^9
	r9 = 1.0_dp / (rab3 * rac3 * rbc3)
	dar9 = 3.0_dp * r9 * (xyzab / rab2 + xyzac / rac2)
	dbr9 = 3.0_dp * r9 * (-xyzab / rab2 + xyzbc / rbc2)
	dcr9 = -3.0_dp * r9 * (xyzac / rac2 + xyzbc / rbc2)

	! Angle term of the three body energy, and its gradient
	angles = 3.0_dp * alph * beta * gamm + 1.0_dp
	daangles = 3.0_dp * (daalph * beta * gamm &
	+ alph * dabeta * gamm &
	+ alph * beta * dagamm)
	dbangles = 3.0_dp * (dbalph * beta * gamm &
	+ alph * dbbeta * gamm &
	+ alph * beta * dbgamm)
	dcangles = 3.0_dp * (dcalph * beta * gamm &
	+ alph * dcbeta * gamm &
	+ alph * beta * dcgamm)

	! Damping derivatives
	dadmp9 = ddmp9 * darav9
	dbdmp9 = ddmp9 * dbrav9
	dcdmp9 = ddmp9 * dcrav9

	! Three-body energy
	e9 = -angles * dmp9 * r9 * self

	c9 = -sqrt(c6ab * c6ac * c6bc / Hartree)

	! Forces
	fa9 = c9 * (-daangles * dmp9 * r9 &
	- angles * dadmp9 * r9 &
	- angles * dmp9 * dar9) * self

	fb9 = c9 * (-dbangles * dmp9 * r9 &
	- angles * dbdmp9 * r9 &
	- angles * dmp9 * dbr9) * self

	fc9 = c9 * (-dcangles * dmp9 * r9 &
	- angles * dcdmp9 * r9 &
	- angles * dmp9 * dcr9) * self


	dc9 = (dc6ab * c6ac * c6bc &
	+ c6ab * dc6ac * c6bc &
	+ c6ab * c6ac * dc6bc) &
	/ (2 * Hartree * c9)

	sc9 = (sc6ab * c6ac * c6bc &
	+ c6ab * sc6ac * c6bc &
	+ c6ab * c6ac * sc6bc) &
	/ (2 * Hartree * c9)

	e_t(myid) = e_t(myid) + c9 * e9

	f_t(:, a, myid) = f_t(:, a, myid) - fa9
	f_t(:, b, myid) = f_t(:, b, myid) - fb9
	f_t(:, c, myid) = f_t(:, c, myid) - fc9
	f_t(:, :, myid) = f_t(:, :, myid) - dc9 * e9

	s_t(:, :, myid) = s_t(:, :, myid) + sc9 * e9 &
	- spread(xyza, 1, 3) * spread(fa9, 2, 3) &
	- spread(xyzb, 1, 3) * spread(fb9, 2, 3) &
	- spread(xyzc, 1, 3) * spread(fc9, 2, 3)
	enddo
	enddo
	enddo
	enddo
	endif
	enddo
	enddo
	enddo
	enddo
	enddo
	!$OMP END DO
	!$OMP END PARALLEL
	energy = sum(e_t)
	forces = sum(f_t, 3)
	stress = sum(s_t, 3)
	deallocate(e_t)
	deallocate(f_t)
	deallocate(s_t)

	stress = -(stress + transpose(stress)) / (2.0_dp * V)
	end subroutine d3_calc
	end module d3_fort
No results found