Created
February 4, 2016 10:01
-
-
Save perdacherMartin/c1c132a9f8e75320dfc2 to your computer and use it in GitHub Desktop.
AVX Christian
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <iostream> | |
| #include <x86intrin.h> | |
| #include <immintrin.h> | |
| #include <stdlib.h> | |
| #include "cholesky.h" | |
| #include <boost/timer/timer.hpp> | |
| using namespace std; | |
| // typedef __m256d T_dvec; | |
| extern "C" void dpotf2_(char* uplo, int* n, double* a, int* lda, int* info); | |
| int main() { | |
| int d = 4096; | |
| double* amatrix = InitDdMatrix(d); | |
| double* bmatrix = (double*) aligned_alloc(ALIGNMENT, d * d * sizeof(double) ); | |
| memcpy(bmatrix,amatrix, d*d*sizeof(double)); | |
| PrintMatrix(d, amatrix); | |
| boost::timer::cpu_timer timer; | |
| choleskyRek(d,amatrix,0,0,2048); | |
| std::cout << "cholesky recursive:" << timer.format() << '\n'; | |
| PrintMatrix(d, amatrix); | |
| free(amatrix); | |
| PrintMatrix(d, bmatrix); | |
| int info=0; | |
| char uplo='U'; | |
| boost::timer::cpu_timer timer2; | |
| dpotf2_(&uplo, &d, bmatrix, &d, &info); | |
| std::cout << "lapack routine:" << timer2.format() << '\n'; | |
| PrintMatrix(d, bmatrix); | |
| free(bmatrix); | |
| std::cout << "Info = " << info << std::endl; | |
| return 0; | |
| } | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| * cholesky.c | |
| * | |
| * Created on: 28.01.2016 | |
| * Author: martin | |
| */ | |
| #include "cholesky.h" | |
| /** | |
| * creates a diagonal dominant matrix, which is aligned in the memory | |
| * a diagonal dominant matrix is positive definite | |
| */ | |
| double* InitDdMatrix(const size_t N){ | |
| double* matrix = (double*) aligned_alloc(ALIGNMENT, N * N * sizeof(double) ); | |
| double rnumber; | |
| int i,j; | |
| //srand48(); | |
| for ( i = 0 ; i < N ; i++ ){ | |
| rnumber = (double)rand() / (double)RAND_MAX; | |
| matrix[i*N + i] = 5000.0 + 5.0 * rnumber; | |
| } | |
| for ( i = 0 ; i < N ; i++ ){ | |
| for ( j = 0 ; j < i ; j++ ){ | |
| rnumber = (double)rand() / (double)RAND_MAX; | |
| matrix[i*N + j] = rnumber; | |
| matrix[j*N + i] = matrix[i*N + j]; | |
| } | |
| } | |
| return matrix; | |
| } | |
| /* | |
| * creates a small and simple matrix to play around with AVX | |
| */ | |
| double* InitMatrix(const size_t N){ | |
| double* matrix = (double *) aligned_alloc(ALIGNMENT, N * N * sizeof (double) ); | |
| int i,j; | |
| for ( i = 0 ; i < N ; i++ ){ | |
| for ( j = 0 ; j < N ; j++ ){ | |
| matrix[i*N+j] = i*N+j; | |
| } | |
| } | |
| return matrix; | |
| } | |
| /** | |
| * print AVX vector | |
| */ | |
| void PrintVec(double* res){ | |
| printf("%f %f %f %f\n", res[0], res[1], res[2], res[3]); | |
| } | |
| /** | |
| * print matrix | |
| */ | |
| void PrintMatrix(const int d, double *A) { | |
| for (int i=0 ; i<16 ; i++){ | |
| for (int j=0 ; j<16 ; j++) | |
| printf("%05.5f ", A[i*d+j]); | |
| printf("\n") ; | |
| } | |
| for (int i=d-4 ; i<d ; i++){ | |
| printf(" ") ; | |
| for (int j=d-4 ; j<d ; j++) | |
| printf("%05.5f ", A[i*d+j]); | |
| printf("\n") ; | |
| } | |
| printf("\n"); | |
| } | |
| void choleskyRek(int d, double *A, int i, int j, int t) { | |
| if(t<4) { | |
| // T_vec r0 = _mm256_setzero_pd() ; | |
| double dest __attribute__((vector_size(32),aligned(32))); | |
| double * di = (double *) &dest ; | |
| //printf("Triangle %5d %5d\n", i, j) ; | |
| // i == j !!! teilbar durch 4 !!! | |
| T_vec sum0 = _mm256_setzero_pd() ; | |
| T_vec vi ; | |
| for (int k=0 ; k<i ; k+=4) { | |
| vi = _mm256_load_pd(A + d*i + k) ; | |
| sum0 += vi * vi ; | |
| } | |
| _mm256_store_pd(di, sum0) ; | |
| A[i*d+i] = sqrt(A[i*d+i]-di[0]-di[1]-di[2]-di[3]) ; | |
| // ------------ | |
| sum0 = _mm256_setzero_pd() ; | |
| for (int k=0 ; k<i ; k+=4) | |
| sum0 += _mm256_load_pd(A + d*i + k) * _mm256_load_pd(A + d*(i+1) + k) ; | |
| _mm256_store_pd(di, sum0) ; | |
| A[(i+1)*d+i] = (A[(i+1)*d+i]-di[0]-di[1]-di[2]-di[3])/A[i*d+i] ; | |
| // ------------- | |
| sum0 = _mm256_setzero_pd() ; | |
| for (int k=0 ; k<i ; k+=4) { | |
| vi = _mm256_load_pd(A + d*(i+1) + k) ; | |
| sum0 += vi * vi ; | |
| } | |
| _mm256_store_pd(di, sum0) ; | |
| A[(i+1)*d+(i+1)] = sqrt(A[(i+1)*d+(i+1)]-di[0]-di[1]-di[2]-di[3] | |
| -A[d*(i+1)+i]*A[d*(i+1)+i]) ; | |
| // ------------- | |
| sum0 = _mm256_setzero_pd() ; | |
| for (int k=0 ; k<i ; k+=4) { | |
| vi = _mm256_load_pd(A + d*i + k) ; | |
| sum0 += _mm256_hadd_pd( | |
| vi * _mm256_load_pd(A + d*(i+2) + k), | |
| vi * _mm256_load_pd(A + d*(i+3) + k)) ; | |
| } | |
| _mm256_store_pd(di, sum0) ; | |
| A[(i+2)*d+i] = (A[(i+2)*d+i]-di[0]-di[2]) / A[i*d+i] ; | |
| A[(i+3)*d+i] = (A[(i+3)*d+i]-di[1]-di[3]) / A[i*d+i] ; | |
| sum0 = _mm256_setzero_pd() ; | |
| for (int k=0 ; k<i ; k+=4) { | |
| vi = _mm256_load_pd(A + d*(i+1) + k) ; | |
| sum0 += _mm256_hadd_pd( | |
| vi * _mm256_load_pd(A + d*(i+2) + k), | |
| vi * _mm256_load_pd(A + d*(i+3) + k)) ; | |
| } | |
| _mm256_store_pd(di, sum0) ; | |
| A[(i+2)*d+i+1] = (A[(i+2)*d+i+1]-di[0]-di[2]-A[d*(i+1)+i]*A[d*(i+2)+i]) / A[(i+1)*d+i+1] ; | |
| A[(i+3)*d+i+1] = (A[(i+3)*d+i+1]-di[1]-di[3]-A[d*(i+1)+i]*A[d*(i+3)+i]) / A[(i+1)*d+i+1] ; | |
| // ------------- | |
| sum0 = _mm256_setzero_pd() ; | |
| for (int k=0 ; k<i ; k+=4) { | |
| vi = _mm256_load_pd(A + d*(i+2) + k) ; | |
| sum0 += vi * vi ; | |
| } | |
| _mm256_store_pd(di, sum0) ; | |
| A[(i+2)*d+(i+2)] = sqrt(A[(i+2)*d+(i+2)]-di[0]-di[1]-di[2]-di[3] | |
| -A[d*(i+2)+i]*A[d*(i+2)+i]-A[d*(i+2)+i+1]*A[d*(i+2)+i+1]) ; | |
| // ------------ | |
| sum0 = _mm256_setzero_pd() ; | |
| for (int k=0 ; k<i ; k+=4) | |
| sum0 += _mm256_load_pd(A + d*(i+2) + k) * _mm256_load_pd(A + d*(i+3) + k) ; | |
| _mm256_store_pd(di, sum0) ; | |
| A[(i+3)*d+i+2] = (A[(i+3)*d+i+2]-di[0]-di[1]-di[2]-di[3] | |
| -A[(i+3)*d+i]*A[(i+2)*d+i]-A[(i+3)*d+i+1]*A[(i+2)*d+i+1])/A[(i+2)*d+i+2] ; | |
| // ------------- | |
| sum0 = _mm256_setzero_pd() ; | |
| for (int k=0 ; k<i ; k+=4) { | |
| vi = _mm256_load_pd(A + d*(i+3) + k) ; | |
| sum0 += vi * vi ; | |
| } | |
| _mm256_store_pd(di, sum0) ; | |
| A[(i+3)*d+(i+3)] = sqrt(A[(i+3)*d+(i+3)]-di[0]-di[1]-di[2]-di[3] | |
| -A[d*(i+3)+i]*A[d*(i+3)+i]-A[d*(i+3)+i+1]*A[d*(i+3)+i+1] | |
| -A[d*(i+3)+i+2]*A[d*(i+3)+i+2]) ; | |
| } else { | |
| choleskyRek(d, A, i, j, t/2) ; | |
| //printf("%3d x%4d Block %5d %5d\n",t,t,i+t,j) ; | |
| if (t<=16) | |
| block(d, A, i+t, i+2*t, j, j+t) ; | |
| else { | |
| // #pragma omp parallel num_threads(4) | |
| // #pragma omp for | |
| for(int p=0 ; p<4 ; p++) | |
| block(d, A, i+t+p*t/4, i+(5+p)*t/4, j, j+t) ; | |
| } | |
| choleskyRek(d, A, i+t, j+t, t/2) ; | |
| } | |
| } | |
| void block(const int d, double * A, int imin, int imax, int jmin, int jmax) { | |
| // assert jmax <= imin | |
| // d and (jmax-jmin) are multiples of 4 (A is blank-padded) | |
| double dest __attribute__((vector_size(32),aligned(32))); | |
| double * di = (double *) & dest ; | |
| jmax = (jmax < d ? jmax : d) ; | |
| imax = (imax < d ? imax : d) ; | |
| T_vec r0 = _mm256_setzero_pd() ; | |
| T_vec vj, vi0, vi1, vi2, vi3, sum0, sum1 ; | |
| int k ; | |
| for (int i=imin ; i<imax ; i+=4) { | |
| for (int j=jmin ; j<jmax ; j++) { // j increased 3 times inside loop body | |
| sum0 = _mm256_setzero_pd() ; | |
| sum1 = _mm256_setzero_pd() ; | |
| for (k=0 ; k<=j-4 ; k+=4) { | |
| vj = _mm256_load_pd(A + d*j + k) ; | |
| vi0 = _mm256_load_pd(A + d*i + k) ; | |
| vi1 = _mm256_load_pd(A + d*(i+1) + k); | |
| vi2 = _mm256_load_pd(A + d*(i+2) + k); | |
| vi3 = _mm256_load_pd(A + d*(i+3) + k); | |
| sum0 += _mm256_hadd_pd(vj*vi0, vj*vi1); | |
| sum1 += _mm256_hadd_pd(vj*vi2, vj*vi3); | |
| } | |
| /* | |
| vj = _mm256_load_pd(A + d*j + k) ; | |
| vj = _mm256_blend_pd(r0, vj, 1) ; | |
| vi0 = _mm256_load_pd(A + d*i + k) ; | |
| vi1 = _mm256_load_pd(A + d*(i+1) + k) ; | |
| vi2 = _mm256_load_pd(A + d*(i+2) + k) ; | |
| vi3 = _mm256_load_pd(A + d*(i+3) + k) ; | |
| sum0 += _mm256_hadd_pd(vj*vi0, vj*vi1); | |
| sum1 += _mm256_hadd_pd(vj*vi2, vj*vi3); | |
| */ | |
| di[0] = A[d*i+j] ; | |
| di[1] = A[d*(i+1)+j]; | |
| di[2] = A[d*(i+2)+j]; | |
| di[3] = A[d*(i+3)+j]; | |
| _mm256_store_pd(di, | |
| (_mm256_load_pd(di) | |
| - _mm256_blend_pd(sum0, sum1, 0b1100) | |
| - _mm256_permute2f128_pd(sum0, sum1, 0x21)) | |
| / _mm256_broadcast_sd(A+d*j+j)) ; | |
| A[d*i+j] = di[0] ; | |
| A[d*(i+1)+j] = di[1] ; | |
| A[d*(i+2)+j] = di[2] ; | |
| A[d*(i+3)+j] = di[3] ; | |
| j++ ; | |
| // ---------------------------------- | |
| sum0 = _mm256_setzero_pd() ; | |
| sum1 = _mm256_setzero_pd() ; | |
| for (k=0 ; k<=j-4 ; k+=4) { | |
| vj = _mm256_load_pd(A + d*j + k) ; | |
| vi0 = _mm256_load_pd(A + d*i + k) ; | |
| vi1 = _mm256_load_pd(A + d*(i+1) + k) ; | |
| vi2 = _mm256_load_pd(A + d*(i+2) + k) ; | |
| vi3 = _mm256_load_pd(A + d*(i+3) + k) ; | |
| sum0 += _mm256_hadd_pd(vj*vi0, vj*vi1); | |
| sum1 += _mm256_hadd_pd(vj*vi2, vj*vi3); | |
| } | |
| vj = _mm256_load_pd(A + d*j + k) ; | |
| vj = _mm256_blend_pd(r0, vj, 1) ; | |
| vi0 = _mm256_load_pd(A + d*i + k) ; | |
| vi1 = _mm256_load_pd(A + d*(i+1) + k) ; | |
| vi2 = _mm256_load_pd(A + d*(i+2) + k) ; | |
| vi3 = _mm256_load_pd(A + d*(i+3) + k) ; | |
| sum0 += _mm256_hadd_pd(vj*vi0, vj*vi1); | |
| sum1 += _mm256_hadd_pd(vj*vi2, vj*vi3); | |
| di[0] = A[d*i+j] ; | |
| di[1] = A[d*(i+1)+j] ; | |
| di[2] = A[d*(i+2)+j] ; | |
| di[3] = A[d*(i+3)+j] ; | |
| _mm256_store_pd(di, | |
| (_mm256_load_pd(di) | |
| - _mm256_blend_pd(sum0, sum1, 0b1100) | |
| - _mm256_permute2f128_pd(sum0, sum1, 0x21)) | |
| / _mm256_broadcast_sd(A+d*j+j)) ; | |
| A[d*i+j] = di[0] ; | |
| A[d*(i+1)+j] = di[1] ; | |
| A[d*(i+2)+j] = di[2] ; | |
| A[d*(i+3)+j] = di[3] ; | |
| j++ ; | |
| // ---------------------------------- | |
| sum0 = _mm256_setzero_pd() ; | |
| sum1 = _mm256_setzero_pd() ; | |
| for (k=0 ; k<=j-4 ; k+=4) { | |
| vj = _mm256_load_pd(A + d*j + k) ; | |
| vi0 = _mm256_load_pd(A + d*i + k) ; | |
| vi1 = _mm256_load_pd(A + d*(i+1) + k) ; | |
| vi2 = _mm256_load_pd(A + d*(i+2) + k) ; | |
| vi3 = _mm256_load_pd(A + d*(i+3) + k) ; | |
| sum0 += _mm256_hadd_pd(vj*vi0, vj*vi1); | |
| sum1 += _mm256_hadd_pd(vj*vi2, vj*vi3); | |
| } | |
| vj = _mm256_load_pd(A + d*j + k) ; | |
| vj = _mm256_blend_pd(r0, vj, 3) ; | |
| vi0 = _mm256_load_pd(A + d*i + k) ; | |
| vi1 = _mm256_load_pd(A + d*(i+1) + k) ; | |
| vi2 = _mm256_load_pd(A + d*(i+2) + k) ; | |
| vi3 = _mm256_load_pd(A + d*(i+3) + k) ; | |
| sum0 += _mm256_hadd_pd(vj*vi0, vj*vi1); | |
| sum1 += _mm256_hadd_pd(vj*vi2, vj*vi3); | |
| di[0] = A[d*i+j] ; | |
| di[1] = A[d*(i+1)+j] ; | |
| di[2] = A[d*(i+2)+j] ; | |
| di[3] = A[d*(i+3)+j] ; | |
| _mm256_store_pd(di, | |
| (_mm256_load_pd(di) | |
| - _mm256_blend_pd(sum0, sum1, 0b1100) | |
| - _mm256_permute2f128_pd(sum0, sum1, 0x21)) | |
| / _mm256_broadcast_sd(A+d*j+j)) ; | |
| A[d*i+j] = di[0] ; | |
| A[d*(i+1)+j] = di[1] ; | |
| A[d*(i+2)+j] = di[2] ; | |
| A[d*(i+3)+j] = di[3] ; | |
| j++ ; | |
| // ---------------------------------- | |
| sum0 = _mm256_setzero_pd() ; | |
| sum1 = _mm256_setzero_pd() ; | |
| for (k=0 ; k<=j-4 ; k+=4) { | |
| vj = _mm256_load_pd(A + d*j + k) ; | |
| vi0 = _mm256_load_pd(A + d*i + k) ; | |
| vi1 = _mm256_load_pd(A + d*(i+1) + k) ; | |
| vi2 = _mm256_load_pd(A + d*(i+2) + k) ; | |
| vi3 = _mm256_load_pd(A + d*(i+3) + k) ; | |
| sum0 += _mm256_hadd_pd(vj*vi0, vj*vi1); | |
| sum1 += _mm256_hadd_pd(vj*vi2, vj*vi3); | |
| } | |
| vj = _mm256_load_pd(A + d*j + k) ; | |
| vj = _mm256_blend_pd(r0, vj, 7) ; | |
| vi0 = _mm256_load_pd(A + d*i + k) ; | |
| vi1 = _mm256_load_pd(A + d*(i+1) + k) ; | |
| vi2 = _mm256_load_pd(A + d*(i+2) + k) ; | |
| vi3 = _mm256_load_pd(A + d*(i+3) + k) ; | |
| sum0 += _mm256_hadd_pd(vj*vi0, vj*vi1); | |
| sum1 += _mm256_hadd_pd(vj*vi2, vj*vi3); | |
| di[0] = A[d*i+j] ; | |
| di[1] = A[d*(i+1)+j] ; | |
| di[2] = A[d*(i+2)+j] ; | |
| di[3] = A[d*(i+3)+j] ; | |
| _mm256_store_pd(di, | |
| (_mm256_load_pd(di) | |
| - _mm256_blend_pd(sum0, sum1, 0b1100) | |
| - _mm256_permute2f128_pd(sum0, sum1, 0x21)) | |
| / _mm256_broadcast_sd(A+d*j+j)) ; | |
| A[d*i+j] = di[0] ; | |
| A[d*(i+1)+j] = di[1] ; | |
| A[d*(i+2)+j] = di[2] ; | |
| A[d*(i+3)+j] = di[3] ; | |
| } | |
| } | |
| } | |
| double * mallocA64(size_t s) { | |
| long long adr = (long long) malloc(s + 64); | |
| adr = (adr + 63) & ~63; | |
| return (double *) adr; | |
| } | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| * cholesky.h | |
| * | |
| * Created on: 28.01.2016 | |
| * Author: martin | |
| */ | |
| #ifndef CHOLESKY_H_ | |
| #define CHOLESKY_H_ | |
| // #define _XOPEN_SOURCE | |
| #ifdef __cplusplus | |
| extern "C" { | |
| #endif | |
| #include <stdlib.h> | |
| #include <x86intrin.h> | |
| #include <stdio.h> | |
| #include <stdlib.h> | |
| #include <math.h> | |
| int const ALIGNMENT = 32; | |
| typedef __m256d T_vec; | |
| double* InitMatrix(const size_t N); | |
| double* InitDdMatrix(const size_t N); | |
| void PrintVec(double* res); | |
| void PrintMatrix(const int d, double *A); | |
| void block(const int d, double * A, int imin, int imax, int jmin, int jmax); | |
| void choleskyRek(int d, double *A, int i, int j, int t); | |
| double * mallocA64(size_t s); | |
| #ifdef __cplusplus | |
| } | |
| #endif | |
| #endif /* CHOLESKY_H_ */ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment