Skip to content

Instantly share code, notes, and snippets.

@perdacherMartin
Created February 4, 2016 10:01
Show Gist options
  • Select an option

  • Save perdacherMartin/c1c132a9f8e75320dfc2 to your computer and use it in GitHub Desktop.

Select an option

Save perdacherMartin/c1c132a9f8e75320dfc2 to your computer and use it in GitHub Desktop.
AVX Christian
#include <iostream>
#include <x86intrin.h>
#include <immintrin.h>
#include <stdlib.h>
#include "cholesky.h"
#include <boost/timer/timer.hpp>
using namespace std;
// typedef __m256d T_dvec;
extern "C" void dpotf2_(char* uplo, int* n, double* a, int* lda, int* info);
int main() {
int d = 4096;
double* amatrix = InitDdMatrix(d);
double* bmatrix = (double*) aligned_alloc(ALIGNMENT, d * d * sizeof(double) );
memcpy(bmatrix,amatrix, d*d*sizeof(double));
PrintMatrix(d, amatrix);
boost::timer::cpu_timer timer;
choleskyRek(d,amatrix,0,0,2048);
std::cout << "cholesky recursive:" << timer.format() << '\n';
PrintMatrix(d, amatrix);
free(amatrix);
PrintMatrix(d, bmatrix);
int info=0;
char uplo='U';
boost::timer::cpu_timer timer2;
dpotf2_(&uplo, &d, bmatrix, &d, &info);
std::cout << "lapack routine:" << timer2.format() << '\n';
PrintMatrix(d, bmatrix);
free(bmatrix);
std::cout << "Info = " << info << std::endl;
return 0;
}
/*
* cholesky.c
*
* Created on: 28.01.2016
* Author: martin
*/
#include "cholesky.h"
/**
* creates a diagonal dominant matrix, which is aligned in the memory
* a diagonal dominant matrix is positive definite
*/
double* InitDdMatrix(const size_t N){
double* matrix = (double*) aligned_alloc(ALIGNMENT, N * N * sizeof(double) );
double rnumber;
int i,j;
//srand48();
for ( i = 0 ; i < N ; i++ ){
rnumber = (double)rand() / (double)RAND_MAX;
matrix[i*N + i] = 5000.0 + 5.0 * rnumber;
}
for ( i = 0 ; i < N ; i++ ){
for ( j = 0 ; j < i ; j++ ){
rnumber = (double)rand() / (double)RAND_MAX;
matrix[i*N + j] = rnumber;
matrix[j*N + i] = matrix[i*N + j];
}
}
return matrix;
}
/*
* creates a small and simple matrix to play around with AVX
*/
double* InitMatrix(const size_t N){
double* matrix = (double *) aligned_alloc(ALIGNMENT, N * N * sizeof (double) );
int i,j;
for ( i = 0 ; i < N ; i++ ){
for ( j = 0 ; j < N ; j++ ){
matrix[i*N+j] = i*N+j;
}
}
return matrix;
}
/**
* print AVX vector
*/
void PrintVec(double* res){
printf("%f %f %f %f\n", res[0], res[1], res[2], res[3]);
}
/**
* print matrix
*/
void PrintMatrix(const int d, double *A) {
for (int i=0 ; i<16 ; i++){
for (int j=0 ; j<16 ; j++)
printf("%05.5f ", A[i*d+j]);
printf("\n") ;
}
for (int i=d-4 ; i<d ; i++){
printf(" ") ;
for (int j=d-4 ; j<d ; j++)
printf("%05.5f ", A[i*d+j]);
printf("\n") ;
}
printf("\n");
}
void choleskyRek(int d, double *A, int i, int j, int t) {
if(t<4) {
// T_vec r0 = _mm256_setzero_pd() ;
double dest __attribute__((vector_size(32),aligned(32)));
double * di = (double *) &dest ;
//printf("Triangle %5d %5d\n", i, j) ;
// i == j !!! teilbar durch 4 !!!
T_vec sum0 = _mm256_setzero_pd() ;
T_vec vi ;
for (int k=0 ; k<i ; k+=4) {
vi = _mm256_load_pd(A + d*i + k) ;
sum0 += vi * vi ;
}
_mm256_store_pd(di, sum0) ;
A[i*d+i] = sqrt(A[i*d+i]-di[0]-di[1]-di[2]-di[3]) ;
// ------------
sum0 = _mm256_setzero_pd() ;
for (int k=0 ; k<i ; k+=4)
sum0 += _mm256_load_pd(A + d*i + k) * _mm256_load_pd(A + d*(i+1) + k) ;
_mm256_store_pd(di, sum0) ;
A[(i+1)*d+i] = (A[(i+1)*d+i]-di[0]-di[1]-di[2]-di[3])/A[i*d+i] ;
// -------------
sum0 = _mm256_setzero_pd() ;
for (int k=0 ; k<i ; k+=4) {
vi = _mm256_load_pd(A + d*(i+1) + k) ;
sum0 += vi * vi ;
}
_mm256_store_pd(di, sum0) ;
A[(i+1)*d+(i+1)] = sqrt(A[(i+1)*d+(i+1)]-di[0]-di[1]-di[2]-di[3]
-A[d*(i+1)+i]*A[d*(i+1)+i]) ;
// -------------
sum0 = _mm256_setzero_pd() ;
for (int k=0 ; k<i ; k+=4) {
vi = _mm256_load_pd(A + d*i + k) ;
sum0 += _mm256_hadd_pd(
vi * _mm256_load_pd(A + d*(i+2) + k),
vi * _mm256_load_pd(A + d*(i+3) + k)) ;
}
_mm256_store_pd(di, sum0) ;
A[(i+2)*d+i] = (A[(i+2)*d+i]-di[0]-di[2]) / A[i*d+i] ;
A[(i+3)*d+i] = (A[(i+3)*d+i]-di[1]-di[3]) / A[i*d+i] ;
sum0 = _mm256_setzero_pd() ;
for (int k=0 ; k<i ; k+=4) {
vi = _mm256_load_pd(A + d*(i+1) + k) ;
sum0 += _mm256_hadd_pd(
vi * _mm256_load_pd(A + d*(i+2) + k),
vi * _mm256_load_pd(A + d*(i+3) + k)) ;
}
_mm256_store_pd(di, sum0) ;
A[(i+2)*d+i+1] = (A[(i+2)*d+i+1]-di[0]-di[2]-A[d*(i+1)+i]*A[d*(i+2)+i]) / A[(i+1)*d+i+1] ;
A[(i+3)*d+i+1] = (A[(i+3)*d+i+1]-di[1]-di[3]-A[d*(i+1)+i]*A[d*(i+3)+i]) / A[(i+1)*d+i+1] ;
// -------------
sum0 = _mm256_setzero_pd() ;
for (int k=0 ; k<i ; k+=4) {
vi = _mm256_load_pd(A + d*(i+2) + k) ;
sum0 += vi * vi ;
}
_mm256_store_pd(di, sum0) ;
A[(i+2)*d+(i+2)] = sqrt(A[(i+2)*d+(i+2)]-di[0]-di[1]-di[2]-di[3]
-A[d*(i+2)+i]*A[d*(i+2)+i]-A[d*(i+2)+i+1]*A[d*(i+2)+i+1]) ;
// ------------
sum0 = _mm256_setzero_pd() ;
for (int k=0 ; k<i ; k+=4)
sum0 += _mm256_load_pd(A + d*(i+2) + k) * _mm256_load_pd(A + d*(i+3) + k) ;
_mm256_store_pd(di, sum0) ;
A[(i+3)*d+i+2] = (A[(i+3)*d+i+2]-di[0]-di[1]-di[2]-di[3]
-A[(i+3)*d+i]*A[(i+2)*d+i]-A[(i+3)*d+i+1]*A[(i+2)*d+i+1])/A[(i+2)*d+i+2] ;
// -------------
sum0 = _mm256_setzero_pd() ;
for (int k=0 ; k<i ; k+=4) {
vi = _mm256_load_pd(A + d*(i+3) + k) ;
sum0 += vi * vi ;
}
_mm256_store_pd(di, sum0) ;
A[(i+3)*d+(i+3)] = sqrt(A[(i+3)*d+(i+3)]-di[0]-di[1]-di[2]-di[3]
-A[d*(i+3)+i]*A[d*(i+3)+i]-A[d*(i+3)+i+1]*A[d*(i+3)+i+1]
-A[d*(i+3)+i+2]*A[d*(i+3)+i+2]) ;
} else {
choleskyRek(d, A, i, j, t/2) ;
//printf("%3d x%4d Block %5d %5d\n",t,t,i+t,j) ;
if (t<=16)
block(d, A, i+t, i+2*t, j, j+t) ;
else {
// #pragma omp parallel num_threads(4)
// #pragma omp for
for(int p=0 ; p<4 ; p++)
block(d, A, i+t+p*t/4, i+(5+p)*t/4, j, j+t) ;
}
choleskyRek(d, A, i+t, j+t, t/2) ;
}
}
void block(const int d, double * A, int imin, int imax, int jmin, int jmax) {
// assert jmax <= imin
// d and (jmax-jmin) are multiples of 4 (A is blank-padded)
double dest __attribute__((vector_size(32),aligned(32)));
double * di = (double *) & dest ;
jmax = (jmax < d ? jmax : d) ;
imax = (imax < d ? imax : d) ;
T_vec r0 = _mm256_setzero_pd() ;
T_vec vj, vi0, vi1, vi2, vi3, sum0, sum1 ;
int k ;
for (int i=imin ; i<imax ; i+=4) {
for (int j=jmin ; j<jmax ; j++) { // j increased 3 times inside loop body
sum0 = _mm256_setzero_pd() ;
sum1 = _mm256_setzero_pd() ;
for (k=0 ; k<=j-4 ; k+=4) {
vj = _mm256_load_pd(A + d*j + k) ;
vi0 = _mm256_load_pd(A + d*i + k) ;
vi1 = _mm256_load_pd(A + d*(i+1) + k);
vi2 = _mm256_load_pd(A + d*(i+2) + k);
vi3 = _mm256_load_pd(A + d*(i+3) + k);
sum0 += _mm256_hadd_pd(vj*vi0, vj*vi1);
sum1 += _mm256_hadd_pd(vj*vi2, vj*vi3);
}
/*
vj = _mm256_load_pd(A + d*j + k) ;
vj = _mm256_blend_pd(r0, vj, 1) ;
vi0 = _mm256_load_pd(A + d*i + k) ;
vi1 = _mm256_load_pd(A + d*(i+1) + k) ;
vi2 = _mm256_load_pd(A + d*(i+2) + k) ;
vi3 = _mm256_load_pd(A + d*(i+3) + k) ;
sum0 += _mm256_hadd_pd(vj*vi0, vj*vi1);
sum1 += _mm256_hadd_pd(vj*vi2, vj*vi3);
*/
di[0] = A[d*i+j] ;
di[1] = A[d*(i+1)+j];
di[2] = A[d*(i+2)+j];
di[3] = A[d*(i+3)+j];
_mm256_store_pd(di,
(_mm256_load_pd(di)
- _mm256_blend_pd(sum0, sum1, 0b1100)
- _mm256_permute2f128_pd(sum0, sum1, 0x21))
/ _mm256_broadcast_sd(A+d*j+j)) ;
A[d*i+j] = di[0] ;
A[d*(i+1)+j] = di[1] ;
A[d*(i+2)+j] = di[2] ;
A[d*(i+3)+j] = di[3] ;
j++ ;
// ----------------------------------
sum0 = _mm256_setzero_pd() ;
sum1 = _mm256_setzero_pd() ;
for (k=0 ; k<=j-4 ; k+=4) {
vj = _mm256_load_pd(A + d*j + k) ;
vi0 = _mm256_load_pd(A + d*i + k) ;
vi1 = _mm256_load_pd(A + d*(i+1) + k) ;
vi2 = _mm256_load_pd(A + d*(i+2) + k) ;
vi3 = _mm256_load_pd(A + d*(i+3) + k) ;
sum0 += _mm256_hadd_pd(vj*vi0, vj*vi1);
sum1 += _mm256_hadd_pd(vj*vi2, vj*vi3);
}
vj = _mm256_load_pd(A + d*j + k) ;
vj = _mm256_blend_pd(r0, vj, 1) ;
vi0 = _mm256_load_pd(A + d*i + k) ;
vi1 = _mm256_load_pd(A + d*(i+1) + k) ;
vi2 = _mm256_load_pd(A + d*(i+2) + k) ;
vi3 = _mm256_load_pd(A + d*(i+3) + k) ;
sum0 += _mm256_hadd_pd(vj*vi0, vj*vi1);
sum1 += _mm256_hadd_pd(vj*vi2, vj*vi3);
di[0] = A[d*i+j] ;
di[1] = A[d*(i+1)+j] ;
di[2] = A[d*(i+2)+j] ;
di[3] = A[d*(i+3)+j] ;
_mm256_store_pd(di,
(_mm256_load_pd(di)
- _mm256_blend_pd(sum0, sum1, 0b1100)
- _mm256_permute2f128_pd(sum0, sum1, 0x21))
/ _mm256_broadcast_sd(A+d*j+j)) ;
A[d*i+j] = di[0] ;
A[d*(i+1)+j] = di[1] ;
A[d*(i+2)+j] = di[2] ;
A[d*(i+3)+j] = di[3] ;
j++ ;
// ----------------------------------
sum0 = _mm256_setzero_pd() ;
sum1 = _mm256_setzero_pd() ;
for (k=0 ; k<=j-4 ; k+=4) {
vj = _mm256_load_pd(A + d*j + k) ;
vi0 = _mm256_load_pd(A + d*i + k) ;
vi1 = _mm256_load_pd(A + d*(i+1) + k) ;
vi2 = _mm256_load_pd(A + d*(i+2) + k) ;
vi3 = _mm256_load_pd(A + d*(i+3) + k) ;
sum0 += _mm256_hadd_pd(vj*vi0, vj*vi1);
sum1 += _mm256_hadd_pd(vj*vi2, vj*vi3);
}
vj = _mm256_load_pd(A + d*j + k) ;
vj = _mm256_blend_pd(r0, vj, 3) ;
vi0 = _mm256_load_pd(A + d*i + k) ;
vi1 = _mm256_load_pd(A + d*(i+1) + k) ;
vi2 = _mm256_load_pd(A + d*(i+2) + k) ;
vi3 = _mm256_load_pd(A + d*(i+3) + k) ;
sum0 += _mm256_hadd_pd(vj*vi0, vj*vi1);
sum1 += _mm256_hadd_pd(vj*vi2, vj*vi3);
di[0] = A[d*i+j] ;
di[1] = A[d*(i+1)+j] ;
di[2] = A[d*(i+2)+j] ;
di[3] = A[d*(i+3)+j] ;
_mm256_store_pd(di,
(_mm256_load_pd(di)
- _mm256_blend_pd(sum0, sum1, 0b1100)
- _mm256_permute2f128_pd(sum0, sum1, 0x21))
/ _mm256_broadcast_sd(A+d*j+j)) ;
A[d*i+j] = di[0] ;
A[d*(i+1)+j] = di[1] ;
A[d*(i+2)+j] = di[2] ;
A[d*(i+3)+j] = di[3] ;
j++ ;
// ----------------------------------
sum0 = _mm256_setzero_pd() ;
sum1 = _mm256_setzero_pd() ;
for (k=0 ; k<=j-4 ; k+=4) {
vj = _mm256_load_pd(A + d*j + k) ;
vi0 = _mm256_load_pd(A + d*i + k) ;
vi1 = _mm256_load_pd(A + d*(i+1) + k) ;
vi2 = _mm256_load_pd(A + d*(i+2) + k) ;
vi3 = _mm256_load_pd(A + d*(i+3) + k) ;
sum0 += _mm256_hadd_pd(vj*vi0, vj*vi1);
sum1 += _mm256_hadd_pd(vj*vi2, vj*vi3);
}
vj = _mm256_load_pd(A + d*j + k) ;
vj = _mm256_blend_pd(r0, vj, 7) ;
vi0 = _mm256_load_pd(A + d*i + k) ;
vi1 = _mm256_load_pd(A + d*(i+1) + k) ;
vi2 = _mm256_load_pd(A + d*(i+2) + k) ;
vi3 = _mm256_load_pd(A + d*(i+3) + k) ;
sum0 += _mm256_hadd_pd(vj*vi0, vj*vi1);
sum1 += _mm256_hadd_pd(vj*vi2, vj*vi3);
di[0] = A[d*i+j] ;
di[1] = A[d*(i+1)+j] ;
di[2] = A[d*(i+2)+j] ;
di[3] = A[d*(i+3)+j] ;
_mm256_store_pd(di,
(_mm256_load_pd(di)
- _mm256_blend_pd(sum0, sum1, 0b1100)
- _mm256_permute2f128_pd(sum0, sum1, 0x21))
/ _mm256_broadcast_sd(A+d*j+j)) ;
A[d*i+j] = di[0] ;
A[d*(i+1)+j] = di[1] ;
A[d*(i+2)+j] = di[2] ;
A[d*(i+3)+j] = di[3] ;
}
}
}
double * mallocA64(size_t s) {
long long adr = (long long) malloc(s + 64);
adr = (adr + 63) & ~63;
return (double *) adr;
}
/*
* cholesky.h
*
* Created on: 28.01.2016
* Author: martin
*/
#ifndef CHOLESKY_H_
#define CHOLESKY_H_
// #define _XOPEN_SOURCE
#ifdef __cplusplus
extern "C" {
#endif
#include <stdlib.h>
#include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
int const ALIGNMENT = 32;
typedef __m256d T_vec;
double* InitMatrix(const size_t N);
double* InitDdMatrix(const size_t N);
void PrintVec(double* res);
void PrintMatrix(const int d, double *A);
void block(const int d, double * A, int imin, int imax, int jmin, int jmax);
void choleskyRek(int d, double *A, int i, int j, int t);
double * mallocA64(size_t s);
#ifdef __cplusplus
}
#endif
#endif /* CHOLESKY_H_ */
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment