Skip to content

Instantly share code, notes, and snippets.

@Pan-Maciek
Created November 5, 2021 15:56
Show Gist options
  • Save Pan-Maciek/be80360a80758ea12600b2f78699f553 to your computer and use it in GitHub Desktop.
Save Pan-Maciek/be80360a80758ea12600b2f78699f553 to your computer and use it in GitHub Desktop.
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#define IDX(i, j, n) (((j)+ (i)*(n)))
static double gtod_ref_time_sec = 0.0;
/* Adapted from the bl2_clock() routine in the BLIS library */
double dclock(){
double the_time, norm_sec;
struct timeval tv;
gettimeofday( &tv, NULL );
if ( gtod_ref_time_sec == 0.0 )
gtod_ref_time_sec = ( double ) tv.tv_sec;
norm_sec = ( double ) tv.tv_sec - gtod_ref_time_sec;
the_time = norm_sec + tv.tv_usec * 1.0e-6;
return the_time;
}
int chol(double * A, unsigned int n){
int i, j, k;
for (j = 0; j < n; j++) {
for (i = j; i < n; i++) {
for (k = 0; k < j; k++) {
A[IDX(i, j, n)] -= A[IDX(i, k, n)] * A[IDX(j, k, n)];
}
}
if (A[IDX(j, j, n)] < 0.0) {
return (1);
}
A[IDX(j, j, n)] = sqrt(A[IDX(j, j, n)]);
for (i = j + 1; i < n; i++){
A[IDX(i, j, n)] /= A[IDX(j, j, n)];
}
}
return (0);
}
int main(int argc, char ** argv){
double* A;
double t1, t2;
int i, j, n, ret;
double dtime;
n = atoi(argv[1]);
A = malloc(n * n * sizeof(double));
assert(A != NULL);
for (i = 0; i < n; i++) {
A[IDX(i, i, n)] = 1.0;
}
dtime = dclock();
if (chol(A, n)) {
fprintf(stderr,"Matrix is not symmetric or not positive definite\n");
}
dtime = dclock()-dtime;
printf( "Time: %le \n", dtime);
fflush( stdout );
free(A);
return 0;
}
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#define IDX(i, j, n) (((j)+ (i)*(n)))
static double gtod_ref_time_sec = 0.0;
/* Adapted from the bl2_clock() routine in the BLIS library */
double dclock(){
double the_time, norm_sec;
struct timeval tv;
gettimeofday( &tv, NULL );
if ( gtod_ref_time_sec == 0.0 )
gtod_ref_time_sec = ( double ) tv.tv_sec;
norm_sec = ( double ) tv.tv_sec - gtod_ref_time_sec;
the_time = norm_sec + tv.tv_usec * 1.0e-6;
return the_time;
}
int chol(double * A, unsigned int n){
register int i, j, k;
register double tmp;
for (j = 0; j < n; j++) {
for (i = j; i < n; i++) {
tmp = A[IDX(i, j, n)];
for (k = 0; k < j; k++) {
tmp -= A[IDX(i, k, n)] * A[IDX(j, k, n)];
}
A[IDX(i, j, n)] = tmp;
}
if (A[IDX(j, j, n)] < 0.0) {
return (1);
}
A[IDX(j, j, n)] = tmp = sqrt(A[IDX(j, j, n)]);
for (i = j + 1; i < n; i++){
A[IDX(i, j, n)] /= tmp;
}
}
return (0);
}
int main(int argc, char ** argv){
double* A;
double t1, t2;
int i, j, n, ret;
double dtime;
n = atoi(argv[1]);
A = malloc(n * n * sizeof(double));
assert(A != NULL);
for (i = 0; i < n; i++) {
A[IDX(i, i, n)] = 1.0;
}
dtime = dclock();
if (chol(A, n)) {
fprintf(stderr,"Matrix is not symmetric or not positive definite\n");
}
dtime = dclock()-dtime;
printf( "Time: %le \n", dtime);
fflush( stdout );
free(A);
return 0;
}
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#define IDX(i, j, n) (((j)+ (i)*(n)))
#define max(a, b) ((a) > (b) ? (a) : (b))
static double gtod_ref_time_sec = 0.0;
/* Adapted from the bl2_clock() routine in the BLIS library */
double dclock(){
double the_time, norm_sec;
struct timeval tv;
gettimeofday( &tv, NULL );
if ( gtod_ref_time_sec == 0.0 )
gtod_ref_time_sec = ( double ) tv.tv_sec;
norm_sec = ( double ) tv.tv_sec - gtod_ref_time_sec;
the_time = norm_sec + tv.tv_usec * 1.0e-6;
return the_time;
}
int chol(double * A, unsigned int n){
register int i, j, k;
register double tmp;
for (j = 0; j < n; j++) {
for (i = j; i < n; i++) {
tmp = A[IDX(i, j, n)];
for (k = 0; k < j;) {
if (k < max(j - 8, 0)) {
tmp -= A[IDX(i, k+0, n)] * A[IDX(j, k+0, n)];
tmp -= A[IDX(i, k+1, n)] * A[IDX(j, k+1, n)];
tmp -= A[IDX(i, k+2, n)] * A[IDX(j, k+2, n)];
tmp -= A[IDX(i, k+3, n)] * A[IDX(j, k+3, n)];
tmp -= A[IDX(i, k+4, n)] * A[IDX(j, k+4, n)];
tmp -= A[IDX(i, k+5, n)] * A[IDX(j, k+5, n)];
tmp -= A[IDX(i, k+6, n)] * A[IDX(j, k+6, n)];
tmp -= A[IDX(i, k+7, n)] * A[IDX(j, k+7, n)];
k += 8;
} else {
tmp -= A[IDX(i, k, n)] * A[IDX(j, k, n)];
k += 1;
}
}
A[IDX(i, j, n)] = tmp;
}
if (A[IDX(j, j, n)] < 0.0) {
return (1);
}
A[IDX(j, j, n)] = tmp = sqrt(A[IDX(j, j, n)]);
for (i = j + 1; i < n; i++){
A[IDX(i, j, n)] /= tmp;
}
}
return (0);
}
int main(int argc, char ** argv){
double* A;
double t1, t2;
int i, j, n, ret;
double dtime;
n = atoi(argv[1]);
A = malloc(n * n * sizeof(double));
assert(A != NULL);
for (i = 0; i < n; i++) {
A[IDX(i, i, n)] = 1.0;
}
dtime = dclock();
if (chol(A, n)) {
fprintf(stderr,"Matrix is not symmetric or not positive definite\n");
}
dtime = dclock()-dtime;
printf( "Time: %le \n", dtime);
fflush( stdout );
free(A);
return 0;
}
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <x86intrin.h>
#define IDX(i, j, n) (((j)+ (i)*(n)))
#define max(a, b) ((a) > (b) ? (a) : (b))
static double gtod_ref_time_sec = 0.0;
/* Adapted from the bl2_clock() routine in the BLIS library */
double dclock(){
double the_time, norm_sec;
struct timeval tv;
gettimeofday( &tv, NULL );
if ( gtod_ref_time_sec == 0.0 )
gtod_ref_time_sec = ( double ) tv.tv_sec;
norm_sec = ( double ) tv.tv_sec - gtod_ref_time_sec;
the_time = norm_sec + tv.tv_usec * 1.0e-6;
return the_time;
}
int chol(double * A, unsigned int n){
register int i, j, k;
register double tmp;
register __m128d tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
for (j = 0; j < n; j++) {
for (i = j; i < n; i++) {
tmp = A[IDX(i, j, n)];
for (k = 0; k < j;) {
if (k < max(j - 8, 0)) {
tmp0 = _mm_load_pd(&A[IDX(i, k, n)]);
tmp1 = _mm_load_pd(&A[IDX(j, k, n)]);
tmp2 = _mm_load_pd(&A[IDX(i, k+2, n)]);
tmp3 = _mm_load_pd(&A[IDX(j, k+2, n)]);
tmp4 = _mm_load_pd(&A[IDX(i, k+4, n)]);
tmp5 = _mm_load_pd(&A[IDX(j, k+4, n)]);
tmp6 = _mm_load_pd(&A[IDX(i, k+6, n)]);
tmp7 = _mm_load_pd(&A[IDX(j, k+6, n)]);
tmp0 = _mm_mul_pd(tmp0, tmp1);
tmp2 = _mm_mul_pd(tmp2, tmp3);
tmp4 = _mm_mul_pd(tmp4, tmp5);
tmp6 = _mm_mul_pd(tmp6, tmp7);
tmp0 = _mm_add_pd(tmp0, tmp2);
tmp0 = _mm_add_pd(tmp0, tmp4);
tmp0 = _mm_add_pd(tmp0, tmp6);
tmp -= tmp0[0] + tmp0[1];
k += 8;
} else {
tmp -= A[IDX(i, k, n)] * A[IDX(j, k, n)];
k += 1;
}
}
A[IDX(i, j, n)] = tmp;
}
if (A[IDX(j, j, n)] < 0.0) {
return (1);
}
A[IDX(j, j, n)] = tmp = sqrt(A[IDX(j, j, n)]);
for (i = j + 1; i < n; i++){
A[IDX(i, j, n)] /= tmp;
}
}
return (0);
}
int main(int argc, char ** argv){
double* A;
double t1, t2;
int i, j, n, ret;
double dtime;
n = atoi(argv[1]);
A = malloc(n * n * sizeof(double));
assert(A != NULL);
for (i = 0; i < n; i++) {
A[IDX(i, i, n)] = 1.0;
}
dtime = dclock();
if (chol(A, n)) {
fprintf(stderr,"Matrix is not symmetric or not positive definite\n");
}
dtime = dclock()-dtime;
printf( "Time: %le \n", dtime);
fflush( stdout );
free(A);
return 0;
}
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <x86intrin.h>
#define IDX(i, j, n) (((j)+ (i)*(n)))
#define max(a, b) ((a) > (b) ? (a) : (b))
static double gtod_ref_time_sec = 0.0;
/* Adapted from the bl2_clock() routine in the BLIS library */
double dclock(){
double the_time, norm_sec;
struct timeval tv;
gettimeofday( &tv, NULL );
if ( gtod_ref_time_sec == 0.0 )
gtod_ref_time_sec = ( double ) tv.tv_sec;
norm_sec = ( double ) tv.tv_sec - gtod_ref_time_sec;
the_time = norm_sec + tv.tv_usec * 1.0e-6;
return the_time;
}
int chol(double * A, unsigned int n){
register int i, j, k;
register double tmp;
register __m128d tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
register __m128d tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
for (j = 0; j < n; j++) {
for (i = j; i < n; i++) {
tmp = A[IDX(i, j, n)];
for (k = 0; k < j;) {
if (k < max(j - 16, 0)) {
tmp0 = _mm_load_pd(&A[IDX(i, k, n)]);
tmp1 = _mm_load_pd(&A[IDX(j, k, n)]);
tmp2 = _mm_load_pd(&A[IDX(i, k+2, n)]);
tmp3 = _mm_load_pd(&A[IDX(j, k+2, n)]);
tmp4 = _mm_load_pd(&A[IDX(i, k+4, n)]);
tmp5 = _mm_load_pd(&A[IDX(j, k+4, n)]);
tmp6 = _mm_load_pd(&A[IDX(i, k+6, n)]);
tmp7 = _mm_load_pd(&A[IDX(j, k+6, n)]);
tmp8 = _mm_load_pd(&A[IDX(i, k+8, n)]);
tmp9 = _mm_load_pd(&A[IDX(j, k+8, n)]);
tmp10 = _mm_load_pd(&A[IDX(i, k+10, n)]);
tmp11 = _mm_load_pd(&A[IDX(j, k+10, n)]);
tmp12 = _mm_load_pd(&A[IDX(i, k+12, n)]);
tmp13 = _mm_load_pd(&A[IDX(j, k+12, n)]);
tmp14 = _mm_load_pd(&A[IDX(i, k+14, n)]);
tmp15 = _mm_load_pd(&A[IDX(j, k+14, n)]);
tmp0 = _mm_mul_pd(tmp0, tmp1);
tmp2 = _mm_mul_pd(tmp2, tmp3);
tmp4 = _mm_mul_pd(tmp4, tmp5);
tmp6 = _mm_mul_pd(tmp6, tmp7);
tmp8 = _mm_mul_pd(tmp8, tmp9);
tmp10 = _mm_mul_pd(tmp10, tmp11);
tmp12 = _mm_mul_pd(tmp12, tmp13);
tmp14 = _mm_mul_pd(tmp14, tmp15);
tmp0 = _mm_add_pd(tmp0, tmp2);
tmp0 = _mm_add_pd(tmp0, tmp4);
tmp0 = _mm_add_pd(tmp0, tmp6);
tmp0 = _mm_add_pd(tmp0, tmp8);
tmp0 = _mm_add_pd(tmp0, tmp10);
tmp0 = _mm_add_pd(tmp0, tmp12);
tmp0 = _mm_add_pd(tmp0, tmp14);
tmp -= tmp0[0] + tmp0[1];
k += 16;
} else {
tmp -= A[IDX(i, k, n)] * A[IDX(j, k, n)];
k += 1;
}
}
A[IDX(i, j, n)] = tmp;
}
if (A[IDX(j, j, n)] < 0.0) {
return (1);
}
A[IDX(j, j, n)] = tmp = sqrt(A[IDX(j, j, n)]);
for (i = j + 1; i < n; i++){
A[IDX(i, j, n)] /= tmp;
}
}
return (0);
}
int main(int argc, char ** argv){
double* A;
double t1, t2;
int i, j, n, ret;
double dtime;
n = atoi(argv[1]);
A = malloc(n * n * sizeof(double));
assert(A != NULL);
for (i = 0; i < n; i++) {
A[IDX(i, i, n)] = 1.0;
}
dtime = dclock();
if (chol(A, n)) {
fprintf(stderr,"Matrix is not symmetric or not positive definite\n");
}
dtime = dclock()-dtime;
printf( "Time: %le \n", dtime);
fflush( stdout );
free(A);
return 0;
}
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <immintrin.h>
#define IDX(i, j, n) (((j)+ (i)*(n)))
#define max(a, b) ((a) > (b) ? (a) : (b))
static double gtod_ref_time_sec = 0.0;
/* Adapted from the bl2_clock() routine in the BLIS library */
double dclock(){
double the_time, norm_sec;
struct timeval tv;
gettimeofday( &tv, NULL );
if ( gtod_ref_time_sec == 0.0 )
gtod_ref_time_sec = ( double ) tv.tv_sec;
norm_sec = ( double ) tv.tv_sec - gtod_ref_time_sec;
the_time = norm_sec + tv.tv_usec * 1.0e-6;
return the_time;
}
int chol(double * A, unsigned int n){
register int i, j, k;
register double tmp;
register __m256d tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
for (j = 0; j < n; j++) {
for (i = j; i < n; i++) {
tmp = A[IDX(i, j, n)];
for (k = 0; k < j;) {
if (k < max(j - 16, 0)) {
tmp0 = _mm256_load_pd(&A[IDX(i, k, n)]);
tmp1 = _mm256_load_pd(&A[IDX(j, k, n)]);
tmp2 = _mm256_load_pd(&A[IDX(i, k+4, n)]);
tmp3 = _mm256_load_pd(&A[IDX(j, k+4, n)]);
tmp4 = _mm256_load_pd(&A[IDX(i, k+8, n)]);
tmp5 = _mm256_load_pd(&A[IDX(j, k+8, n)]);
tmp6 = _mm256_load_pd(&A[IDX(i, k+12, n)]);
tmp7 = _mm256_load_pd(&A[IDX(j, k+12, n)]);
tmp0 = _mm256_mul_pd(tmp0, tmp1);
tmp2 = _mm256_mul_pd(tmp2, tmp3);
tmp4 = _mm256_mul_pd(tmp4, tmp5);
tmp6 = _mm256_mul_pd(tmp6, tmp7);
tmp0 = _mm256_add_pd(tmp0, tmp2);
tmp0 = _mm256_add_pd(tmp0, tmp4);
tmp0 = _mm256_add_pd(tmp0, tmp6);
tmp -= tmp0[0] + tmp0[1] + tmp0[2] + tmp0[3];
k += 16;
} else {
tmp -= A[IDX(i, k, n)] * A[IDX(j, k, n)];
k += 1;
}
}
A[IDX(i, j, n)] = tmp;
}
if (A[IDX(j, j, n)] < 0.0) {
return (1);
}
A[IDX(j, j, n)] = tmp = sqrt(A[IDX(j, j, n)]);
for (i = j + 1; i < n; i++){
A[IDX(i, j, n)] /= tmp;
}
}
return (0);
}
int main(int argc, char ** argv){
double* A;
double t1, t2;
int i, j, n, ret;
double dtime;
n = atoi(argv[1]);
A = aligned_alloc(256,n * n * sizeof(double));
assert(A != NULL);
for (i = 0; i < n; i++) {
A[IDX(i, i, n)] = 1.0;
}
dtime = dclock();
if (chol(A, n)) {
fprintf(stderr,"Matrix is not symmetric or not positive definite\n");
}
dtime = dclock()-dtime;
printf( "Time: %le \n", dtime);
fflush( stdout );
free(A);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment