Skip to content

Instantly share code, notes, and snippets.

@nevikw39
Last active November 27, 2023 14:27
Show Gist options
  • Save nevikw39/471179833ab2ee95a847a96b83e3d036 to your computer and use it in GitHub Desktop.
Save nevikw39/471179833ab2ee95a847a96b83e3d036 to your computer and use it in GitHub Desktop.
Advanced High Performance Computing Cluster Practice HW2
#include <iostream>
#include <random>
using namespace std;
// Side length of the generated square matrices.
constexpr int N = 1024;
// The two N x N matrices that get filled and dumped (file-scope storage).
float a[N][N], b[N][N];

// Fills `a`, then `b`, with pseudo-random floats drawn from
// uniform_real_distribution<float>(0, 1) and writes each matrix as raw floats
// to "a.dat" / "b.dat".
// Returns 0 on success and 1 if either file cannot be created or fully written.
int main()
{
    random_device rd;
    mt19937 mt(rd());
    uniform_real_distribution<float> urd(0, 1);
    for (auto &row : a)
        for (auto &x : row)
            x = urd(mt);
    for (auto &row : b)
        for (auto &x : row)
            x = urd(mt);

    FILE *fa = fopen("a.dat", "wb"), *fb = fopen("b.dat", "wb");
    if (!fa || !fb)
    {
        perror("fopen");
        if (fa)
            fclose(fa);
        if (fb)
            fclose(fb);
        return 1;
    }

    // Standard fwrite: one bulk call per file, so the glibc-only unlocked
    // variant buys nothing in this single-threaded program.
    constexpr size_t count = sizeof a / sizeof(float);
    bool ok = fwrite(a, sizeof(float), count, fa) == count;
    ok = (fwrite(b, sizeof(float), count, fb) == count) && ok;
    // fclose flushes buffered data, so its result matters for output streams.
    ok = (fclose(fa) == 0) && ok;
    ok = (fclose(fb) == 0) && ok;
    if (!ok)
    {
        fprintf(stderr, "failed to write a.dat / b.dat completely\n");
        return 1;
    }
    return 0;
}
# NOTE(review): recipe lines appear below without a leading tab; make requires
# one — confirm against the real Makefile.

# Toolchain: Intel C compiler for the benchmarks, Intel C++ compiler for the generator.
CC = icc
# Benchmark flags: unoptimised (-O0) with debug info, strict floating-point
# model, AddressSanitizer instrumentation and AVX-512 code generation.
CFLAGS = -O0 -g -fp-model strict -fsanitize=address -xCORE-AVX512
CXX = icpc
# One executable per matrix-multiplication variant, each built from the .c file
# of the same name.
BINS = mat_mul_a_naive mat_mul_b_transpose mat_mul_c_block mat_mul_d_avx
# Source list derived from BINS (not referenced by any rule below).
SRCS = $(addsuffix .c,$(BINS))
# Input matrices produced by running ./gen.
DATS = a.dat b.dat
.PHONY: all clean distclean
# Default goal: build every benchmark.
all: $(BINS)
# Build a benchmark from its source. The data files are order-only
# prerequisites: they must exist first, but a newer data file does not force a
# rebuild of the binary.
%: %.c | $(DATS)
$(CC) $< -o $@ $(CFLAGS)
# Create the data files by running the generator (order-only dependency on gen).
$(DATS): | gen
./gen
# Build the generator; CFLAGS is not applied here.
gen: gen.cpp
$(CXX) $< -o $@
# Remove the benchmark binaries ('@' hides the command, '-' ignores rm failures).
clean:
@-rm $(BINS)
# Additionally remove every .dat file and the generator binary.
distclean: clean
@-rm *.dat gen
#include <stdio.h>
#include <string.h>
/* Side length of the square matrices. */
#define N 1024
/* Operands a and b and the product c; file-scope objects, so zero-initialised at start-up. */
float a[N][N], b[N][N], c[N][N];
/*
 * Naive triple-loop matrix product over the globals: for every (row, col) the
 * dot product of row `row` of a and column `col` of b is added onto c[row][col].
 * c is accumulated into, not overwritten, so the caller clears it beforehand.
 */
void mat_mul()
{
    for (int row = 0; row < N; ++row) {
        for (int col = 0; col < N; ++col) {
            float *const out = &c[row][col];
            for (int inner = 0; inner < N; ++inner) {
                *out += a[row][inner] * b[inner][col];
            }
        }
    }
}
int main()
{
FILE *fa = fopen("a.dat", "rb"), *fb = fopen("b.dat", "rb"), *fc = fopen("c.dat", "wb");
fread_unlocked(a, sizeof(float), sizeof a / sizeof(float), fa);
fread_unlocked(b, sizeof(float), sizeof b / sizeof(float), fb);
fclose(fa);
fclose(fb);
for (int i = 0; i < 8; i++)
{
memset(c, 0, sizeof c);
mat_mul();
}
fwrite_unlocked(c, sizeof(float), sizeof c / sizeof(float), fc);
fclose(fc);
return 0;
}
#include <assert.h>
#include <stdio.h>
#include <string.h>
/* Side length of the square matrices. */
#define N 1024
/*
 * a, b: operands (b is transposed in place before multiplying);
 * c: product computed by this program; cp: matrix read from c.dat that c is compared against.
 */
float a[N][N], b[N][N], c[N][N], cp[N][N];
/*
 * Transposes the global matrix b in place by swapping every element below the
 * diagonal with its mirror image above it.
 */
void transpose()
{
    /* Row 0 has nothing below the diagonal, so the walk can start at row 1. */
    for (int r = 1; r < N; ++r) {
        for (int col = 0; col < r; ++col) {
            const float upper = b[col][r];
            b[col][r] = b[r][col];
            b[r][col] = upper;
        }
    }
}
/*
 * Matrix product for an already transposed b: c[row][col] accumulates the dot
 * product of row `row` of a with row `col` of b, so both operands are walked
 * along their rows. c is added to, not overwritten; the caller clears it first.
 */
void mat_mul()
{
    for (int row = 0; row < N; ++row) {
        const float *const lhs = a[row];
        for (int col = 0; col < N; ++col) {
            const float *const rhs = b[col];
            for (int k = 0; k < N; ++k) {
                c[row][col] += lhs[k] * rhs[k];
            }
        }
    }
}
int main()
{
FILE *fa = fopen("a.dat", "rb"), *fb = fopen("b.dat", "rb"), *fc = fopen("c.dat", "rb");
fread_unlocked(a, sizeof(float), sizeof a / sizeof(float), fa);
fread_unlocked(b, sizeof(float), sizeof b / sizeof(float), fb);
fread_unlocked(cp, sizeof(float), sizeof cp / sizeof(float), fc);
fclose(fa);
fclose(fb);
fclose(fc);
transpose();
for (int i = 0; i < 8; i++)
{
memset(c, 0, sizeof c);
mat_mul();
}
assert(!memcmp(c, cp, sizeof c));
return 0;
}
#include <assert.h>
#include <stdio.h>
#include <string.h>
/* Side length of the square matrices. */
#define N 1024
/* Tile edge used by the blocked multiplication; the loops step by BLOCK up to N. */
#define BLOCK 8
/*
 * a, b: operands (b is transposed in place before multiplying);
 * c: product computed by this program; cp: matrix read from c.dat that c is compared against.
 */
float a[N][N], b[N][N], c[N][N], cp[N][N];
/*
 * In-place transpose of the global matrix b: each element below the diagonal
 * is exchanged with the element mirrored across the diagonal.
 */
void transpose()
{
    for (int i = 0; i < N; i++) {
        float *const row = b[i];
        for (int j = 0; j < i; j++) {
            const float below = row[j];
            row[j] = b[j][i];
            b[j][i] = below;
        }
    }
}
/*
 * Blocked (tiled) matrix product for an already transposed b. The three outer
 * loops pick a BLOCK x BLOCK x BLOCK tile, the three inner loops accumulate
 * that tile's contribution into c. For a fixed (i, j) the k index still
 * advances from 0 to N - 1 in increasing order.
 * The tile bounds are not clamped to N, which is fine for the N and BLOCK
 * defined in this file (1024 and 8).
 * c is added to, not overwritten; the caller clears it first.
 */
void mat_mul()
{
    for (int i0 = 0; i0 < N; i0 += BLOCK) {
        const int i1 = i0 + BLOCK;
        for (int j0 = 0; j0 < N; j0 += BLOCK) {
            const int j1 = j0 + BLOCK;
            for (int k0 = 0; k0 < N; k0 += BLOCK) {
                const int k1 = k0 + BLOCK;
                for (int i = i0; i < i1; ++i) {
                    for (int j = j0; j < j1; ++j) {
                        for (int k = k0; k < k1; ++k) {
                            c[i][j] += a[i][k] * b[j][k];
                        }
                    }
                }
            }
        }
    }
}
int main()
{
FILE *fa = fopen("a.dat", "rb"), *fb = fopen("b.dat", "rb"), *fc = fopen("c.dat", "rb");
fread_unlocked(a, sizeof(float), sizeof a / sizeof(float), fa);
fread_unlocked(b, sizeof(float), sizeof b / sizeof(float), fb);
fread_unlocked(cp, sizeof(float), sizeof cp / sizeof(float), fc);
fclose(fa);
fclose(fb);
fclose(fc);
transpose();
for (int i = 0; i < 8; i++)
{
memset(c, 0, sizeof c);
mat_mul();
}
assert(!memcmp(c, cp, sizeof c));
return 0;
}
#include <assert.h>
#include <immintrin.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
/* Side length of the square matrices. */
#define N 1024
/*
 * Tile edge of the blocked loops. The k direction is consumed one __m256
 * (8 floats) per BLOCK step, so this value has to stay at 8.
 */
#define BLOCK 8
/* Relative tolerance used when comparing the result against the reference in main. */
#define EPS 2.5e-6f
/*
 * a, b, c are flattened: element (i, j) lives at index i * N + j.
 * cp keeps the 2-D layout and receives the reference matrix read from c.dat.
 */
float a[N * N], b[N * N], c[N * N], cp[N][N];
/*
 * In-place transpose of the flattened N x N matrix b: the element at
 * (r, col) below the diagonal is swapped with the one at (col, r).
 */
void transpose()
{
    for (int r = 0; r < N; ++r) {
        for (int col = 0; col < r; ++col) {
            float *const lower = &b[r * N + col];
            float *const upper = &b[col * N + r];
            const float saved = *lower;
            *lower = *upper;
            *upper = saved;
        }
    }
}
/*
 * Horizontal sum of the eight floats in *x.
 * Two hadd passes reduce each 128-bit half to a single sum, then the low and
 * high halves are added: ((x0+x1)+(x2+x3)) + ((x4+x5)+(x6+x7)).
 */
static inline float reduce_sum(const __m256 *const x)
{
    const __m256 v = *x;
    /* Per 128-bit half: sums of adjacent pairs. */
    const __m256 pairs = _mm256_hadd_ps(v, v);
    /* Per 128-bit half: element 0 now holds the sum of that half's four floats. */
    const __m256 quads = _mm256_hadd_ps(pairs, pairs);
    const __m128 hi = _mm256_extractf128_ps(quads, 1);
    const __m128 lo = _mm256_castps256_ps128(quads);
    return _mm_cvtss_f32(_mm_add_ss(lo, hi));
}
/*
 * Blocked matrix product over the flattened globals, for an already
 * transposed b. For each tile, the 8-float segment of row i of a starting at
 * column bk is multiplied element-wise with the matching segment of row j of
 * b, and the horizontal sum of the products is added to c[i * N + j].
 *
 * One __m256 covers exactly one BLOCK step in k, so BLOCK has to be 8.
 * c is added to, not overwritten; the caller clears it first.
 */
void mat_mul()
{
    for (int bi = 0; bi < N; bi += BLOCK)
        for (int bj = 0; bj < N; bj += BLOCK)
            for (int bk = 0; bk < N; bk += BLOCK)
                for (int i = bi; i < bi + BLOCK; i++)
                {
                    /*
                     * Unaligned loads: a and b are plain float arrays, so
                     * nothing guarantees the 32-byte alignment that
                     * _mm256_load_ps requires.
                     * The a segment does not depend on j, so load it once per i.
                     */
                    const __m256 lhs = _mm256_loadu_ps(a + i * N + bk);
                    for (int j = bj; j < bj + BLOCK; j++)
                    {
                        const __m256 dot = _mm256_mul_ps(lhs, _mm256_loadu_ps(b + j * N + bk));
                        c[i * N + j] += reduce_sum(&dot);
                    }
                }
}
int main()
{
FILE *fa = fopen("a.dat", "rb"), *fb = fopen("b.dat", "rb"), *fc = fopen("c.dat", "rb");
fread_unlocked(a, sizeof(float), sizeof a / sizeof(float), fa);
fread_unlocked(b, sizeof(float), sizeof b / sizeof(float), fb);
fread_unlocked(cp, sizeof(float), sizeof cp / sizeof(float), fc);
fclose(fa);
fclose(fb);
fclose(fc);
transpose();
for (int i = 0; i < 8; i++)
{
memset(c, 0, sizeof c);
mat_mul();
}
for (int i = 0; i < N; i++)
for (int j = 0; j < i; j++)
if (fabsf(c[i * N + j] - cp[i][j]) >= (fabsf(c[i * N + j]) < fabsf(cp[i][j]) ? fabsf(c[i * N + j]) : fabsf(cp[i][j])) * EPS)
{
fprintf(stderr, "Exceed epsilon!! (%d, %d): %f %f\n", i, j, c[i * N + j], cp[i][j]);
assert(0);
}
if (memcmp(c, cp, sizeof c))
fprintf(stderr, "Minor errors.\n");
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment