Last active
November 27, 2023 14:27
-
-
Save nevikw39/471179833ab2ee95a847a96b83e3d036 to your computer and use it in GitHub Desktop.
Advanced High Performance Computing Cluster Practice HW2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdio>
#include <iostream>
#include <random>
using namespace std; | |
constexpr int N = 1024; | |
float a[1024][1024], b[1024][1024]; | |
int main() | |
{ | |
random_device rd; | |
mt19937 mt(rd()); | |
uniform_real_distribution<float> urd(0, 1); | |
for (auto &i : a) | |
for (auto &j : i) | |
j = urd(mt); | |
for (auto &i : b) | |
for (auto &j : i) | |
j = urd(mt); | |
FILE *fa = fopen("a.dat", "wb"), *fb = fopen("b.dat", "wb"); | |
fwrite_unlocked(a, sizeof(float), sizeof a / sizeof(float), fa); | |
fwrite_unlocked(b, sizeof(float), sizeof b / sizeof(float), fb); | |
fclose(fa); | |
fclose(fb); | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build rules for the matrix-multiplication variants.
#
# Toolchain: Intel C / C++ compilers.
CC = icc
# NOTE(review): the b/c variants compare their result to c.dat with memcmp,
# so they rely on bit-identical floating-point results across builds --
# presumably why "-fp-model strict" is requested; confirm before changing.
CFLAGS = -O0 -g -fp-model strict -fsanitize=address -xCORE-AVX512
CXX = icpc

BINS = mat_mul_a_naive mat_mul_b_transpose mat_mul_c_block mat_mul_d_avx
SRCS = $(addsuffix .c,$(BINS))
# Input matrices produced by ./gen. (c.dat is not listed here: it is written
# by mat_mul_a_naive when that program is run, and read by the other variants.)
DATS = a.dat b.dat

.PHONY: all clean distclean

all: $(BINS)

# Every binary is built from the .c file of the same name. The data files are
# order-only prerequisites: they must exist, but regenerating them does not
# force a rebuild of the binaries.
%: %.c | $(DATS)
	$(CC) $< -o $@ $(CFLAGS)

# The generator only has to exist (order-only); existing data is never
# overwritten just because gen was rebuilt.
$(DATS): | gen
	./gen

gen: gen.cpp
	$(CXX) $< -o $@

# rm -f stays silent and succeeds when the files are already gone.
clean:
	@rm -f $(BINS)

distclean: clean
	@rm -f *.dat gen
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <string.h> | |
#define N 1024 | |
float a[N][N], b[N][N], c[N][N]; | |
void mat_mul() | |
{ | |
for (int i = 0; i < N; i++) | |
for (int j = 0; j < N; j++) | |
for (int k = 0; k < N; k++) | |
c[i][j] += a[i][k] * b[k][j]; | |
} | |
int main() | |
{ | |
FILE *fa = fopen("a.dat", "rb"), *fb = fopen("b.dat", "rb"), *fc = fopen("c.dat", "wb"); | |
fread_unlocked(a, sizeof(float), sizeof a / sizeof(float), fa); | |
fread_unlocked(b, sizeof(float), sizeof b / sizeof(float), fb); | |
fclose(fa); | |
fclose(fb); | |
for (int i = 0; i < 8; i++) | |
{ | |
memset(c, 0, sizeof c); | |
mat_mul(); | |
} | |
fwrite_unlocked(c, sizeof(float), sizeof c / sizeof(float), fc); | |
fclose(fc); | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <assert.h> | |
#include <stdio.h> | |
#include <string.h> | |
#define N 1024 | |
float a[N][N], b[N][N], c[N][N], cp[N][N]; | |
void transpose() | |
{ | |
for (int i = 0; i < N; i++) | |
for (int j = 0; j < i; j++) | |
{ | |
float tmp = b[i][j]; | |
b[i][j] = b[j][i]; | |
b[j][i] = tmp; | |
} | |
} | |
void mat_mul() | |
{ | |
for (int i = 0; i < N; i++) | |
for (int j = 0; j < N; j++) | |
for (int k = 0; k < N; k++) | |
c[i][j] += a[i][k] * b[j][k]; | |
} | |
int main() | |
{ | |
FILE *fa = fopen("a.dat", "rb"), *fb = fopen("b.dat", "rb"), *fc = fopen("c.dat", "rb"); | |
fread_unlocked(a, sizeof(float), sizeof a / sizeof(float), fa); | |
fread_unlocked(b, sizeof(float), sizeof b / sizeof(float), fb); | |
fread_unlocked(cp, sizeof(float), sizeof cp / sizeof(float), fc); | |
fclose(fa); | |
fclose(fb); | |
fclose(fc); | |
transpose(); | |
for (int i = 0; i < 8; i++) | |
{ | |
memset(c, 0, sizeof c); | |
mat_mul(); | |
} | |
assert(!memcmp(c, cp, sizeof c)); | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <assert.h> | |
#include <stdio.h> | |
#include <string.h> | |
#define N 1024 | |
#define BLOCK 8 | |
float a[N][N], b[N][N], c[N][N], cp[N][N]; | |
void transpose() | |
{ | |
for (int i = 0; i < N; i++) | |
for (int j = 0; j < i; j++) | |
{ | |
float tmp = b[i][j]; | |
b[i][j] = b[j][i]; | |
b[j][i] = tmp; | |
} | |
} | |
void mat_mul() | |
{ | |
for (int bi = 0; bi < N; bi += BLOCK) | |
for (int bj = 0; bj < N; bj += BLOCK) | |
for (int bk = 0; bk < N; bk += BLOCK) | |
for (int i = bi; i < bi + BLOCK; i++) | |
for (int j = bj; j < bj + BLOCK; j++) | |
for (int k = bk; k < bk + BLOCK; k++) | |
c[i][j] += a[i][k] * b[j][k]; | |
} | |
int main() | |
{ | |
FILE *fa = fopen("a.dat", "rb"), *fb = fopen("b.dat", "rb"), *fc = fopen("c.dat", "rb"); | |
fread_unlocked(a, sizeof(float), sizeof a / sizeof(float), fa); | |
fread_unlocked(b, sizeof(float), sizeof b / sizeof(float), fb); | |
fread_unlocked(cp, sizeof(float), sizeof cp / sizeof(float), fc); | |
fclose(fa); | |
fclose(fb); | |
fclose(fc); | |
transpose(); | |
for (int i = 0; i < 8; i++) | |
{ | |
memset(c, 0, sizeof c); | |
mat_mul(); | |
} | |
assert(!memcmp(c, cp, sizeof c)); | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <assert.h> | |
#include <immintrin.h> | |
#include <math.h> | |
#include <stdio.h> | |
#include <string.h> | |
#define N 1024 | |
#define BLOCK 8 | |
#define EPS 2.5e-6f | |
float a[N * N], b[N * N], c[N * N], cp[N][N]; | |
void transpose() | |
{ | |
for (int i = 0; i < N; i++) | |
for (int j = 0; j < i; j++) | |
{ | |
float tmp = b[i * N + j]; | |
b[i * N + j] = b[j * N + i]; | |
b[j * N + i] = tmp; | |
} | |
} | |
static inline float reduce_sum(const __m256 *const x) | |
{ | |
// static float arr[8]; | |
// _mm256_store_ps(arr, x); | |
// return *arr + 1[arr] + 2[arr] + 3[arr] + 4[arr] + 5[arr] + 6[arr] + 7[arr]; | |
const __m256 t0 = _mm256_hadd_ps(*x, *x), t1 = _mm256_hadd_ps(t0, t0); | |
const __m128 t2 = _mm256_extractf128_ps(t1, 1), t3 = _mm_add_ss(_mm256_castps256_ps128(t1), t2); | |
return _mm_cvtss_f32(t3); | |
} | |
void mat_mul() | |
{ | |
for (int bi = 0; bi < N; bi += BLOCK) | |
for (int bj = 0; bj < N; bj += BLOCK) | |
for (int bk = 0; bk < N; bk += BLOCK) | |
for (int i = bi; i < bi + BLOCK; i++) | |
for (int j = bj; j < bj + BLOCK; j++) | |
{ | |
const __m256 dot = _mm256_mul_ps(_mm256_load_ps(a + i * N + bk), _mm256_load_ps(b + j * N + bk)); | |
c[i * N + j] += reduce_sum(&dot); | |
} | |
} | |
int main() | |
{ | |
FILE *fa = fopen("a.dat", "rb"), *fb = fopen("b.dat", "rb"), *fc = fopen("c.dat", "rb"); | |
fread_unlocked(a, sizeof(float), sizeof a / sizeof(float), fa); | |
fread_unlocked(b, sizeof(float), sizeof b / sizeof(float), fb); | |
fread_unlocked(cp, sizeof(float), sizeof cp / sizeof(float), fc); | |
fclose(fa); | |
fclose(fb); | |
fclose(fc); | |
transpose(); | |
for (int i = 0; i < 8; i++) | |
{ | |
memset(c, 0, sizeof c); | |
mat_mul(); | |
} | |
for (int i = 0; i < N; i++) | |
for (int j = 0; j < i; j++) | |
if (fabsf(c[i * N + j] - cp[i][j]) >= (fabsf(c[i * N + j]) < fabsf(cp[i][j]) ? fabsf(c[i * N + j]) : fabsf(cp[i][j])) * EPS) | |
{ | |
fprintf(stderr, "Exceed epsilon!! (%d, %d): %f %f\n", i, j, c[i * N + j], cp[i][j]); | |
assert(0); | |
} | |
if (memcmp(c, cp, sizeof c)) | |
fprintf(stderr, "Minor errors.\n"); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment