Created
February 26, 2022 00:01
-
-
Save SharanSMenon/d4c35902680844273106cd7b6f96fa79 to your computer and use it in GitHub Desktop.
SGEMM with the Accelerate Framework in C++ (macOS)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h>
#include <stdlib.h>

#include <chrono>
#include <iostream>
#include <random>
#include <vector>

#include <Accelerate/Accelerate.h>
/// Fills the first `n` elements of `a` with whole-number values in [0, 99].
///
/// Each element is set to `rand() % 100` converted to float. This function does
/// not seed the generator itself.
///
/// @param a Destination buffer; must hold at least `n` floats. A null pointer
///          is tolerated and leaves nothing written.
/// @param n Number of elements to write. Zero or negative writes nothing.
void fill(float *a, int n) {
    // Nothing to do for a missing buffer or an empty/negative count.
    if (a == nullptr || n <= 0) {
        return;
    }
    for (int idx = 0; idx < n; ++idx) {
        // Explicit conversion: rand() yields an int, the matrix stores floats.
        a[idx] = static_cast<float>(rand() % 100);
    }
}
int main() { | |
std::cout << "Hello, World!" << std::endl; | |
CBLAS_ORDER order = CblasRowMajor; | |
CBLAS_TRANSPOSE trans = CblasNoTrans; | |
int i = 600; | |
int j = 600; | |
int k = 600; | |
float* A = (float*)malloc(sizeof(float) * (i*j)); | |
float* B = (float*)malloc(sizeof(float) * (j*k)); | |
float* C = (float*)malloc(sizeof(float) * (i*k)); | |
fill(A, i*j); | |
fill(B, j*k); | |
// std::copy(A, A + i*j, std::ostream_iterator<int>(std::cout, " ")); | |
// std::copy(B, B + j*k, std::ostream_iterator<int>(std::cout, " ")); | |
float ALPHA = 1.0; | |
float BETA = 0.0; | |
int iterations = 25; | |
auto start = std::chrono::high_resolution_clock::now(); | |
for (int m = 0; m < iterations; m++) { | |
cblas_sgemm(order, CblasNoTrans, CblasNoTrans, i, j, k, ALPHA, A, j, B, k, BETA, C, k); | |
} | |
auto stop = std::chrono::high_resolution_clock::now(); | |
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(stop - start); | |
std::cout << duration.count() << " microseconds" << std::endl; | |
// std::cout << n <<std::endl; | |
// std::copy(C, C + i*k, | |
// std::ostream_iterator<int>(std::cout, " ")); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Sorry for some of the bad code here; I am not a C++ expert, and some of this code can probably be optimized to run around 10x faster (especially the matrix creation).