Skip to content

Instantly share code, notes, and snippets.

@mgopshtein
Last active May 27, 2018 02:47
Show Gist options
  • Save mgopshtein/4c263ee25e44f485f1976ecdc8697059 to your computer and use it in GitHub Desktop.
Save mgopshtein/4c263ee25e44f485f1976ecdc8697059 to your computer and use it in GitHub Desktop.
Performance: matrix multiply
#include <iostream>
#include <opencv2/core.hpp>
#include <chrono>
//#define EIGEN_USE_MKL_ALL
//#define MULTIPLY_ELEMENTWISE
#include <Eigen/Core>
// Side length, in elements, of the square matrices used by this benchmark
// (1 << 10 == 1024).
constexpr auto DIMS = 1 << 10;
// Overwrites the contents of `m` in place with random values produced by
// cv::randu, using 0.0f and 1.0f as the lower and upper bounds.
void randomInit(cv::Mat &m) {
    const float lowerBound = 0.0f;
    const float upperBound = 1.0f;
    cv::randu(m, lowerBound, upperBound);
}
// Brings the std::chrono names used in this file (microseconds,
// high_resolution_clock, duration_cast) into scope.
using namespace std::chrono;
// GPU multiply of `a` by `b` into `c` through cuBLAS. The returned pair is
// {time of the device computation only, time including host<->device copies}.
std::pair<microseconds, microseconds> multiplyWithCublas(const cv::Mat& a, const cv::Mat& b, cv::Mat& c);
// NOTE(review): declared but neither defined nor called in the visible source;
// presumably a hand-written kernel variant living in another file — confirm or remove.
std::pair<microseconds, microseconds> custommultiply(const cv::Mat& a, const cv::Mat& b, cv::Mat& c);
void heatOpenCV() {
cv::Mat tmp1{ 2, 3, CV_32F };
cv::Mat tmp2{ 2, 3, CV_32F };
tmp1 * tmp2;
}
// Multiplies `a` by `b` with Eigen and stores the result in `c`.
//
// By default this is the matrix product a * b; when MULTIPLY_ELEMENTWISE is
// defined it is the element-by-element product instead.
//
// Preconditions (checked with CV_Assert, which throws cv::Exception):
//   - `a` and `b` are single-channel CV_32F and continuous in memory;
//   - matrix product: a.cols == b.rows; element-wise: a and b have equal size.
// `c` is (re)allocated via cv::Mat::create if it does not already have the
// required size and type; an existing matching buffer is reused.
//
// cv::Mat stores rows contiguously, so the Eigen maps are declared RowMajor.
// Eigen's default column-major layout would reinterpret each buffer as its
// transpose and the value written to `c` would be b * a rather than a * b.
void multiplyEigen(const cv::Mat& a, const cv::Mat& b, cv::Mat& c) {
    using RowMajorMat =
        Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;

    CV_Assert(a.type() == CV_32FC1 && b.type() == CV_32FC1);
    CV_Assert(a.isContinuous() && b.isContinuous());
#ifndef MULTIPLY_ELEMENTWISE
    CV_Assert(a.cols == b.rows);
    const int outRows = a.rows;
    const int outCols = b.cols;
#else
    CV_Assert(a.rows == b.rows && a.cols == b.cols);
    const int outRows = a.rows;
    const int outCols = a.cols;
#endif

    // No-op when `c` already has this shape and type.
    c.create(outRows, outCols, CV_32F);
    // A pre-sized ROI view would survive create() but cannot be mapped densely.
    CV_Assert(c.isContinuous());

    Eigen::Map<const RowMajorMat> ma(a.ptr<float>(), a.rows, a.cols);
    Eigen::Map<const RowMajorMat> mb(b.ptr<float>(), b.rows, b.cols);
    Eigen::Map<RowMajorMat> mc(c.ptr<float>(), outRows, outCols);

#ifndef MULTIPLY_ELEMENTWISE
    // Plain assignment (no .noalias()) keeps this safe if `c` shares storage
    // with `a` or `b`, at the cost of Eigen's internal temporary.
    mc = ma * mb;
#else
    mc = ma.array() * mb.array();
#endif
}
int main() {
//main11();
//return 11;
heatOpenCV();
const cv::Size size{ DIMS, DIMS };
cv::Mat m1{ size, CV_32F };
cv::Mat m2{ size, CV_32F };
randomInit(m1);
randomInit(m2);
cv::Mat m3{ size, CV_32F };
cv::Mat mtmp = m1 * m2;
auto measureMsec = [&](auto &f) {
auto ts1 = high_resolution_clock::now();
f();
auto ts2 = high_resolution_clock::now();
return duration_cast<microseconds>(ts2 - ts1).count();
};
long long total = 0;
for (auto i = 0; i < 10; i++) {
#ifndef MULTIPLY_ELEMENTWISE
auto msecOpenCV = measureMsec([&] {m3 = m1 * m2; });
#else
auto msecOpenCV = measureMsec([&] {m3 = m1.mul(m2);});
#endif
total += msecOpenCV;
std::cout << m3.at<float>(2, 2) << ": Open CV multiply took " << msecOpenCV << " micros\n";
}
std::cout << (total/10) << '\n';
multiplyEigen(m1, m2, m3);
m1.copyTo(m3);
total = 0;
for (auto i = 0; i < 10; i++) {
auto msecEigen = measureMsec([&] {multiplyEigen(m1, m2, m3); });
total += msecEigen;
std::cout << m3.at<float>(2, 2) << ": Eigen multiply took " << msecEigen << " micros\n";
}
std::cout << (total/10) << '\n';
#ifndef MULTIPLY_ELEMENTWISE
multiplyWithCublas(m1, m2, m3);
randomInit(m3);
for (auto i = 0; i < 10; i++) {
auto msceCublas = multiplyWithCublas(m1, m2, m3);
std::cout << m3.at<float>(2, 2) << ": CUBLAS multiply (no copy) took " << msceCublas.first.count() << " micros, with copy: " << msceCublas.second.count() << " micros\n";
}
#endif
return 0;
}
#include <opencv2/core.hpp>
#include <chrono>
#include <cuda_runtime.h>
#include <cublas_v2.h>
std::pair<std::chrono::microseconds, std::chrono::microseconds> multiplyWithCublas(const cv::Mat& a, const cv::Mat& b, cv::Mat& c) {
cublasHandle_t handle;
cublasStatus_t stat = cublasCreate(&handle);
cudaError_t cudaStat;
float* devPtrA;
float* devPtrB;
float* devPtrC;
cudaStat = cudaMalloc((void**)&devPtrA, a.dataend - a.datastart);
cudaStat = cudaMalloc((void**)&devPtrB, b.dataend - b.datastart);
cudaStat = cudaMalloc((void**)&devPtrC, c.dataend - c.datastart);
cudaStream_t streamId;
cublasGetStream(handle, &streamId);
auto ts1c = std::chrono::high_resolution_clock::now();
stat = cublasSetMatrix(a.rows, a.cols, sizeof(float), a.datastart, a.cols, devPtrA, a.cols);
stat = cublasSetMatrix(b.rows, b.cols, sizeof(float), b.datastart, b.cols, devPtrB, b.cols);
auto ts1 = std::chrono::high_resolution_clock::now();
// do the actual multiplication!
float alpha = 1.0f;
float beta = 0.0f;
stat = cublasSsymm(
handle,
CUBLAS_SIDE_LEFT,
CUBLAS_FILL_MODE_UPPER,
a.rows, a.cols,
&alpha,
devPtrA, a.cols,
devPtrB, b.cols,
&beta,
devPtrC, c.cols
);
cudaStreamSynchronize(streamId);
auto ts2 = std::chrono::high_resolution_clock::now();
stat = cublasGetMatrix(c.rows, c.cols, sizeof(float), devPtrC, c.cols, c.ptr<float>(), c.cols);
auto ts2c = std::chrono::high_resolution_clock::now();
cudaFree(devPtrA);
cudaFree(devPtrB);
cudaFree(devPtrC);
cublasDestroy(handle);
return {
std::chrono::duration_cast<std::chrono::microseconds>(ts2 - ts1),
std::chrono::duration_cast<std::chrono::microseconds>(ts2c - ts1c)
};
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment