Last active
May 27, 2018 02:47
-
-
Save mgopshtein/4c263ee25e44f485f1976ecdc8697059 to your computer and use it in GitHub Desktop.
Performance: matrix multiply
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <iostream> | |
| #include <opencv2/core.hpp> | |
| #include <chrono> | |
| //#define EIGEN_USE_MKL_ALL | |
| //#define MULTIPLY_ELEMENTWISE | |
| #include <Eigen/Core> | |
| constexpr auto DIMS = 1 << 10; | |
| void randomInit(cv::Mat &m) { | |
| cv::randu(m, 0.0f, 1.0f); | |
| } | |
| using namespace std::chrono; | |
| std::pair<microseconds, microseconds> multiplyWithCublas(const cv::Mat& a, const cv::Mat& b, cv::Mat& c); | |
| std::pair<microseconds, microseconds> custommultiply(const cv::Mat& a, const cv::Mat& b, cv::Mat& c); | |
| void heatOpenCV() { | |
| cv::Mat tmp1{ 2, 3, CV_32F }; | |
| cv::Mat tmp2{ 2, 3, CV_32F }; | |
| tmp1 * tmp2; | |
| } | |
| void multiplyEigen(const cv::Mat& a, const cv::Mat& b, cv::Mat& c) { | |
| using EigenMatType = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>; | |
| Eigen::Map<const EigenMatType> ma(a.ptr<float>(), DIMS, DIMS); | |
| Eigen::Map<const EigenMatType> mb(b.ptr<float>(), DIMS, DIMS); | |
| Eigen::Map<EigenMatType> mc(c.ptr<float>(), DIMS, DIMS); | |
| #ifndef MULTIPLY_ELEMENTWISE | |
| mc = ma * mb; | |
| #else | |
| mc = ma.array() * mb.array(); | |
| #endif | |
| } | |
| int main() { | |
| //main11(); | |
| //return 11; | |
| heatOpenCV(); | |
| const cv::Size size{ DIMS, DIMS }; | |
| cv::Mat m1{ size, CV_32F }; | |
| cv::Mat m2{ size, CV_32F }; | |
| randomInit(m1); | |
| randomInit(m2); | |
| cv::Mat m3{ size, CV_32F }; | |
| cv::Mat mtmp = m1 * m2; | |
| auto measureMsec = [&](auto &f) { | |
| auto ts1 = high_resolution_clock::now(); | |
| f(); | |
| auto ts2 = high_resolution_clock::now(); | |
| return duration_cast<microseconds>(ts2 - ts1).count(); | |
| }; | |
| long long total = 0; | |
| for (auto i = 0; i < 10; i++) { | |
| #ifndef MULTIPLY_ELEMENTWISE | |
| auto msecOpenCV = measureMsec([&] {m3 = m1 * m2; }); | |
| #else | |
| auto msecOpenCV = measureMsec([&] {m3 = m1.mul(m2);}); | |
| #endif | |
| total += msecOpenCV; | |
| std::cout << m3.at<float>(2, 2) << ": Open CV multiply took " << msecOpenCV << " micros\n"; | |
| } | |
| std::cout << (total/10) << '\n'; | |
| multiplyEigen(m1, m2, m3); | |
| m1.copyTo(m3); | |
| total = 0; | |
| for (auto i = 0; i < 10; i++) { | |
| auto msecEigen = measureMsec([&] {multiplyEigen(m1, m2, m3); }); | |
| total += msecEigen; | |
| std::cout << m3.at<float>(2, 2) << ": Eigen multiply took " << msecEigen << " micros\n"; | |
| } | |
| std::cout << (total/10) << '\n'; | |
| #ifndef MULTIPLY_ELEMENTWISE | |
| multiplyWithCublas(m1, m2, m3); | |
| randomInit(m3); | |
| for (auto i = 0; i < 10; i++) { | |
| auto msceCublas = multiplyWithCublas(m1, m2, m3); | |
| std::cout << m3.at<float>(2, 2) << ": CUBLAS multiply (no copy) took " << msceCublas.first.count() << " micros, with copy: " << msceCublas.second.count() << " micros\n"; | |
| } | |
| #endif | |
| return 0; | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <opencv2/core.hpp> | |
| #include <chrono> | |
| #include <cuda_runtime.h> | |
| #include <cublas_v2.h> | |
| std::pair<std::chrono::microseconds, std::chrono::microseconds> multiplyWithCublas(const cv::Mat& a, const cv::Mat& b, cv::Mat& c) { | |
| cublasHandle_t handle; | |
| cublasStatus_t stat = cublasCreate(&handle); | |
| cudaError_t cudaStat; | |
| float* devPtrA; | |
| float* devPtrB; | |
| float* devPtrC; | |
| cudaStat = cudaMalloc((void**)&devPtrA, a.dataend - a.datastart); | |
| cudaStat = cudaMalloc((void**)&devPtrB, b.dataend - b.datastart); | |
| cudaStat = cudaMalloc((void**)&devPtrC, c.dataend - c.datastart); | |
| cudaStream_t streamId; | |
| cublasGetStream(handle, &streamId); | |
| auto ts1c = std::chrono::high_resolution_clock::now(); | |
| stat = cublasSetMatrix(a.rows, a.cols, sizeof(float), a.datastart, a.cols, devPtrA, a.cols); | |
| stat = cublasSetMatrix(b.rows, b.cols, sizeof(float), b.datastart, b.cols, devPtrB, b.cols); | |
| auto ts1 = std::chrono::high_resolution_clock::now(); | |
| // do the actual multiplication! | |
| float alpha = 1.0f; | |
| float beta = 0.0f; | |
| stat = cublasSsymm( | |
| handle, | |
| CUBLAS_SIDE_LEFT, | |
| CUBLAS_FILL_MODE_UPPER, | |
| a.rows, a.cols, | |
| &alpha, | |
| devPtrA, a.cols, | |
| devPtrB, b.cols, | |
| &beta, | |
| devPtrC, c.cols | |
| ); | |
| cudaStreamSynchronize(streamId); | |
| auto ts2 = std::chrono::high_resolution_clock::now(); | |
| stat = cublasGetMatrix(c.rows, c.cols, sizeof(float), devPtrC, c.cols, c.ptr<float>(), c.cols); | |
| auto ts2c = std::chrono::high_resolution_clock::now(); | |
| cudaFree(devPtrA); | |
| cudaFree(devPtrB); | |
| cudaFree(devPtrC); | |
| cublasDestroy(handle); | |
| return { | |
| std::chrono::duration_cast<std::chrono::microseconds>(ts2 - ts1), | |
| std::chrono::duration_cast<std::chrono::microseconds>(ts2c - ts1c) | |
| }; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment