Last active
February 25, 2016 11:26
-
-
Save nandor/cbc43cc57ffb448253df to your computer and use it in GitHub Desktop.
SSE + AVX Challenge
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// AVX example. Compile with: | |
// clang++ simd.cc -O3 -osimd -lopencv_core -lopencv_highgui -lopencv_imgproc -std=c++14 -mavx -mfma | |
#include <chrono> | |
#include <iostream> | |
#include <immintrin.h> | |
#include <opencv2/opencv.hpp> | |
// Alignment requirement. | |
static constexpr int ALIGN = 32; | |
static_assert((ALIGN & (ALIGN - 1)) == 0, "ALIGN must be a power of two."); | |
// Variance of the Gaussian distribution. | |
static constexpr float SIGMA = 0.2f; | |
/** | |
* Generates an element of a gaussian distribution. | |
*/ | |
float gaussian(int i, int r) { | |
float x = i / r; | |
return exp(-(x * x) / (2 * SIGMA * SIGMA)); | |
} | |
static const float G[5] = { | |
gaussian(-2, 2), | |
gaussian(-1, 2), | |
gaussian( 0, 2), | |
gaussian(+1, 2), | |
gaussian(+2, 2) | |
}; | |
/** | |
* Rounds a number to a power of two. | |
*/ | |
template<typename T> | |
T round(const T& n, const T& r) { | |
assert((r & (r - 1)) == 0); | |
if ((n & (r - 1)) == 0) { | |
// If number is already a multiple of the alignment, return. | |
return n; | |
} else { | |
// Otherwise, increase the number and round off the last bits. | |
return (n + r) & ~(r - 1); | |
} | |
} | |
/** | |
* Aligns a pointer to an address that is a multiple of a given number. | |
*/ | |
template<typename T> | |
T *align(T* ptr, size_t N) { | |
union { | |
uintptr_t addr; | |
T *ptr; | |
} u; | |
u.ptr = ptr; | |
u.addr = round(u.addr, N); | |
return u.ptr; | |
} | |
/** | |
* Special image that aligns all rows to a 32 byte boundary, as required by | |
* VMOVAPS. It should be noted that the performance penalty from misaligned | |
* data here is not as significant as the penalty of using an unaligned 16 | |
* byte load. | |
*/ | |
class AlignedImage { | |
public: | |
/** | |
* Creates an empty image. | |
* | |
* The allocated buffer is 32 bytes larger than the requested buffer. | |
* This is done in order for us to be able to move to slide the start | |
* address of the image to the nearest address that is aligned to a 32 byte | |
* boundary, which happens to be at most 32 bytes apart. | |
*/ | |
AlignedImage(int rows, int cols) | |
: rows_(rows) | |
, cols_(cols) | |
, stride_(round(cols * 4, ALIGN)) | |
, data_(std::make_unique<uint8_t[]>(rows_ * stride_ + ALIGN)) | |
, image_(static_cast<void*>(align(data_.get(), ALIGN))) | |
{ | |
} | |
/** | |
* Creates the image by copying a CV matrix. | |
*/ | |
explicit AlignedImage(const cv::Mat &mat) | |
: AlignedImage(mat.rows, mat.cols) | |
{ | |
assert(mat.channels() == 1); | |
assert(mat.type() == CV_32FC1); | |
for (int i = 0; i < mat.rows; ++i) { | |
memcpy(this->operator[](i), mat.ptr<float>(i), mat.cols * sizeof(float)); | |
} | |
} | |
/** | |
* Converts the image to an OpenCV matrix. | |
*/ | |
operator cv::Mat () { | |
// Make sure to clone the data so the matrix can be used even if the | |
// image goes out of scope and the unique pointer is deallocated. | |
return cv::Mat(rows_, cols_, CV_32FC1, image_, stride_).clone(); | |
} | |
float *operator[] (size_t n) { | |
assert(n < rows_); | |
return static_cast<float*>(image_) + n * (stride_ / sizeof(float)); | |
} | |
const float *operator[] (size_t n) const { | |
return (const_cast<AlignedImage*>(this))->operator[](n); | |
} | |
// Accessors for properties. | |
int getRows() const { return rows_; } | |
int getCols() const { return cols_; } | |
private: | |
// Size of the image. | |
int rows_; | |
int cols_; | |
// The stride represents the length of a single row, multiple of 32 bytes. | |
int stride_; | |
// Raw, unaligned data. | |
std::unique_ptr<uint8_t[]> data_; | |
// 32-byte aligned offset in the buffer. | |
void *image_; | |
}; | |
/** | |
* Applies the bilateral filter on an image. | |
* | |
* This implementation follows the one from kfusion: | |
* https://github.com/GerhardR/kfusion/blob/master/kfusion.cu#L133 | |
*/ | |
void bilateralFilterNaive(const AlignedImage &in, AlignedImage &out) { | |
assert(in.getRows() == out.getRows()); | |
assert(in.getCols() == out.getCols()); | |
for (int r = 2; r < in.getRows() - 2; ++r) { | |
for (int c = 2; c < in.getCols() - 2; ++c) { | |
float center = in[r][c]; | |
float accum = 0.0f; | |
float weight = 0.0f; | |
for (int i = -2; i <= 2; ++i) { | |
for (int j = -2; j <= 2; ++j) { | |
float pix = in[r + i][c + j]; | |
float d = pix - center; | |
float w = G[i + 2] * G[j + 2] * exp(-(d * d) / (2 * SIGMA * SIGMA)); | |
accum += w * pix; | |
weight += w; | |
} | |
} | |
out[r][c] = accum / weight; | |
} | |
} | |
} | |
/** | |
* Applies an AVX optimized bilateral filter to an image. | |
*/ | |
void bilateralFilterAVX(const AlignedImage &in, AlignedImage &out) { | |
assert(in.getRows() == out.getRows()); | |
assert(in.getCols() == out.getCols()); | |
for (int r = 2; r < in.getRows() - 2; ++r) { | |
for (int c = 2; c < in.getCols() - 2; ++c) { | |
out[r][c] = in[r][c]; | |
} | |
} | |
} | |
/** | |
* Entry point of the application. Loads an image & smooths it. | |
*/ | |
int main(int argc, char **argv) { | |
if (argc != 3) { | |
std::cerr << "Usage: simd [image] [avx|naive]" << std::endl; | |
return EXIT_FAILURE; | |
} | |
// Load the image using OpenCV and convert it to grayscale float. | |
cv::Mat gray; | |
{ | |
const auto rgb = cv::imread(argv[1], cv::IMREAD_COLOR); | |
cv::cvtColor(rgb, gray, CV_BGR2GRAY); | |
gray.convertTo(gray, CV_32FC1); | |
gray = gray / 255.0f; | |
} | |
// Create the input and output images. | |
AlignedImage input(gray); | |
AlignedImage output(gray.rows, gray.cols); | |
// Run the bilateral filter. | |
{ | |
const auto start = std::chrono::high_resolution_clock::now(); | |
{ | |
const std::string method(argv[2]); | |
if (method == "naive") { | |
bilateralFilterNaive(input, output); | |
} else if (method == "avx") { | |
bilateralFilterAVX(input, output); | |
} else { | |
std::cerr << "Invalid method." << std::endl; | |
return EXIT_FAILURE; | |
} | |
} | |
const auto end = std::chrono::high_resolution_clock::now(); | |
std::cout | |
<< "Duration: " | |
<< (std::chrono::duration<double>(end - start)).count() | |
<< std::endl; | |
} | |
// Show the output image. | |
cv::imshow("input", (cv::Mat)input); | |
cv::imshow("output", (cv::Mat)output); | |
while (cv::waitKey(0) != 'q'); | |
return EXIT_SUCCESS; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add
#include <memory>
forstd::unique_ptr<>