Skip to content

Instantly share code, notes, and snippets.

@chikuzen
Last active April 19, 2017 11:49
Show Gist options
  • Save chikuzen/621af67a8aff2cb8d0b997d2bf5f29b3 to your computer and use it in GitHub Desktop.
Save chikuzen/621af67a8aff2cb8d0b997d2bf5f29b3 to your computer and use it in GitHub Desktop.
#include <cstdint>
#include <algorithm>
#include <iostream>
#include <tmmintrin.h>
// pattern0: (R + G + B) / 3
// pattern1: max(R, G, B)
// pattern2: (R + 2*G + B) / 4
// pattern3: R*0.299 + G*0.587 + B*0.114
struct RGBA {
uint8_t b, g, r, a;
};
enum CPU: int {
HAS_SSE2 = 0,
HAS_SSSE3,
};
template <CPU ARCH>
static inline void load_and_shuffle(const uint8_t* src, __m128i& rg, __m128i& ba)
{
auto t0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
auto t1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + 16));
if (ARCH < HAS_SSSE3) {
auto t2 = _mm_unpacklo_epi8(t0, t1); // r0r4g0g4b0b4a0a4r1r5g1g5b1b5a1a5
auto t3 = _mm_unpackhi_epi8(t0, t1); // r2r6g2g6b2b6a2a6r3r7g3g7b3b7a3a7
t0 = _mm_unpacklo_epi8(t2, t3); // r0r2r4r6g0g2g4g6b0b2b4b6a0a2a4a6
t1 = _mm_unpackhi_epi8(t2, t3); // r1r3r5r7g1g3g5g7b1b3b5b7a1a3a5a7
rg = _mm_unpacklo_epi8(t0, t1); // rrrrrrrrgggggggg
ba = _mm_unpackhi_epi8(t0, t1); // bbbbbbbbaaaaaaaa
} else {
const __m128i shmask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
t0 = _mm_shuffle_epi8(t0, shmask); // rrrrggggbbbbaaaa
t1 = _mm_shuffle_epi8(t1, shmask); // rrrrggggbbbbaaaa
rg = _mm_unpacklo_epi32(t0, t1); // rrrrrrrrgggggggg
ba = _mm_unpackhi_epi32(t0, t1); // bbbbbbbbaaaaaaaa
}
}
template <CPU ARCH>
static inline __m128i hadd_epi32(const __m128i& a, const __m128i& b)
{
if (ARCH < HAS_SSSE3) {
auto t0 = _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(2, 0, 2, 0));
auto t1 = _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(3, 1, 3, 1));
return _mm_add_epi32(_mm_castps_si128(t0), _mm_castps_si128(t1));
} else {
return _mm_hadd_epi32(a, b);
}
}
// srcとdstのstrideは等しいものとする
void pattern0_c(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
constexpr auto coef = static_cast<int>(32768.0 / 3 + 0.5); // 10923
for (size_t y = 0; y < height; ++y) {
auto s = reinterpret_cast<const RGBA*>(src);
auto d = reinterpret_cast<RGBA*>(dst);
for (size_t x = 0; x < width; ++x) {
int v = ((s[x].r + s[x].g + s[x].b) * coef + 16384) >> 15;
d[x].r = d[x].g = d[x].b = static_cast<uint8_t>(v);
d[x].a = s[x].a;
}
src += stride;
dst += stride;
}
}
// めんどいのでstrideは常に32の倍数とする
template <CPU ARCH>
void pattern0_simd(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
constexpr int16_t c = static_cast<int16_t>(32768.0 / 3 + 0.5);
const __m128i coef = _mm_setr_epi16(c, 1, c, 1, c, 1, c, 1);
const __m128i round = _mm_set1_epi16(16384);
const __m128i zero = _mm_setzero_si128();
for (size_t y = 0; y < height; ++y) {
for (size_t x = 0; x < width * 4; x += 32) {
__m128i rg, ba;
load_and_shuffle<ARCH>(src + x, rg, ba);
auto rgb0 = _mm_add_epi16(_mm_unpacklo_epi8(rg, zero), _mm_unpackhi_epi8(rg, zero));
rgb0 = _mm_add_epi16(rgb0, _mm_unpacklo_epi8(ba, zero));
auto rgb1 = _mm_unpackhi_epi16(rgb0, round);
rgb0 = _mm_unpacklo_epi16(rgb0, round);
rgb0 = _mm_madd_epi16(rgb0, coef);
rgb1 = _mm_madd_epi16(rgb1, coef);
rgb0 = _mm_srli_epi32(rgb0, 15);
rgb1 = _mm_srli_epi32(rgb1, 15);
rgb0 = _mm_packs_epi32(rgb0, rgb1);
rgb0 = _mm_packus_epi16(rgb0, rgb0);
rg = _mm_unpacklo_epi8(rgb0, rgb0);
ba = _mm_unpacklo_epi8(rgb0, _mm_srli_si128(ba, 8));
rgb0 = _mm_unpacklo_epi16(rg, ba);
rgb1 = _mm_unpackhi_epi16(rg, ba);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), rgb0);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x + 16), rgb1);
}
src += stride;
dst += stride;
}
}
void pattern1_c(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
for (size_t y = 0; y < height; ++y) {
auto s = reinterpret_cast<const RGBA*>(src);
auto d = reinterpret_cast<RGBA*>(dst);
for (size_t x = 0; x < width; ++x) {
int v = std::max(std::max(s[x].r, s[x].g), s[x].b);
d[x].r = d[x].g = d[x].b = static_cast<uint8_t>(v);
d[x].a = s[x].a;
}
src += stride;
dst += stride;
}
}
template <CPU ARCH>
void pattern1_simd(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
for (size_t y = 0; y < height; ++y) {
for (size_t x = 0; x < width * 4; x += 32) {
__m128i rg, ba;
load_and_shuffle<ARCH>(src + x, rg, ba);
auto maximum = _mm_max_epu8(rg, _mm_srli_si128(rg, 8));
maximum = _mm_max_epu8(maximum, ba);
rg = _mm_unpacklo_epi8(maximum, maximum);
ba = _mm_unpacklo_epi8(maximum, _mm_srli_si128(ba, 8));
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), _mm_unpacklo_epi16(rg, ba));
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x + 16), _mm_unpackhi_epi16(rg, ba));
}
src += stride;
dst += stride;
}
}
void pattern2_c(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
for (size_t y = 0; y < height; ++y) {
auto s = reinterpret_cast<const RGBA*>(src);
auto d = reinterpret_cast<RGBA*>(dst);
for (size_t x = 0; x < width; ++x) {
int v = (s[x].r + s[x].g + s[x].g + s[x].b + 2) / 4;
d[x].r = d[x].g = d[x].b = static_cast<uint8_t>(v);
d[x].a = s[x].a;
}
src += stride;
dst += stride;
}
}
template <CPU ARCH>
void pattern2_simd(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
for (size_t y = 0; y < height; ++y) {
for (size_t x = 0; x < width * 4; x += 32) {
__m128i rg, ba;
load_and_shuffle<ARCH>(src + x, rg, ba);
auto tmp = _mm_srli_si128(rg, 8);
rg = _mm_avg_epu8(rg, tmp);
tmp = _mm_subs_epu8(_mm_avg_epu8(tmp, ba), _mm_set1_epi8(1));
tmp = _mm_avg_epu8(tmp, rg);
rg = _mm_unpacklo_epi8(tmp, tmp);
ba = _mm_unpacklo_epi8(tmp, _mm_srli_si128(ba, 8));
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), _mm_unpacklo_epi16(rg, ba));
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x + 16), _mm_unpackhi_epi16(rg, ba));
}
src += stride;
dst += stride;
}
}
void pattern3_c(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
constexpr auto coef_r = static_cast<int>(0.299 * 32768 + 0.5);
constexpr auto coef_g = static_cast<int>(0.587 * 32768 + 0.5);
constexpr auto coef_b = static_cast<int>(0.114 * 32768 + 0.5);
for (size_t y = 0; y < height; ++y) {
auto s = reinterpret_cast<const RGBA*>(src);
auto d = reinterpret_cast<RGBA*>(dst);
for (size_t x = 0; x < width; ++x) {
int v = (s[x].r * coef_r + s[x].g * coef_g + s[x].b * coef_b + 16384) >> 15;
d[x].r = d[x].g = d[x].b = static_cast<uint8_t>(v);
d[x].a = s[x].a;
}
src += stride;
dst += stride;
}
}
template <CPU ARCH>
void pattern3_simd(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
constexpr auto cr = static_cast<int16_t>(0.299 * 32768 + 0.5);
constexpr auto cg = static_cast<int16_t>(0.587 * 32768 + 0.5);
constexpr auto cb = static_cast<int16_t>(0.114 * 32768 + 0.5);
const __m128i coef = _mm_setr_epi16(cb, cg, cr, 0, cb, cg, cr, 0);
const __m128i zero = _mm_setzero_si128();
for (size_t y = 0; y < height; ++y) {
for (size_t x = 0; x < width * 4; x += 16) {
auto s = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + x));
auto rgba0 = _mm_unpacklo_epi8(s, zero);
auto rgba1 = _mm_unpackhi_epi8(s, zero);
rgba0 = _mm_madd_epi16(rgba0, coef);
rgba1 = _mm_madd_epi16(rgba1, coef);
rgba0 = hadd_epi32<ARCH>(rgba0, rgba1);
rgba0 = _mm_add_epi32(rgba0, _mm_set1_epi32(16784));
rgba0 = _mm_srli_epi32(rgba0, 15);
rgba0 = _mm_or_si128(rgba0, _mm_slli_epi32(rgba0, 8));
rgba0 = _mm_or_si128(rgba0, _mm_slli_epi32(rgba0, 8));
rgba0 = _mm_or_si128(rgba0, _mm_and_si128(s, _mm_set1_epi32(0xFF000000)));
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), rgba0);
}
src += stride;
dst += stride;
}
}
void pattern0f_c(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
for (size_t y = 0; y < height; ++y) {
auto s = reinterpret_cast<const RGBA*>(src);
auto d = reinterpret_cast<RGBA*>(dst);
for (size_t x = 0; x < width; ++x) {
float v = (s[x].r + s[x].g + s[x].b) * 0.33333f + 0.5f;
d[x].r = d[x].g = d[x].b = static_cast<uint8_t>(v);
d[x].a = s[x].a;
}
src += stride;
dst += stride;
}
}
void pattern0f_simd(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
const __m128i mask = _mm_set1_epi32(0x000000FF);
for (size_t y = 0; y < height; ++y) {
for (size_t x = 0; x < width * 4; x += 16) {
auto s = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + x));
auto rgbi = _mm_add_epi32(_mm_and_si128(mask, s), _mm_and_si128(mask, _mm_srli_epi32(s, 8)));
rgbi = _mm_add_epi32(rgbi, _mm_and_si128(mask, _mm_srli_epi32(s, 16)));
rgbi = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(rgbi), _mm_set1_ps(0.33333f)));
rgbi = _mm_or_si128(rgbi, _mm_slli_epi32(rgbi, 8));
rgbi = _mm_or_si128(rgbi, _mm_slli_epi32(rgbi, 8));
rgbi = _mm_or_si128(rgbi, _mm_and_si128(s, _mm_set1_epi32(0xFF000000)));
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), rgbi);
}
src += stride;
dst += stride;
}
}
void pattern3f_c(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
for (size_t y = 0; y < height; ++y) {
auto s = reinterpret_cast<const RGBA*>(src);
auto d = reinterpret_cast<RGBA*>(dst);
for (size_t x = 0; x < width; ++x) {
auto v = (s[x].r * 0.299f + s[x].g * 0.587f + s[x].b * 0.114f + 0.5f);
d[x].r = d[x].g = d[x].b = static_cast<uint8_t>(v);
d[x].a = s[x].a;
}
src += stride;
dst += stride;
}
}
void pattern3f_simd(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
const __m128i mask = _mm_set1_epi32(0x000000FF);
for (size_t y = 0; y < height; ++y) {
for (size_t x = 0; x < width * 4; x += 16) {
auto s = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + x));
auto bf = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(mask, s)), _mm_set1_ps(0.114f));
auto gf = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(mask, _mm_srli_epi32(s, 8))), _mm_set1_ps(0.587f));
auto rf = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(mask, _mm_srli_epi32(s, 16))), _mm_set1_ps(0.299f));
auto rgbi = _mm_cvtps_epi32(_mm_add_ps(_mm_add_ps(bf, gf), rf));
rgbi = _mm_or_si128(rgbi, _mm_slli_epi32(rgbi, 8));
rgbi = _mm_or_si128(rgbi, _mm_slli_epi32(rgbi, 8));
rgbi = _mm_or_si128(rgbi, _mm_and_si128(s, _mm_set1_epi32(0xFF000000)));
_mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), rgbi);
}
src += stride;
dst += stride;
}
}
#if defined(BUILD_AVS_PLUGIN)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <avisynth.h>
class MyGrey: public GenericVideoFilter {
const int mode;
public:
MyGrey(PClip c, int m) : GenericVideoFilter(c), mode(m)
{
std::cerr << "mode: " << mode << std::endl;
}
~MyGrey() {}
PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env)
{
constexpr CPU ARCH = HAS_SSSE3;
auto src = child->GetFrame(n, env);
auto dst = env->NewVideoFrame(vi);
auto srcp = src->GetReadPtr();
auto dstp = dst->GetWritePtr();
size_t stride = dst->GetPitch();
switch (mode) {
case 0:
pattern0_c(srcp, dstp, vi.width, vi.height, stride);
break;
case 1:
pattern0_simd<ARCH>(srcp, dstp, vi.width, vi.height, stride);
break;
case 2:
pattern1_c(srcp, dstp, vi.width, vi.height, stride);
break;
case 3:
pattern1_simd<ARCH>(srcp, dstp, vi.width, vi.height, stride);
break;
case 4:
pattern2_c(srcp, dstp, vi.width, vi.height, stride);
break;
case 5:
pattern2_simd<ARCH>(srcp, dstp, vi.width, vi.height, stride);
break;
case 6:
pattern3_c(srcp, dstp, vi.width, vi.height, stride);
break;
case 7:
pattern3_simd<ARCH>(srcp, dstp, vi.width, vi.height, stride);
break;
case 8:
pattern0f_c(srcp, dstp, vi.width, vi.height, stride);
break;
case 9:
pattern0f_simd<ARCH>(srcp, dstp, vi.width, vi.height, stride);
break;
case 10:
pattern3f_c(srcp, dstp, vi.width, vi.height, stride);
break;
default:
pattern3f_simd<ARCH>(srcp, dstp, vi.width, vi.height, stride);
}
return dst;
}
};
static AVSValue __cdecl create(AVSValue args, void*, IScriptEnvironment* env)
{
PClip c = args[0].AsClip();
const VideoInfo& vi = c->GetVideoInfo();
if (!vi.IsRGB32()) {
env->ThrowError("not RGB32");
}
int pitch = (vi.width * 4 + 15) & ~15;
if (pitch % 32 != 0) {
env->ThrowError("pitch must be mod32");
}
int mode = std::min(std::max(args[1].AsInt(0), 0), 11);
return new MyGrey(c, mode);
}
const AVS_Linkage* AVS_linkage = nullptr;
extern "C" __declspec(dllexport) const char* __stdcall
AvisynthPluginInit3(IScriptEnvironment* env, const AVS_Linkage* const vectors)
{
AVS_linkage = vectors;
env->AddFunction("MyGrey", "c[mode]i", create, nullptr);
return nullptr;
}
#else
#include <random>
#include <chrono>
#include <vector>
#include <utility>
#include <stdexcept>
class RGBAFrame {
void* buff;
const size_t width;
const size_t height;
const size_t stride;
public:
RGBAFrame(size_t w, size_t h): width(w), height(h), stride((w * 4 + 31) & ~31)
{
buff = _mm_malloc(stride * height, 16);
if (!buff) {
throw std::runtime_error("failed to allocate memory");
}
}
~RGBAFrame() { _mm_free(buff); buff = nullptr; }
uint8_t* getPtr() const { return static_cast<uint8_t*>(buff); }
const size_t getStride() const { return stride; }
void fill_random()
{
std::random_device s;
std::mt19937 gen(s());
std::uniform_int_distribution<> dist(INT32_MIN, INT32_MAX);
size_t size = stride * height / sizeof(int32_t);
auto orig = static_cast<int32_t*>(buff);
for (size_t i = 0; i < size; ++i) {
orig[i] = dist(gen);
}
}
};
int main(void)
{
using namespace std::chrono;
using std::make_pair;
constexpr CPU ARCH = HAS_SSSE3;
constexpr int num = 100;
const size_t w = 1920, h = 1080;
auto src = RGBAFrame(w, h);
auto dst = RGBAFrame(w, h);
src.fill_random();
uint8_t *s = src.getPtr(), *d = dst.getPtr();
auto st = src.getStride();
std::vector<std::pair<const char*, duration<int64_t, std::nano>>> duration;
duration.reserve(12);
auto now = high_resolution_clock::now();
for (int i = 0; i < num; ++i) pattern0_c(s, d, w, h, st);
duration.emplace_back(make_pair("0-c", high_resolution_clock::now() - now));
now = high_resolution_clock::now();
for (int i = 0; i < num; ++i) pattern0_simd<ARCH>(s, d, w, h, st);
duration.emplace_back(make_pair("0-simd", high_resolution_clock::now() - now));
now = high_resolution_clock::now();
for (int i = 0; i < num; ++i) pattern1_c(s, d, w, h, st);
duration.emplace_back(make_pair("1-c", high_resolution_clock::now() - now));
now = high_resolution_clock::now();
for (int i = 0; i < num; ++i) pattern1_simd<ARCH>(s, d, w, h, st);
duration.emplace_back(make_pair("1-simd", high_resolution_clock::now() - now));
now = high_resolution_clock::now();
for (int i = 0; i < num; ++i) pattern2_c(s, d, w, h, st);
duration.emplace_back(make_pair("2-c", high_resolution_clock::now() - now));
now = high_resolution_clock::now();
for (int i = 0; i < num; ++i) pattern2_simd<ARCH>(s, d, w, h, st);
duration.emplace_back(make_pair("2-simd", high_resolution_clock::now() - now));
now = high_resolution_clock::now();
for (int i = 0; i < num; ++i) pattern3_c(s, d, w, h, st);
duration.emplace_back(make_pair("3-c", high_resolution_clock::now() - now));
now = high_resolution_clock::now();
for (int i = 0; i < num; ++i) pattern3_simd<ARCH>(s, d, w, h, st);
duration.emplace_back(make_pair("3-simd", high_resolution_clock::now() - now));
now = high_resolution_clock::now();
for (int i = 0; i < num; ++i) pattern0f_c(s, d, w, h, st);
duration.emplace_back(make_pair("0f-c", high_resolution_clock::now() - now));
now = high_resolution_clock::now();
for (int i = 0; i < num; ++i) pattern0f_simd(s, d, w, h, st);
duration.emplace_back(make_pair("0f-simd", high_resolution_clock::now() - now));
now = high_resolution_clock::now();
for (int i = 0; i < num; ++i) pattern3f_c(s, d, w, h, st);
duration.emplace_back(make_pair("3f-c", high_resolution_clock::now() - now));
now = high_resolution_clock::now();
for (int i = 0; i < num; ++i) pattern3f_simd(s, d, w, h, st);
duration.emplace_back(make_pair("3f-simd", high_resolution_clock::now() - now));
for (const auto& d : duration) {
std::cout << d.first << ": " << duration_cast<nanoseconds>(d.second).count() << " [nsec]\n";
}
return 0;
}
#endif // BUILD_AVS_PLUGIN
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment