Skip to content

Instantly share code, notes, and snippets.

@chikuzen
Last active December 21, 2015 16:09
Show Gist options
  • Save chikuzen/6331787 to your computer and use it in GitHub Desktop.
Save chikuzen/6331787 to your computer and use it in GitHub Desktop.
Transpose for avisynth2.6x
#include <stdint.h>
#include <malloc.h>
#include <windows.h>
#include <emmintrin.h>
#include "avisynth.h"
static const AVS_Linkage* AVS_linkage = 0;
static __forceinline __m128i mm_movehl_si128(const __m128i& xmm0)
{
return _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(xmm0),
_mm_castsi128_ps(xmm0)));
}
/* MSVC++10 does not loop unroll to my code...FUCK! */
static void __stdcall
proc_8x8(const uint8_t* srcp, uint8_t* dstp, int width, int height, int src_pitch,
int dst_pitch)
{
int hmod8 = (height / 8) * 8;
int sp = src_pitch / 16;
int dp = dst_pitch / 16;
for (int y = 0; y < hmod8; y += 8) {
__m128i* d = (__m128i*)(dstp + y);
for (int x = 0; x < width; x += 8) {
const __m128i* s = (__m128i*)(srcp + x);
__m128i xm0, xm1, xm2, xm3, xm4, xm5, xm6, xm7;
xm0 = _mm_loadl_epi64(s);
xm1 = _mm_loadl_epi64(s + sp);
xm2 = _mm_loadl_epi64(s + sp * 2);
xm3 = _mm_loadl_epi64(s + sp * 3);
xm4 = _mm_loadl_epi64(s + sp * 4);
xm5 = _mm_loadl_epi64(s + sp * 5);
xm6 = _mm_loadl_epi64(s + sp * 6);
xm7 = _mm_loadl_epi64(s + sp * 7);
xm0 = _mm_unpacklo_epi8(xm0, xm1);
xm1 = _mm_unpacklo_epi8(xm2, xm3);
xm2 = _mm_unpacklo_epi8(xm4, xm5);
xm3 = _mm_unpacklo_epi8(xm6, xm7);
xm4 = _mm_unpacklo_epi16(xm0, xm1);
xm5 = _mm_unpackhi_epi16(xm0, xm1);
xm6 = _mm_unpacklo_epi16(xm2, xm3);
xm7 = _mm_unpackhi_epi16(xm2, xm3);
xm0 = _mm_unpacklo_epi32(xm4, xm6);
xm1 = _mm_unpackhi_epi32(xm4, xm6);
xm2 = _mm_unpacklo_epi32(xm5, xm7);
xm3 = _mm_unpackhi_epi32(xm5, xm7);
xm4 = mm_movehl_si128(xm0);
xm5 = mm_movehl_si128(xm1);
xm6 = mm_movehl_si128(xm2);
xm7 = mm_movehl_si128(xm3);
_mm_storel_epi64(d, xm0);
d += dp;
_mm_storel_epi64(d, xm4);
d += dp;
_mm_storel_epi64(d, xm1);
d += dp;
_mm_storel_epi64(d, xm5);
d += dp;
_mm_storel_epi64(d, xm2);
d += dp;
_mm_storel_epi64(d, xm6);
d += dp;
_mm_storel_epi64(d, xm3);
d += dp;
_mm_storel_epi64(d, xm7);
d += dp;
}
srcp += src_pitch * 8;
}
for (int y = hmod8; y < height; y++) {
uint8_t* d = dstp + y;
for (int x = 0; x < width; x++) {
*d = srcp[x];
d += dst_pitch;
}
srcp += src_pitch;
}
}
static __forceinline void
MM_TRANSPOSE4_SI128(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
{
__m128 f0 = _mm_castsi128_ps(x0);
__m128 f1 = _mm_castsi128_ps(x1);
__m128 f2 = _mm_castsi128_ps(x2);
__m128 f3 = _mm_castsi128_ps(x3);
_MM_TRANSPOSE4_PS(f0, f1, f2, f3);
x0 = _mm_castps_si128(f0);
x1 = _mm_castps_si128(f1);
x2 = _mm_castps_si128(f2);
x3 = _mm_castps_si128(f3);
}
static __forceinline void mm_uplh_epi16(__m128i& x0, __m128i& x1)
{
__m128i t = _mm_unpackhi_epi16(x0, x1);
x0 = _mm_unpacklo_epi16(x0, x1);
x1 = t;
}
static __forceinline void mm_uplh_epi8(__m128i& x0, __m128i& x1)
{
__m128i t = _mm_unpackhi_epi8(x0, x1);
x0 = _mm_unpacklo_epi8(x0, x1);
x1 = t;
}
static void __stdcall
proc_16x16(const uint8_t* srcp, uint8_t* dstp, int width, int height,
int src_pitch, int dst_pitch)
{
int hmod16 = height / 16 * 16;
int dp = dst_pitch / 16;
int sp = src_pitch / 16;
for (int y = 0; y < hmod16; y += 16) {
__m128i* d = (__m128i*)(dstp + y);
for (int x = 0; x < width; x += 16) {
const __m128i* s = (__m128i*)(srcp + x);
__m128i xm0, xm1, xm2, xm3, xm4, xm5, xm6, xm7,
xm8, xm9, xm10, xm11, xm12, xm13, xm14, xm15;
xm0 = _mm_load_si128(s);
xm1 = _mm_load_si128(s + sp);
xm2 = _mm_load_si128(s + sp * 2);
xm3 = _mm_load_si128(s + sp * 3);
xm4 = _mm_load_si128(s + sp * 4);
xm5 = _mm_load_si128(s + sp * 5);
xm6 = _mm_load_si128(s + sp * 6);
xm7 = _mm_load_si128(s + sp * 7);
xm8 = _mm_load_si128(s + sp * 8);
xm9 = _mm_load_si128(s + sp * 9);
xm10 = _mm_load_si128(s + sp * 10);
xm11 = _mm_load_si128(s + sp * 11);
xm12 = _mm_load_si128(s + sp * 12);
xm13 = _mm_load_si128(s + sp * 13);
xm14 = _mm_load_si128(s + sp * 14);
xm15 = _mm_load_si128(s + sp * 15);
mm_uplh_epi8(xm0, xm1);
mm_uplh_epi8(xm2, xm3);
mm_uplh_epi8(xm4, xm5);
mm_uplh_epi8(xm6, xm7);
mm_uplh_epi8(xm8, xm9);
mm_uplh_epi8(xm10, xm11);
mm_uplh_epi8(xm12, xm13);
mm_uplh_epi8(xm14, xm15);
mm_uplh_epi16(xm0, xm2);
mm_uplh_epi16(xm1, xm3);
mm_uplh_epi16(xm4, xm6);
mm_uplh_epi16(xm5, xm7);
mm_uplh_epi16(xm8, xm10);
mm_uplh_epi16(xm9, xm11);
mm_uplh_epi16(xm12, xm14);
mm_uplh_epi16(xm13, xm15);
MM_TRANSPOSE4_SI128(xm0, xm4, xm8, xm12);
MM_TRANSPOSE4_SI128(xm1, xm5, xm9, xm13);
MM_TRANSPOSE4_SI128(xm2, xm6, xm10, xm14);
MM_TRANSPOSE4_SI128(xm3, xm7, xm11, xm15);
_mm_store_si128(d, xm0);
d += dp;
_mm_store_si128(d, xm4);
d += dp;
_mm_store_si128(d, xm8);
d += dp;
_mm_store_si128(d, xm12);
d += dp;
_mm_store_si128(d, xm2);
d += dp;
_mm_store_si128(d, xm6);
d += dp;
_mm_store_si128(d, xm10);
d += dp;
_mm_store_si128(d, xm14);
d += dp;
_mm_store_si128(d, xm1);
d += dp;
_mm_store_si128(d, xm5);
d += dp;
_mm_store_si128(d, xm9);
d += dp;
_mm_store_si128(d, xm13);
d += dp;
_mm_store_si128(d, xm3);
d += dp;
_mm_store_si128(d, xm7);
d += dp;
_mm_store_si128(d, xm11);
d += dp;
_mm_store_si128(d, xm15);
d += dp;
}
srcp += src_pitch * 16;
}
for (int y = hmod16; y < height; y++) {
uint8_t* d = dstp + y;
for (int x = 0; x < width; x++) {
*d = srcp[x];
d += dst_pitch;
}
srcp += src_pitch;
}
}
class Transpose: public GenericVideoFilter {
int bwidth;
uint8_t *buff;
int num_planes;
void (_stdcall *proc)(const uint8_t* srcp, uint8_t* dstp, int width,
int height, int src_pitch, int dst_pitch);
public:
Transpose(PClip child, int pf, IScriptEnvironment* env);
~Transpose();
PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env);
};
Transpose::Transpose(PClip c, int pf, IScriptEnvironment* env)
: GenericVideoFilter(c), num_planes(3)
{
if (!vi.IsPlanar() || vi.IsYV16() || vi.IsYV411()) {
env->ThrowError("Transpose: unsupported format.");
}
if (vi.IsY8()) {
num_planes = 1;
}
int t = vi.width;
vi.width = vi.height;
vi.height = t;
bwidth = ((vi.width + 15) / 16) * 16;
int bheight = ((vi.height + 15) / 16) * 16;
buff = (uint8_t *)_aligned_malloc(bwidth * bheight, 16);
if (!buff) {
env->ThrowError("Transpose: failed to allocate buffer.");
}
proc = pf == 0 ? proc_16x16 : proc_8x8;
}
Transpose::~Transpose()
{
_aligned_free(buff);
}
PVideoFrame __stdcall Transpose::GetFrame(int n, IScriptEnvironment* env)
{
const int planes[] = {PLANAR_Y, PLANAR_U, PLANAR_V};
PVideoFrame src = child->GetFrame(n, env);
PVideoFrame dst = env->NewVideoFrame(vi);
for (int i = 0; i < num_planes; i++) {
int p = planes[i];
const uint8_t* srcp = src->GetReadPtr(p);
if ((intptr_t)srcp & 15) {
env->ThrowError("Transepose: invalid memory alignment.");
}
proc(srcp, buff, src->GetRowSize(p), src->GetHeight(p),
src->GetPitch(p), bwidth);
env->BitBlt(dst->GetWritePtr(p), dst->GetPitch(p), buff, bwidth,
dst->GetRowSize(p), dst->GetHeight(p));
}
return dst;
}
static AVSValue __cdecl
create_transpose(AVSValue args, void* user_data, IScriptEnvironment* env)
{
if (!(env->GetCPUFlags() & CPUF_SSE2)) {
env->ThrowError("Transpose: requires SSE2 capable CPU.");
}
int proc = args[1].AsInt(0); // 0:16x16 1:8x8
if (proc < 0 || proc > 1) {
env->ThrowError("Transpose: proc must be set to 0 or 1.");
}
return new Transpose(args[0].AsClip(), proc, env);
}
extern "C" __declspec(dllexport) const char* __stdcall
AvisynthPluginInit3(IScriptEnvironment* env, const AVS_Linkage* const vectors)
{
AVS_linkage = vectors;
env->AddFunction("Transpose", "c[proc]i", create_transpose, 0);
return "TRANSPOSE";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment