#define LOGO "RemoveGrain 1.0\n"
// An Avisynth plugin for removing grain from progressive video
//
// By Rainer Wittmann <[email protected]>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// To get a copy of the GNU General Public License write to the Free Software
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
// http://www.gnu.org/copyleft/gpl.html .
//#define MODIFYPLUGIN 1 // create Repair plugin instead of RemoveGrain, 0 = compatible with RemoveGrain
//#define SHARPEN 1
//#define BLUR 1
//#define SSE2_TEST // ISSE2 version that can be used side by side with the SSE version
//#define DEBUG_NAME // for debugging
//#define ISSE 2 // P4, Athlon 64, Sempron 3100
//#define ISSE 3 // Prescott P4
//#define CVERSION // for debugging only
#define ALIGNPITCH
#define SMOOTH2
#define DEFAULT_MODE 2
#define DEFAULT_RGLIMIT 0
#define VC_EXTRALEAN
#include <Windows.h>
#include <stdio.h>
#include <stdarg.h>
#include "avisynth.h"
#include "planar.h"
static IScriptEnvironment *AVSenvironment;
#ifdef SSE2_TEST
#ifndef ISSE
#define ISSE 2
#endif
#ifndef DEBUG_NAME
#define DEBUG_NAME
#endif
#endif
#ifndef ISSE
#define ISSE 1
#endif
#if ISSE > 1
#define CPUFLAGS CPUF_SSE2
#else
#define CPUFLAGS CPUF_INTEGER_SSE
#endif
#ifdef MODIFYPLUGIN
#define MAXMODE 18
#elif defined(SHARPEN)
#define MAXMODE 22
#define MAXSTRENGTH 2
#define DEFAULT_STRENGTH 1
#else
#define MAXMODE 28
#endif
#if defined(SHARPEN) && defined(MODIFYPLUGIN)
#error "SHARPEN cannot be combined with MODIFYPLUGIN"
#endif
#if defined(BLUR) && defined(MODIFYPLUGIN)
#error "BLUR cannot be combined with MODIFYPLUGIN"
#endif
#if 1
void debug_printf(const char *format, ...)
{
char buffer[200];
va_list args;
va_start(args, format);
_vsnprintf(buffer, sizeof(buffer) - 1, format, args);
buffer[sizeof(buffer) - 1] = 0;
va_end(args);
OutputDebugString(buffer);
}
#endif
#define COMPARE_MASK (~24)
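// ~24 clears bits 3 and 4 of pixel_type (presumably the YV12/I420 distinction in avisynth.h),
// so the two planar YUV layouts compare as the same colour space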
static void CompareVideoInfo(VideoInfo &vi1, const VideoInfo &vi2, const char *progname)
{
if( (vi1.width != vi2.width) || (vi1.height != vi2.height) || ( (vi1.pixel_type & COMPARE_MASK) != (vi2.pixel_type & COMPARE_MASK) ))
{
#if 1
debug_printf("widths = %u, %u, heights = %u, %u, color spaces = %X, %X\n"
, vi1.width, vi2.width, vi1.height, vi2.height, vi1.pixel_type, vi2.pixel_type);
#endif
AVSenvironment->ThrowError("%s: clips must be of equal type", progname);
}
if(vi1.num_frames > vi2.num_frames) vi1.num_frames = vi2.num_frames;
}
#ifdef TESTCOMPARE
unsigned testcompare(const BYTE *dp, int dpitch, const BYTE *pp, int ppitch, int width, int height)
{
int i = height;
--dp; --pp;
unsigned diffsum = 0;
do
{
int j = width;
do
{
int diff = dp[j] - pp[j];
if( diff < 0 ) diff = -diff;
diffsum += diff;
} while( --j );
dp += dpitch;
pp += ppitch;
} while( --i );
return diffsum;
}
#define xpitch 1
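// Scalar reference (TESTCOMPARE builds only) for the threshold-clipping modes: count how many
// of the eight neighbours are <= x (leq) and >= x (geq); if fewer than `threshold` neighbours
// lie on either side, merge-sort the eight values and replace x by the threshold-th smallest
// or largest, i.e. clip x to [sort2[threshold - 1], sort2[8 - threshold]].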
void RemoveGrain(BYTE *dp, int dpitch, const BYTE *sp, int spitch, int width, int height, int threshold)
{
int sinc = - (width + 1) * xpitch;
dpitch += sinc;
sinc += spitch;
do
{
dp[0] = sp[0];
dp += xpitch; sp += xpitch;
int i = width;
do
{
unsigned sort1[8];
int leq = 0;
int geq = 0;
unsigned x = sp[0];
if( (sort1[0] = (sp += xpitch)[0]) <= x )
{
if( sort1[0] == x ) ++geq;
++leq;
}
if( (sort1[1] = (sp += spitch)[0]) <= x )
{
if( sort1[1] == x ) ++geq;
++leq;
}
if( (sort1[2] = (sp -= xpitch)[0]) <= x )
{
if( sort1[2] == x ) ++geq;
++leq;
}
if( (sort1[3] = (sp -= xpitch)[0]) <= x )
{
if( sort1[3] >= x ) ++geq;
++leq;
}
if( (sort1[4] = (sp -= spitch)[0]) <= x )
{
if( sort1[4] >= x ) ++geq;
++leq;
}
if( (sort1[5] = (sp -= spitch)[0]) <= x )
{
if( sort1[5] >= x ) ++geq;
++leq;
}
if( (sort1[6] = (sp += xpitch)[0]) <= x )
{
if( sort1[6] >= x ) ++geq;
++leq;
}
if( (sort1[7] = (sp += xpitch)[0]) <= x )
{
if( sort1[7] >= x ) ++geq;
++leq;
}
if( ((geq += 8 - leq) < threshold) || (leq < threshold) )
{ // do a merge sort of sort1[8] as fast as possible
unsigned sort2[8];
if( sort1[1] < sort1[0] )
{
sort2[0] = sort1[1];
sort2[1] = sort1[0];
}
else
{
sort2[0] = sort1[0];
sort2[1] = sort1[1];
}
if( sort1[3] < sort1[2] )
{
sort2[2] = sort1[3];
sort2[3] = sort1[2];
}
else
{
sort2[2] = sort1[2];
sort2[3] = sort1[3];
}
if( sort1[5] < sort1[4] )
{
sort2[4] = sort1[5];
sort2[5] = sort1[4];
}
else
{
sort2[4] = sort1[4];
sort2[5] = sort1[5];
}
if( sort1[7] < sort1[6] )
{
sort2[6] = sort1[7];
sort2[7] = sort1[6];
}
else
{
sort2[6] = sort1[6];
sort2[7] = sort1[7];
}
if( sort2[0] > sort2[2] )
{
sort1[0] = sort2[2];
if( sort2[3] <= sort2[0] )
{
sort1[1] = sort2[3];
sort1[2] = sort2[0];
sort1[3] = sort2[1];
}
else
{
sort1[1] = sort2[0];
if( sort2[1] < sort2[3] )
{
sort1[2] = sort2[1];
sort1[3] = sort2[3];
}
else
{
sort1[2] = sort2[3];
sort1[3] = sort2[1];
}
}
}
else
{
sort1[0] = sort2[0];
if( sort2[1] <= sort2[2] )
{
sort1[1] = sort2[1];
sort1[2] = sort2[2];
sort1[3] = sort2[3];
}
else
{
sort1[1] = sort2[2];
if( sort2[3] < sort2[1] )
{
sort1[2] = sort2[3];
sort1[3] = sort2[1];
}
else
{
sort1[2] = sort2[1];
sort1[3] = sort2[3];
}
}
}
#if 0
if( (sort1[0] > sort1[1]) || (sort1[1] > sort1[2]) || (sort1[2] > sort1[3]) )
debug_printf("merge error: sort = %u, %u, %u, %u\n", sort1[0], sort1[1], sort1[2], sort1[3]);
#endif
if( sort2[4] > sort2[6] )
{
sort1[4] = sort2[6];
if( sort2[7] <= sort2[4] )
{
sort1[5] = sort2[7];
sort1[6] = sort2[4];
sort1[7] = sort2[5];
}
else
{
sort1[5] = sort2[4];
if( sort2[5] < sort2[7] )
{
sort1[6] = sort2[5];
sort1[7] = sort2[7];
}
else
{
sort1[6] = sort2[7];
sort1[7] = sort2[5];
}
}
}
else
{
sort1[4] = sort2[4];
if( sort2[5] <= sort2[6] )
{
sort1[5] = sort2[5];
sort1[6] = sort2[6];
sort1[7] = sort2[7];
}
else
{
sort1[5] = sort2[6];
if( sort2[7] < sort2[5] )
{
sort1[6] = sort2[7];
sort1[7] = sort2[5];
}
else
{
sort1[6] = sort2[5];
sort1[7] = sort2[7];
}
}
}
#if 0
if( (sort1[4] > sort1[5]) || (sort1[5] > sort1[6]) || (sort1[6] > sort1[7]) )
debug_printf("merge error: sort = %u, %u, %u, %u\n", sort1[4], sort1[5], sort1[6], sort1[7]);
#endif
unsigned *s1 = sort1, *s2 = sort1 + 4, *t = sort2;
*t++ = *s1 > *s2 ? *s2++ : *s1++;
*t++ = *s1 > *s2 ? *s2++ : *s1++;
*t++ = *s1 > *s2 ? *s2++ : *s1++;
if( sort1[3] > sort1[7] )
{
do
{
*t++ = *s1 > *s2 ? *s2++ : *s1++;
} while( s2 != sort1 + 8 );
do
{
*t++ = *s1++;
} while( s1 != sort1 + 4 );
}
else
{
do
{
*t++ = *s1 > *s2 ? *s2++ : *s1++;
} while( s1 != sort1 + 4 );
do
{
*t++ = *s2++;
} while( s2 != sort1 + 8 );
}
#if 0
if( (leq > 0) && (sort2[leq - 1] > x) ) debug_printf("leq = %u, x = %u, sort = %u,%u,%u,%u,%u,%u,%u,%u\n", leq, x, sort2[0], sort2[1], sort2[2], sort2[3], sort2[4], sort2[5], sort2[6], sort2[7]);
if( (leq < 8) && (sort2[leq] <= x) ) debug_printf("leq = %u, x = %u, sort = %u,%u,%u,%u,%u,%u,%u,%u\n", leq, x, sort2[0], sort2[1], sort2[2], sort2[3], sort2[4], sort2[5], sort2[6], sort2[7]);
if( (geq > 0) && (sort2[8 - geq] < x) ) debug_printf("geq = %u, x = %u, sort = %u,%u,%u,%u,%u,%u,%u,%u\n", geq, x, sort2[0], sort2[1], sort2[2], sort2[3], sort2[4], sort2[5], sort2[6], sort2[7]);
if( (geq < 8) && (sort2[7 - geq] >= x) ) debug_printf("geq = %u, x = %u, sort = %u,%u,%u,%u,%u,%u,%u,%u\n", geq, x, sort2[0], sort2[1], sort2[2], sort2[3], sort2[4], sort2[5], sort2[6], sort2[7]);
#endif
x = leq < threshold ? sort2[threshold - 1] : sort2[8 - threshold];
}
dp[0] = x;
dp += xpitch;
sp += spitch;
} while( --i );
dp[0] = sp[0];
dp += dpitch; sp += sinc;
} while( --height );
}
#undef xpitch
#endif // TESTCOMPARE
#if ISSE > 1
#define SSE_INCREMENT 16
#define SSE_SHIFT 4
#define SSE_MOVE movdqu
#if ISSE > 2
#define SSE3_MOVE lddqu
#else
#define SSE3_MOVE movdqu
#endif
#define SSE_RMOVE movdqa
#define SSE0 xmm0
#define SSE1 xmm1
#define SSE2 xmm2
#define SSE3 xmm3
#define SSE4 xmm4
#define SSE5 xmm5
#define SSE6 xmm6
#define SSE7 xmm7
#define SSE_EMMS
#else
#define SSE_INCREMENT 8
#define SSE_SHIFT 3
#define SSE_MOVE movq
#define SSE3_MOVE movq
#define SSE_RMOVE movq
#define SSE0 mm0
#define SSE1 mm1
#define SSE2 mm2
#define SSE3 mm3
#define SSE4 mm4
#define SSE5 mm5
#define SSE6 mm6
#define SSE7 mm7
#define SSE_EMMS __asm emms
#endif // ISSE
#if defined(SHARPEN) || defined(BLUR)
#define SHLUR
#endif
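// SHLUR marks the combined "sharpen or blur" code path used when either SHARPEN or BLUR is defined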
#if BLUR == 1
#define blur(center, min, max, reg1, reg2)\
__asm SSE_RMOVE reg2, center \
__asm psubusb max, center \
__asm psubusb reg2, min \
__asm SSE_RMOVE reg1, max \
__asm SSE_RMOVE min, reg2 \
__asm psubusb max, reg2 \
__asm psubusb min, reg1 \
__asm psrlw max, 1 \
__asm psrlw min, 1 \
__asm pminub reg2, max \
__asm pminub reg1, min \
__asm paddusb center, reg2 \
__asm psubusb center, reg1
#elif BLUR == 2
#define blur(center, min, max, reg1, reg2)\
__asm pminub center, max \
__asm pmaxub center, min \
__asm SSE_RMOVE reg2, center \
__asm psubusb max, center \
__asm psubusb reg2, min \
__asm SSE_RMOVE reg1, max \
__asm SSE_RMOVE min, reg2 \
__asm psubusb max, reg2 \
__asm psubusb min, reg1 \
__asm psrlw max, 1 \
__asm psrlw min, 1 \
__asm pminub reg2, max \
__asm pminub reg1, min \
__asm paddusb center, reg2 \
__asm psubusb center, reg1
#endif
#ifdef SHARPEN
static const __declspec(align(SSE_INCREMENT)) unsigned short rshift[3][SSE_INCREMENT / 2] =
{
{
0,0,0,0
#if SSE_INCREMENT == 16
,0,0,0,0
#endif
},
{
1,0,0,0
#if SSE_INCREMENT == 16
, 0,0,0,0
#endif
},
{
2,0,0,0
#if SSE_INCREMENT == 16
, 0,0,0,0
#endif
}
};
#define SHIFT_MASK0 255
#define SHIFT_MASK1 127
#define SHIFT_MASK2 63
static const __declspec(align(SSE_INCREMENT)) BYTE shift_mask[3][SSE_INCREMENT] =
{
{
SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0
#if SSE_INCREMENT == 16
, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0
#endif
},
{
SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1
#if SSE_INCREMENT == 16
, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1
#endif
},
{
SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2
#if SSE_INCREMENT == 16
, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2
#endif
}
};
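// psrlw shifts 16 bit words, not bytes, so shifting by rshift[strength] lets bits from the
// neighbouring high byte leak into each low byte; ANDing with shift_mask[strength]
// (0xFF, 0x7F, 0x3F) discards them, emulating a per-byte right shift by strength = 0, 1 or 2
// inside sharpen() below.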
#if SHARPEN == 1
// only sharpen
#define sharpen(center, min, max, rshift, SHIFT_MASK1, reg1, reg2)\
__asm SSE_RMOVE reg2, center \
__asm psubusb max, center \
__asm psubusb reg2, min \
__asm SSE_RMOVE reg1, max \
__asm SSE_RMOVE min, reg2 \
__asm psubusb max, reg2 \
__asm psubusb min, reg1 \
__asm psrlw reg2, rshift \
__asm psrlw reg1, rshift \
__asm pand reg2, SHIFT_MASK1 \
__asm pand reg1, SHIFT_MASK1 \
__asm pminub reg2, max \
__asm pminub reg1, min \
__asm psubusb center, reg2 \
__asm paddusb center, reg1
#elif SHARPEN == 2
// clip and sharpen
#define sharpen(center, min, max, rshift, SHIFT_MASK1, reg1, reg2)\
__asm pminub center, max \
__asm pmaxub center, min \
__asm SSE_RMOVE reg2, center \
__asm psubusb max, center \
__asm psubusb reg2, min \
__asm SSE_RMOVE reg1, max \
__asm SSE_RMOVE min, reg2 \
__asm psubusb max, reg2 \
__asm psubusb min, reg1 \
__asm psrlw reg2, rshift \
__asm psrlw reg1, rshift \
__asm pand reg2, SHIFT_MASK1 \
__asm pand reg1, SHIFT_MASK1 \
__asm pminub reg2, max \
__asm pminub reg1, min \
__asm psubusb center, reg2 \
__asm paddusb center, reg1
#endif
#endif // SHARPEN
#ifdef BLUR
#define sharpen(center, min, max, rshift, SHIFT_MASK1, reg1, reg2) blur(center, min, max, reg1, reg2)
#endif
#ifdef SHARPEN
void do_nothing(BYTE *dp, int dpitch, const BYTE *sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
{
}
void copy_plane(BYTE *dp, int dpitch, const BYTE *sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
{
AVSenvironment->BitBlt(dp, dpitch, sp, spitch, hblocks * SSE_INCREMENT + 2 * (SSE_INCREMENT + 1) + remainder, height);
}
#else // SHARPEN
void do_nothing(BYTE *dp, int dpitch, const BYTE *sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
}
void copy_plane(BYTE *dp, int dpitch, const BYTE *sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
AVSenvironment->BitBlt(dp, dpitch, sp, spitch, hblocks * SSE_INCREMENT + 2 * (SSE_INCREMENT + 1) + remainder, height);
}
#endif // SHARPEN
#define ins2(first, second, reg) \
__asm pmaxub second, reg \
__asm pminub second, first \
__asm pmaxub first, reg
#define ins3(first, second, third, reg) \
__asm pmaxub third, reg \
__asm pminub third, second \
ins2(first, second, reg)
#define ins4(first, second, third, fourth, reg) \
__asm pmaxub fourth, reg \
__asm pminub fourth, third \
ins3(first, second, third, reg)
#define ins5(first, second, third, fourth, fifth, reg) \
__asm pmaxub fifth, reg \
__asm pminub fifth, fourth \
ins4(first, second, third, fourth, reg)
#define ins6(first, second, third, fourth, fifth, sixth, reg) \
__asm pmaxub sixth, reg \
__asm pminub sixth, fifth \
ins5(first, second, third, fourth, fifth, reg)
#define add2(first, second, reg) \
__asm SSE_RMOVE second, reg \
__asm pminub second, first \
__asm pmaxub first, reg
#define add3(first, second, third, reg) \
__asm SSE_RMOVE third, reg \
__asm pminub third, second \
ins2(first, second, reg)
#define add4(first, second, third, fourth, reg) \
__asm SSE_RMOVE fourth, reg \
__asm pminub fourth, third \
ins3(first, second, third, reg)
#define add5(first, second, third, fourth, fifth, reg) \
__asm SSE_RMOVE fifth, reg \
__asm pminub fifth, fourth \
ins4(first, second, third, fourth, reg)
#define add6(first, second, third, fourth, fifth, sixth, reg) \
__asm SSE_RMOVE sixth, reg \
__asm pminub sixth, fifth \
ins5(first, second, third, fourth, fifth, reg)
#define sub2(first, second, val) \
__asm pmaxub second, val \
__asm pminub second, first
#define sub3(first, second, third, reg) \
__asm pmaxub third, reg \
__asm pminub third, second \
sub2(first, second, reg)
#define sub4(first, second, third, fourth, reg) \
__asm pmaxub fourth, reg \
__asm pminub fourth, third \
sub3(first, second, third, reg)
#define sub5(first, second, third, fourth, fifth, reg) \
__asm pmaxub fifth, reg \
__asm pminub fifth, fourth \
sub4(first, second, third, fourth, reg)
#define sub6(first, second, third, fourth, fifth, sixth, reg) \
__asm pmaxub sixth, reg \
__asm pminub sixth, fifth \
sub5(first, second, third, fourth, fifth, reg)
#define minmax1(min, max, val) \
__asm pminub min, val \
__asm pmaxub max, val
#define minmax2(max1, max2, min2, min1, reg) \
__asm pminub min2, reg \
__asm pmaxub max2, reg \
__asm pmaxub min2, min1 \
__asm pminub max2, max1 \
__asm pminub min1, reg \
__asm pmaxub max1, reg
#define minmax3(max1, max2, max3, min3, min2, min1, reg)\
__asm pminub min3, reg \
__asm pmaxub max3, reg \
__asm pmaxub min3, min2 \
__asm pminub max3, max2 \
minmax2(max1, max2, min2, min1, reg)
#define minmax2sub(max1, max2, min2, min1, val) \
__asm pminub min2, val \
__asm pmaxub max2, val \
__asm pmaxub min2, min1 \
__asm pminub max2, max1
#define minmax3sub(max1, max2, max3, min3, min2, min1, reg)\
__asm pminub min3, reg \
__asm pmaxub max3, reg \
__asm pmaxub min3, min2 \
__asm pminub max3, max2 \
minmax2sub(max1, max2, min2, min1, reg)
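// The macro families above maintain running order statistics with pminub/pmaxub.
// An illustrative scalar sketch of what minmax2() does per byte lane (assuming min1 <= min2
// hold the two smallest and max2 <= max1 the two largest values seen so far):
//   min2 = max(min(min2, v), min1);  max2 = min(max(max2, v), max1);
//   min1 = min(min1, v);             max1 = max(max1, v);
// add<n> starts such a network from a fresh value, ins<n>/sub<n> update parts of it.
// The SSE_RemoveGrain<k> routines below appear to use these networks to clip the centre pixel
// to the k-th smallest / k-th largest of its eight neighbours (modes 1 to 4).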
#ifdef SHARPEN
void SSE_RemoveGrain4(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
#else
void SSE_RemoveGrain4(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
#endif
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
#ifdef SHARPEN
__asm mov spitch, eax
__asm mov eax, strength
#endif
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE5, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
add4(SSE0, SSE1, SSE2, SSE3, SSE5)
__asm movd [edi], SSE5
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1]
sub5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE6)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2]
sub4(SSE1, SSE2, SSE3, SSE4, SSE5)
#if (ISSE > 1) || defined(SHLUR)
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
sub3(SSE2, SSE3, SSE4, SSE7)
#ifdef SHLUR
sharpen(SSE5, SSE4, SSE3, rshift[eax], shift_mask[eax], SSE0, SSE1)
__asm SSE_MOVE [edi + 1], SSE5
#else // SHLUR
#if ISSE > 1
__asm pmaxub SSE4, SSE5
#else
__asm pmaxub SSE4, [esi + ebx + 1]
#endif
__asm pminub SSE3, SSE4
__asm SSE_MOVE [edi + 1], SSE3
#endif // SHLUR
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE5, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
add4(SSE0, SSE1, SSE2, SSE3, SSE5)
#if MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE6, [esi + ebx + 1]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx]
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 1]
sub6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2]
sub5(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
#if ISSE > 1
__asm SSE3_MOVE SSE0, [edi]
#endif
sub4(SSE2, SSE3, SSE4, SSE5, SSE7)
#if ISSE > 1
__asm pmaxub SSE5, SSE0
#else
__asm pmaxub SSE5, [edi]
#endif
__asm add esi, SSE_INCREMENT
__asm pminub SSE3, SSE5
__asm SSE_MOVE [edi], SSE3
#else // MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1]
sub5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE6)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2]
sub4(SSE1, SSE2, SSE3, SSE4, SSE5)
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE5, [edi]
#else
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#endif
sub3(SSE2, SSE3, SSE4, SSE7)
#ifdef SHLUR
sharpen(SSE5, SSE4, SSE3, rshift[eax], shift_mask[eax], SSE0, SSE1)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE5
#else
#if ISSE > 1
__asm pmaxub SSE4, SSE5
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE4, [edi]
#else // ISSE > 1
__asm pmaxub SSE4, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm add esi, SSE_INCREMENT
__asm pminub SSE3, SSE4
__asm SSE_MOVE [edi], SSE3
#endif // SHLUR
#endif //MODIFYPLUGIN == 1
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE5, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
add4(SSE0, SSE1, SSE2, SSE3, SSE5)
#ifndef MODIFYPLUGIN
__asm SSE_MOVE [edi + 1], SSE7
#endif
#if MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE6, [esi + ebx + 1]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx]
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 1]
sub6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2]
sub5(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
#if ISSE > 1
__asm SSE3_MOVE SSE0, [edi]
#endif
sub4(SSE2, SSE3, SSE4, SSE5, SSE7)
#if ISSE > 1
__asm pmaxub SSE5, SSE0
#else
__asm pmaxub SSE5, [edi]
#endif
__asm pminub SSE3, SSE5
__asm add esi, eax
__asm SSE_MOVE [edi], SSE3
#else // MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1]
sub5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE6)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2]
sub4(SSE1, SSE2, SSE3, SSE4, SSE5)
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE5, [edi]
#else
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#endif
sub3(SSE2, SSE3, SSE4, SSE7)
#ifdef SHLUR
sharpen(SSE5, SSE4, SSE3, rshift[eax], shift_mask[eax], SSE0, SSE1)
#ifdef SHARPEN
__asm add esi, spitch
#else
__asm add esi, eax
#endif
__asm SSE_MOVE [edi], SSE5
#else
#if ISSE > 1
__asm pmaxub SSE4, SSE5
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE4, [edi]
#else // ISSE > 1
__asm pmaxub SSE4, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm pminub SSE3, SSE4
__asm add esi, eax
__asm SSE_MOVE [edi], SSE3
#endif // SHLUR
#endif //MODIFYPLUGIN == 1
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
#ifdef SHARPEN
void SSE_RemoveGrain1(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
#else
void SSE_RemoveGrain1(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
#endif
{
#ifdef SHARPEN
__asm mov ebx, strength
__asm SSE_RMOVE SSE2, rshift[ebx]
__asm SSE_RMOVE SSE3, shift_mask[ebx]
#endif
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE5, [esi + 1]
__asm SSE_RMOVE SSE1, SSE0
__asm SSE3_MOVE SSE4, [esi + 2]
minmax1(SSE0, SSE1, SSE5)
__asm SSE3_MOVE SSE5, [esi + 2*ebx]
minmax1(SSE0, SSE1, SSE4)
__asm SSE3_MOVE SSE4, [esi + 2*ebx + 1]
minmax1(SSE0, SSE1, SSE5)
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 2]
minmax1(SSE0, SSE1, SSE4)
__asm SSE3_MOVE SSE4, [esi + ebx]
minmax1(SSE0, SSE1, SSE5)
__asm SSE3_MOVE SSE5, [esi + ebx + 2]
minmax1(SSE0, SSE1, SSE4)
#if (ISSE > 1) || defined(SHLUR)
__asm SSE3_MOVE SSE7, [esi + ebx + 1]
#endif
__asm movd [edi], SSE4 // only for saving the first byte
minmax1(SSE0, SSE1, SSE5)
#ifdef SHLUR
sharpen(SSE7, SSE0, SSE1, SSE2, SSE3, SSE5, SSE4)
__asm SSE_MOVE [edi + 1], SSE7
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE7
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
__asm pminub SSE0, SSE1
__asm SSE_MOVE [edi + 1], SSE0
#endif // SHLUR
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE5, [esi + 1]
__asm SSE_RMOVE SSE1, SSE0
__asm SSE3_MOVE SSE4, [esi + 2]
minmax1(SSE0, SSE1, SSE5)
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
minmax1(SSE0, SSE1, SSE4)
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1]
minmax1(SSE0, SSE1, SSE6)
__asm SSE3_MOVE SSE4, [esi + 2*ebx + 2]
minmax1(SSE0, SSE1, SSE5)
__asm SSE3_MOVE SSE6, [esi + ebx]
minmax1(SSE0, SSE1, SSE4)
#if MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
minmax1(SSE0, SSE1, SSE6)
__asm SSE3_MOVE SSE4, [esi + ebx + 2]
minmax1(SSE0, SSE1, SSE5)
#if ISSE > 1
__asm SSE3_MOVE SSE7, [edi]
#endif
minmax1(SSE0, SSE1, SSE4)
#else // MODIFYPLUGIN
__asm SSE3_MOVE SSE5, [esi + ebx + 2]
minmax1(SSE0, SSE1, SSE6)
#if (ISSE > 1) || defined(SHLUR)
__asm SSE3_MOVE SSE7, [esi + ebx + 1]
#endif
minmax1(SSE0, SSE1, SSE5)
#endif // MODIFYPLUGIN
#ifdef SHLUR
sharpen(SSE7, SSE0, SSE1, SSE2, SSE3, SSE5, SSE4)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE7
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE7
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE0, [edi]
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm pminub SSE0, SSE1
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE0
#endif // SHLUR
__asm add edi, SSE_INCREMENT
#if ((MODIFYPLUGIN == 1) && (ISSE > 1)) || defined(SHLUR)
__asm dec ecx
__asm jnz middle_loop
#else
__asm loop middle_loop
#endif
// the last pixels
__asm add esi, edx
__asm add edi, edx
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE5, [esi + 1]
__asm SSE_RMOVE SSE1, SSE0
__asm SSE3_MOVE SSE4, [esi + 2]
minmax1(SSE0, SSE1, SSE5)
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
minmax1(SSE0, SSE1, SSE4)
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1]
minmax1(SSE0, SSE1, SSE6)
__asm SSE3_MOVE SSE4, [esi + 2*ebx + 2]
minmax1(SSE0, SSE1, SSE5)
__asm SSE3_MOVE SSE6, [esi + ebx]
minmax1(SSE0, SSE1, SSE4)
#if MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
minmax1(SSE0, SSE1, SSE6)
__asm SSE3_MOVE SSE4, [esi + ebx + 2]
minmax1(SSE0, SSE1, SSE5)
#if ISSE > 1
__asm SSE3_MOVE SSE7, [edi]
#endif
minmax1(SSE0, SSE1, SSE4)
#else // MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE5, [esi + ebx + 2]
minmax1(SSE0, SSE1, SSE6)
#if (ISSE > 1) || defined(SHLUR)
__asm SSE3_MOVE SSE7, [esi + ebx + 1]
#endif
#ifndef MODIFYPLUGIN
__asm SSE_MOVE [edi + 1], SSE5 // only for saving the last byte
#endif
minmax1(SSE0, SSE1, SSE5)
#endif // MODIFYPLUGIN == 1
#ifdef SHLUR
sharpen(SSE7, SSE0, SSE1, SSE2, SSE3, SSE5, SSE4)
__asm add esi, eax
__asm SSE_MOVE [edi], SSE7
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE7
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE0, [edi]
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm pminub SSE0, SSE1
__asm add esi, eax
__asm SSE_MOVE [edi], SSE0
#endif // SHLUR
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
#ifdef SHARPEN
void SSE_RemoveGrain2(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
#else
void SSE_RemoveGrain2(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
#endif
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
#ifdef SHARPEN
__asm mov spitch, eax
__asm mov eax, strength
#endif
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE7, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
__asm movd [edi], SSE7
__asm SSE3_MOVE SSE6, [esi + ebx + 2]
add4(SSE0, SSE1, SSE2, SSE3, SSE7)
__asm SSE3_MOVE SSE5, [esi + 2*ebx]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE6)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE5)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE7)
#if (ISSE > 1) || defined(SHLUR)
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
minmax2sub(SSE0, SSE1, SSE2, SSE3, SSE6)
#ifdef SHLUR
sharpen(SSE5, SSE2, SSE1, rshift[eax], shift_mask[eax], SSE6, SSE7)
__asm SSE_MOVE [edi + 1], SSE5
#else
#if ISSE > 1
__asm pmaxub SSE2, SSE5
#else
__asm pmaxub SSE2, [esi + ebx + 1]
#endif
__asm pminub SSE1, SSE2
__asm SSE_MOVE [edi + 1], SSE1
#endif // SHLUR
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE7, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
#if MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE4, [esi + ebx + 1]
#else
__asm SSE3_MOVE SSE6, [esi + ebx + 2]
#endif
add4(SSE0, SSE1, SSE2, SSE3, SSE7)
#if MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE6, [esi + ebx + 2]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE4)
#endif
__asm SSE3_MOVE SSE5, [esi + 2*ebx]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE6)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE5)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE7)
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE5, [edi]
#else
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#endif
minmax2sub(SSE0, SSE1, SSE2, SSE3, SSE6)
#ifdef SHLUR
sharpen(SSE5, SSE2, SSE1, rshift[eax], shift_mask[eax], SSE6, SSE7)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE5
#else
#if ISSE > 1
__asm pmaxub SSE2, SSE5
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE2, [edi]
#else
__asm pmaxub SSE2, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm pminub SSE1, SSE2
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE1
#endif // SHLUR
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE7, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
#if MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE4, [esi + ebx + 1]
#else
__asm SSE3_MOVE SSE6, [esi + ebx + 2]
#endif
add4(SSE0, SSE1, SSE2, SSE3, SSE7)
#if MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE6, [esi + ebx + 2]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE4)
#endif
#ifndef MODIFYPLUGIN
__asm SSE_MOVE [edi + 1], SSE6
#endif
__asm SSE3_MOVE SSE5, [esi + 2*ebx]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE6)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE5)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE7)
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE5, [edi]
#else
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#endif
minmax2sub(SSE0, SSE1, SSE2, SSE3, SSE6)
#ifdef SHLUR
sharpen(SSE5, SSE2, SSE1, rshift[eax], shift_mask[eax], SSE6, SSE7)
#ifdef SHARPEN
__asm add esi, spitch
#else
__asm add esi, eax
#endif
__asm SSE_MOVE [edi], SSE5
#else
#if ISSE > 1
__asm pmaxub SSE2, SSE5
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE2, [edi]
#else
__asm pmaxub SSE2, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm pminub SSE1, SSE2
__asm add esi, eax
__asm SSE_MOVE [edi], SSE1
#endif // SHLUR
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
#ifdef SHARPEN
void SSE_RemoveGrain3(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
#else
void SSE_RemoveGrain3(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
#endif
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
#ifdef SHARPEN
__asm mov spitch, eax
__asm mov eax, strength
#endif
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE5, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
add4(SSE0, SSE1, SSE2, SSE3, SSE5)
__asm movd [edi], SSE5
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
minmax3sub(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7)
#if (ISSE > 1) || defined(SHLUR)
__asm SSE3_MOVE SSE7, [esi + ebx + 1]
#endif
minmax2sub(SSE1, SSE2, SSE3, SSE4, SSE6)
#ifdef SHLUR
sharpen(SSE7, SSE3, SSE2, rshift[eax], shift_mask[eax], SSE0, SSE5)
__asm SSE_MOVE [edi + 1], SSE7
#else
#if ISSE > 1
__asm pmaxub SSE3, SSE7
#else
__asm pmaxub SSE3, [esi + ebx + 1]
#endif
__asm pminub SSE3, SSE2
__asm SSE_MOVE [edi + 1], SSE3
#endif // SHLUR
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE5, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
add4(SSE0, SSE1, SSE2, SSE3, SSE5)
#if MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE6, [esi + ebx + 1]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx]
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 1]
minmax3(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2]
minmax3sub(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
#if ISSE > 1
__asm SSE3_MOVE SSE6, [edi]
#endif
minmax2sub(SSE1, SSE2, SSE3, SSE4, SSE7)
#if ISSE > 1
__asm pmaxub SSE3, SSE6
#else
__asm pmaxub SSE3, [edi]
#endif
__asm pminub SSE3, SSE2
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE3
#else // MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
minmax3sub(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7)
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE7, [edi]
#else
__asm SSE3_MOVE SSE7, [esi + ebx + 1]
#endif
#endif
minmax2sub(SSE1, SSE2, SSE3, SSE4, SSE6)
#ifdef SHLUR
sharpen(SSE7, SSE3, SSE2, rshift[eax], shift_mask[eax], SSE0, SSE5)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE7
#else
#if ISSE > 1
__asm pmaxub SSE3, SSE7
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE3, [edi]
#else
__asm pmaxub SSE3, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm pminub SSE3, SSE2
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE3
#endif // SHLUR
#endif // MODIFYPLUGIN == 1
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE5, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
add4(SSE0, SSE1, SSE2, SSE3, SSE5)
#if MODIFYPLUGIN == 1
__asm SSE3_MOVE SSE6, [esi + ebx + 1]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx]
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 1]
minmax3(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2]
minmax3sub(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
#if ISSE > 1
__asm SSE3_MOVE SSE6, [edi]
#endif
minmax2sub(SSE1, SSE2, SSE3, SSE4, SSE7)
#if ISSE > 1
__asm pmaxub SSE3, SSE6
#else
__asm pmaxub SSE3, [edi]
#endif
__asm pminub SSE3, SSE2
__asm add esi, eax
__asm SSE_MOVE [edi], SSE3
#else // MODIFYPLUGIN == 1
#ifndef MODIFYPLUGIN
__asm SSE_MOVE [edi + 1], SSE7
#endif
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
minmax3sub(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7)
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE7, [edi]
#else
__asm SSE3_MOVE SSE7, [esi + ebx + 1]
#endif
#endif // ISSE > 1
minmax2sub(SSE1, SSE2, SSE3, SSE4, SSE6)
#ifdef SHLUR
sharpen(SSE7, SSE3, SSE2, rshift[eax], shift_mask[eax], SSE0, SSE5)
#ifdef SHARPEN
__asm add esi, spitch
#else
__asm add esi, eax
#endif
__asm SSE_MOVE [edi], SSE7
#else
#if ISSE > 1
__asm pmaxub SSE3, SSE7
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE3, [edi]
#else
__asm pmaxub SSE3, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm pminub SSE3, SSE2
__asm add esi, eax
__asm SSE_MOVE [edi], SSE3
#endif // SHLUR
#endif // MODIFYPLUGIN == 1
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
// if( weight2[i] <= weight1[i] ) { value1[i] = value2[i]; weight1[i] = weight2[i]; }
// value2 remains unchanged
// weight2 must be an SSE register; value1, value2 and weight1 may well be memory variables,
// but value1 and weight1 should be registers because they are used twice
#define mergeweighted(value1, weight1, value2, weight2) \
__asm pminub weight1, weight2 \
__asm pcmpeqb weight2, weight1 \
__asm psubusb value1, weight2 \
__asm pand weight2, value2 \
__asm por value1, weight2
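// merge2weighted: same selection as mergeweighted, but with two value pairs sharing the
// weights: where weight2[i] <= weight1[i], val1[i] = val1b[i] and val2[i] = val2b[i]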
#define merge2weighted(val1, val2, weight1, val1b, val2b, weight2) \
__asm pminub weight1, weight2 \
__asm pcmpeqb weight2, weight1 \
__asm psubusb val1, weight2 \
__asm psubusb val2, weight2 \
__asm pand val1b, weight2 \
__asm pand val2b, weight2 \
__asm por val1, val1b \
__asm por val2, val2b
#ifndef SHLUR
#if MODIFYPLUGIN > 0
#define diagweight5(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm pmaxub newp, center \
__asm pminub reg1, center \
__asm psubusb reg2, newp \
__asm pminub newp, oldp \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm pmaxub weight, reg2
#else
// the values bound1 and bound2 are loaded into SSE registers
// then oldp is clipped with min(bound1, bound2) and max(bound1, bound2)
// finally weight = |oldp - newp|
// oldp is left unchanged
#define diagweight5(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm psubusb reg2, newp \
__asm pminub newp, oldp \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm pmaxub weight, reg2
#endif
#ifdef MODIFYPLUGIN
#define diagweightw5(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) diagweight5(oldp, newp, weight, center, bound1, bound2, reg1, reg2)
#else
// same as diagweight5, but in addition bound2 is written to wmem
#define diagweightw5(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm psubusb reg2, newp \
__asm pminub newp, oldp \
__asm SSE_MOVE wmem, reg1 \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm pmaxub weight, reg2
#endif // MODIFYPLUGIN
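// diag5 (RemoveGrain build): for each of the four opposite-neighbour pairs around the centre
// pixel, clip the centre into [min(pair), max(pair)] and use |clipped - centre| as the weight;
// the pair causing the smallest change supplies the output value.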
void diag5(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diagweight5(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight5(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight5(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweightw5(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], [edi], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm SSE_MOVE [edi + 1], SSE1
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE0, [edi]
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#else
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
#endif
diagweight5(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight5(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight5(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight5(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE1
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE0, [edi]
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#else
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
#endif
diagweight5(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight5(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight5(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweightw5(SSE0, SSE3, SSE4, SSE5, [esi + ebx], [esi + ebx + 2], [edi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm SSE_MOVE [edi], SSE1
__asm add esi, eax
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
#if MODIFYPLUGIN > 0
#define diagweight6(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm pmaxub newp, center \
__asm pminub weight, center \
__asm psubusb reg2, newp \
__asm SSE_RMOVE reg1, newp \
__asm pminub newp, oldp \
__asm psubusb reg1, weight \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm pmaxub weight, reg2 \
__asm paddusb weight, weight \
__asm paddusb weight, reg1
#else
// the values bound1 and bound2 are loaded into SSE registers
// then oldp is clipped with min(bound1, bound2) and max(bound1, bound2)
// finally weight = 2*|oldp - newp| + |bound1 - bound2|
// oldp is left unchanged
#define diagweight6(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm psubusb reg2, newp \
__asm SSE_RMOVE reg1, newp \
__asm pminub newp, oldp \
__asm psubusb reg1, weight \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm pmaxub weight, reg2 \
__asm paddusb weight, weight \
__asm paddusb weight, reg1
#endif
#ifdef MODIFYPLUGIN
#define diagweightw6(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) diagweight6(oldp, newp, weight, center, bound1, bound2, reg1, reg2)
#else
// same as diagweight6, but in addition bound2 is written to wmem
#define diagweightw6(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm SSE_MOVE wmem, reg1 \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm psubusb reg2, newp \
__asm SSE_RMOVE reg1, newp \
__asm pminub newp, oldp \
__asm psubusb reg1, weight \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm pmaxub weight, reg2 \
__asm paddusb weight, weight \
__asm paddusb weight, reg1
#endif // MODIFYPLUGIN
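// diag6: same pair-wise clipping as diag5, but with weight = 2*|change| + |pair range|,
// so widely separated pairs are penalised and the change counts twice as much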
void diag6(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diagweight6(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight6(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight6(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweightw6(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], [edi], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm SSE_MOVE [edi + 1], SSE1
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE0, [edi]
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#else
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
#endif
diagweight6(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight6(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight6(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight6(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE1
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE0, [edi]
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#else
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
#endif
diagweight6(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight6(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight6(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweightw6(SSE0, SSE3, SSE4, SSE5, [esi + ebx], [esi + ebx + 2], [edi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm SSE_MOVE [edi], SSE1
__asm add esi, eax
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
#if MODIFYPLUGIN > 0
#define diagweight7(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm pmaxub newp, center \
__asm pminub weight, center \
__asm psubusb reg2, newp \
__asm SSE_RMOVE reg1, newp \
__asm pminub newp, oldp \
__asm psubusb reg1, weight \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm pmaxub weight, reg2 \
__asm paddusb weight, reg1
#else
// the values bound1 and bound2 are loaded into SSE registers
// then oldp is clipped with min(bound1, bound2) and max(bound1, bound2)
// finally weight = |oldp - newp| + |bound1 - bound2|
// oldp is left unchanged
#define diagweight7(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm psubusb reg2, newp \
__asm SSE_RMOVE reg1, newp \
__asm pminub newp, oldp \
__asm psubusb reg1, weight \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm pmaxub weight, reg2 \
__asm paddusb weight, reg1
#endif
#ifdef MODIFYPLUGIN
#define diagweightw7(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) diagweight7(oldp, newp, weight, center, bound1, bound2, reg1, reg2)
#else
// same as diagweight7, but in addition bound2 is written to wmem
#define diagweightw7(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm SSE_MOVE wmem, reg1 \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm psubusb reg2, newp \
__asm SSE_RMOVE reg1, newp \
__asm pminub newp, oldp \
__asm psubusb reg1, weight \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm pmaxub weight, reg2 \
__asm paddusb weight, reg1
#endif // MODIFYPLUGIN
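// diag7: as diag6, but with weight = |change| + |pair range| (change and pair spread weighted equally)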
void diag7(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diagweight7(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweightw7(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], [edi], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm SSE_MOVE [edi + 1], SSE1
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE0, [edi]
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#else
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
#endif
diagweight7(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE1
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE0, [edi]
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#else
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
#endif
diagweight7(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweightw7(SSE0, SSE3, SSE4, SSE5, [esi + ebx], [esi + ebx + 2], [edi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm SSE_MOVE [edi], SSE1
__asm add esi, eax
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
void diag7b(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diagweight7(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
__asm movd SSE6, [esi + ebx]
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm movd [edi], SSE6
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm SSE_MOVE [edi + 1], SSE1
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE0, [edi]
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#else
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
#endif
diagweight7(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE1
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE0, [edi]
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#else
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
#endif
diagweight7(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
__asm movd SSE6, [esi + ebx + 6]
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm movd [edi + 5], SSE6
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm SSE_MOVE [edi], SSE1
__asm add esi, eax
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
#if MODIFYPLUGIN > 0
#define diagweight8(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm pmaxub newp, center \
__asm pminub weight, center \
__asm psubusb reg2, newp \
__asm SSE_RMOVE reg1, newp \
__asm pminub newp, oldp \
__asm psubusb reg1, weight \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm paddusb reg1, reg1 \
__asm pmaxub weight, reg2 \
__asm paddusb weight, reg1
#else
// the values bound1 and bound2 are loaded into SSE registers
// then oldp is clipped with min(bound1, bound2) and max(bound1, bound2)
// finally weight = |oldp - newp| + 2*|bound1 - bound2|
// oldp is left unchanged
#define diagweight8(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm psubusb reg2, newp \
__asm SSE_RMOVE reg1, newp \
__asm pminub newp, oldp \
__asm psubusb reg1, weight \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm paddusb reg1, reg1 \
__asm pmaxub weight, reg2 \
__asm paddusb weight, reg1
#endif
#ifdef MODIFYPLUGIN
#define diagweightw8(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) diagweight8(oldp, newp, weight, center, bound1, bound2, reg1, reg2)
#else
// same as diagweight8, but in addition bound2 is written to wmem
#define diagweightw8(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) \
__asm SSE3_MOVE newp, bound1 \
__asm SSE3_MOVE reg1, bound2 \
__asm SSE_RMOVE weight, newp \
__asm SSE_RMOVE reg2, oldp \
__asm SSE_MOVE wmem, reg1 \
__asm pmaxub newp, reg1 \
__asm pminub weight, reg1 \
__asm psubusb reg2, newp \
__asm SSE_RMOVE reg1, newp \
__asm pminub newp, oldp \
__asm psubusb reg1, weight \
__asm pmaxub newp, weight \
__asm psubusb weight, oldp \
__asm paddusb reg1, reg1 \
__asm pmaxub weight, reg2 \
__asm paddusb weight, reg1
#endif // MODIFYPLUGIN
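// diag8: as diag7, but with weight = |change| + 2*|pair range| (pair spread counts twice as much)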
void diag8(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diagweight8(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight8(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight8(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweightw8(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], [edi], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm SSE_MOVE [edi + 1], SSE1
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE0, [edi]
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#else
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
#endif
diagweight8(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight8(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight8(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight8(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE1
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE0, [edi]
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#else
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
#endif
diagweight8(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7)
diagweight8(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweight8(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
diagweightw8(SSE0, SSE3, SSE4, SSE5, [esi + ebx], [esi + ebx + 2], [edi + 1], SSE6, SSE7)
mergeweighted(SSE1, SSE2, SSE3, SSE4)
__asm SSE_MOVE [edi], SSE1
__asm add esi, eax
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
#endif // #ifndef SHLUR
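// get_min_weight loads the two opposing neighbours mem1 and mem2:
//   min    = the smaller of the two (the Repair build also includes the source
//            centre pixel passed in 'center' in the min/max)
//   weight = the larger minus the smaller, i.e. the spread of the pair
// diag9 keeps the pair with the smallest spread and clips the centre to it.
// get_min_weightw additionally copies mem2 to wmem (border handling); the
// plain psubb there is safe because the maximum can never be below the minimum.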
#if MODIFYPLUGIN > 0
#define get_min_weight(min, weight, center, mem1, mem2, reg) \
__asm SSE3_MOVE min, mem1 \
__asm SSE3_MOVE reg, mem2 \
__asm SSE_RMOVE weight, min \
__asm pminub min, center \
__asm pmaxub weight, center \
__asm pminub min, reg \
__asm pmaxub weight, reg \
__asm psubusb weight, min
#else
#define get_min_weight(min, weight, center, mem1, mem2, reg) \
__asm SSE3_MOVE min, mem1 \
__asm SSE3_MOVE reg, mem2 \
__asm SSE_RMOVE weight, min \
__asm pminub min, reg \
__asm pmaxub weight, reg \
__asm psubusb weight, min
#endif
#ifdef MODIFYPLUGIN
#define get_min_weightw(min, weight, center, mem1, mem2, wmem, reg) get_min_weight(min, weight, center, mem1, mem2, reg)
#else
#define get_min_weightw(min, weight, center, mem1, mem2, wmem, reg) \
__asm SSE3_MOVE min, mem1 \
__asm SSE3_MOVE reg, mem2 \
__asm SSE_RMOVE weight, min \
__asm pminub min, reg \
__asm pmaxub weight, reg \
__asm SSE_MOVE wmem, reg \
__asm psubb weight, min
#endif // MODIFYPLUGIN
#ifdef SHARPEN
void diag9(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
#else
void diag9(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
#endif
{
#ifdef SHARPEN
__asm mov ebx, strength
__asm SSE_RMOVE SSE4, rshift[ebx]
__asm SSE_RMOVE SSE6, shift_mask[ebx]
#endif
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
get_min_weight(SSE0, SSE1, SSE5, [esi], [esi + 2*ebx + 2], SSE7)
get_min_weight(SSE2, SSE3, SSE5, [esi + 2*ebx], [esi + 2], SSE7)
mergeweighted(SSE0, SSE1, SSE2, SSE3)
get_min_weight(SSE2, SSE3, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE7)
mergeweighted(SSE0, SSE1, SSE2, SSE3)
get_min_weightw(SSE2, SSE3, SSE5, [esi + ebx + 2], [esi + ebx], [edi], SSE7)
mergeweighted(SSE0, SSE1, SSE2, SSE3)
#if (ISSE > 1) || defined(SHLUR)
__asm SSE3_MOVE SSE7, [esi + ebx + 1]
#endif
__asm paddusb SSE1, SSE0
#ifdef SHLUR
sharpen(SSE7, SSE0, SSE1, SSE4, SSE6, SSE2, SSE3)
__asm SSE_MOVE [edi + 1], SSE7
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE7
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
__asm pminub SSE0, SSE1
__asm SSE_MOVE [edi + 1], SSE0
#endif // SHLUR
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
get_min_weight(SSE0, SSE1, SSE5, [esi], [esi + 2*ebx + 2], SSE7)
get_min_weight(SSE2, SSE3, SSE5, [esi + 2*ebx], [esi + 2], SSE7)
mergeweighted(SSE0, SSE1, SSE2, SSE3)
get_min_weight(SSE2, SSE3, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE7)
mergeweighted(SSE0, SSE1, SSE2, SSE3)
get_min_weight(SSE2, SSE3, SSE5, [esi + ebx + 2], [esi + ebx], SSE7)
mergeweighted(SSE0, SSE1, SSE2, SSE3)
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE7, [edi]
#else
__asm SSE3_MOVE SSE7, [esi + ebx + 1]
#endif
#endif // (ISSE > 1) || defined(SHLUR)
__asm paddusb SSE1, SSE0
#ifdef SHLUR
sharpen(SSE7, SSE0, SSE1, SSE4, SSE6, SSE2, SSE3)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE7
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE7
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE0, [edi]
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm pminub SSE0, SSE1
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE0
#endif // SHLUR
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
get_min_weight(SSE0, SSE1, SSE5, [esi], [esi + 2*ebx + 2], SSE7)
get_min_weight(SSE2, SSE3, SSE5, [esi + 2*ebx], [esi + 2], SSE7)
mergeweighted(SSE0, SSE1, SSE2, SSE3)
get_min_weight(SSE2, SSE3, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE7)
mergeweighted(SSE0, SSE1, SSE2, SSE3)
get_min_weightw(SSE2, SSE3, SSE5, [esi + ebx], [esi + ebx + 2], [edi + 1], SSE7)
mergeweighted(SSE0, SSE1, SSE2, SSE3)
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE7, [edi]
#else
__asm SSE3_MOVE SSE7, [esi + ebx + 1]
#endif
#endif // (ISSE > 1) || defined(SHLUR)
__asm paddusb SSE1, SSE0
#ifdef SHLUR
sharpen(SSE7, SSE0, SSE1, SSE4, SSE6, SSE2, SSE3)
__asm add esi, eax
__asm SSE_MOVE [edi], SSE7
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE7
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE0, [edi]
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm pminub SSE0, SSE1
__asm add esi, eax
__asm SSE_MOVE [edi], SSE0
#endif // SHLUR
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
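// get_val_weight loads one neighbour from mem into val and sets
// weight = |center - val| (computed as max - min in byte arithmetic);
// the _w variants also copy the neighbour to wmem for the border columns.
// SSE_RemoveGrain10 below uses it to find the neighbour closest to the centre.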
#define get_val_weight(val, weight, mem, center, reg) \
__asm SSE3_MOVE val, mem \
__asm SSE_RMOVE weight, center \
__asm SSE_RMOVE reg, center \
__asm pmaxub weight, val \
__asm pminub reg, val \
__asm psubusb weight, reg
#ifdef MODIFYPLUGIN
#define get_val_weightw(val, weight, mem, center, wmem, reg) get_val_weight(val, weight, mem, center, reg)
#else
#define get_val_weightw1(val, weight, mem, center, wmem, reg) \
__asm SSE3_MOVE val, mem \
__asm SSE_RMOVE weight, center \
__asm SSE_RMOVE reg, center \
__asm pmaxub weight, val \
__asm pminub reg, val \
__asm movd wmem, val \
__asm psubusb weight, reg
#define get_val_weightw(val, weight, mem, center, wmem, reg) \
__asm SSE3_MOVE val, mem \
__asm SSE_RMOVE weight, center \
__asm SSE_RMOVE reg, center \
__asm pmaxub weight, val \
__asm pminub reg, val \
__asm SSE_MOVE wmem, val \
__asm psubusb weight, reg
#endif // MODIFYPLUGIN
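// mode 10: replace the centre pixel by the neighbourhood value that is
// closest to it, i.e. the candidate with the smallest |center - candidate|;
// the first and last block of each row also pass the border pixel through.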
void SSE_RemoveGrain10(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE1, [esi + ebx + 1]
get_val_weightw1(SSE2, SSE3, [esi + ebx], SSE1, [edi], SSE7)
get_val_weight(SSE4, SSE5, [esi + ebx + 2], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 1], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2*ebx], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2*ebx + 2], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2*ebx + 1], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
__asm SSE_MOVE SSE4, SSE2
__asm pminub SSE1, SSE2
__asm pmaxub SSE1, SSE4
__asm SSE_MOVE [edi + 1], SSE1
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE1, [edi]
#else
__asm SSE3_MOVE SSE1, [esi + ebx + 1]
#endif
get_val_weight(SSE2, SSE3, [esi + ebx], SSE1, SSE7)
#if MODIFYPLUGIN == 1
get_val_weight(SSE4, SSE5, [esi + ebx + 1], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
#endif
get_val_weightw(SSE4, SSE5, [esi + ebx + 2], SSE1, [edi + 1], SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 1], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2*ebx], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2*ebx + 2], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2*ebx + 1], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
__asm SSE_MOVE SSE4, SSE2
__asm pminub SSE1, SSE2
__asm add esi, SSE_INCREMENT
__asm pmaxub SSE1, SSE4
__asm SSE_MOVE [edi], SSE1
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE1, [edi]
#else
__asm SSE3_MOVE SSE1, [esi + ebx + 1]
#endif
get_val_weight(SSE2, SSE3, [esi + ebx], SSE1, SSE7)
#if MODIFYPLUGIN == 1
get_val_weight(SSE4, SSE5, [esi + ebx + 1], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
#endif
get_val_weight(SSE4, SSE5, [esi + ebx + 2], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 1], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2*ebx], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2*ebx + 2], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
get_val_weight(SSE4, SSE5, [esi + 2*ebx + 1], SSE1, SSE7)
mergeweighted(SSE2, SSE3, SSE4, SSE5)
__asm SSE_MOVE SSE4, SSE2
__asm pminub SSE1, SSE2
__asm pmaxub SSE1, SSE4
__asm add esi, eax
__asm SSE_MOVE [edi], SSE1
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
#if !(defined(MODIFYPLUGIN) || defined(SHLUR))
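// neighbourdiff computes, per byte, the gap between the centre pixel
// (passed in center2; a copy is left in center1) and one neighbour:
//   minus = center - neighbour if the neighbour is darker, 0 if equal,
//           255 (saturated) if it is brighter; plus is the mirror image.
// Taking pminub of these over all eight neighbours therefore yields the
// smallest step down and the smallest step up to any neighbour.
// Because the macro consumes its centre operand, successive calls alternate
// the roles of the two centre registers.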
#define neighbourdiff(minus, plus, center1, center2, neighbour, nullreg) \
__asm SSE_RMOVE center1, center2 \
__asm psubusb center2, neighbour \
__asm psubusb neighbour, center1 \
__asm SSE_RMOVE minus, center2 \
__asm SSE_RMOVE plus, neighbour \
__asm pcmpeqb center2, nullreg \
__asm pcmpeqb neighbour, nullreg \
__asm por minus, center2 \
__asm pand center2, neighbour \
__asm por plus, neighbour \
__asm psubusb minus, center2 \
__asm psubusb plus, center2
#define neighbourdiff_w(minus, plus, center1, center2, dest, neighbour, nullreg, mwrite) \
__asm SSE_RMOVE center1, center2 \
__asm mwrite dest, neighbour \
__asm psubusb center2, neighbour \
__asm psubusb neighbour, center1 \
__asm SSE_RMOVE minus, center2 \
__asm SSE_RMOVE plus, neighbour \
__asm pcmpeqb center2, nullreg \
__asm pcmpeqb neighbour, nullreg \
__asm por minus, center2 \
__asm pand center2, neighbour \
__asm por plus, neighbour \
__asm psubusb minus, center2 \
__asm psubusb plus, center2
#define SHIFT_MASK1 127
static const __declspec(align(SSE_INCREMENT)) BYTE shift_mask[SSE_INCREMENT] =
{
SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1
#if SSE_INCREMENT == 16
, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1
#endif
};
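// sharpen nudges the centre towards the nearer extreme of its neighbourhood:
// it adds min(plus/2, minus - plus) and subtracts min(minus/2, plus - minus),
// with minus/plus being the minimal downward/upward steps from neighbourdiff.
// The psrlw/pand pair emulates a per-byte shift right by one: SHIFT_MASK1
// clears the bit shifted in from the adjacent byte, since there is no
// byte-granular shift instruction.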
#define sharpen(center, minus, plus, reg1, reg2)\
__asm SSE_RMOVE reg1, minus \
__asm SSE_RMOVE reg2, plus \
__asm psubusb reg1, plus \
__asm psubusb reg2, minus \
__asm psrlw plus, 1 \
__asm psrlw minus, 1 \
__asm pand plus, shift_mask \
__asm pand minus, shift_mask \
__asm pminub plus, reg1 \
__asm pminub minus, reg2 \
__asm paddusb center, plus \
__asm psubusb center, minus
void nondestructivesharpen(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
__asm mov eax, hblocks
__asm mov ebx, spitch
__asm mov edx, remainder
__asm pxor SSE0, SSE0
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE1, [esi + ebx + 1]
__asm SSE3_MOVE SSE3, [esi + ebx]
neighbourdiff_w(SSE4, SSE5, SSE2, SSE1, [edi], SSE3, SSE0, movd)
__asm SSE3_MOVE SSE3, [esi + ebx + 2]
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi]
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 1]
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2]
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2*ebx]
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2*ebx + 1]
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2*ebx + 2]
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
sharpen(SSE1, SSE4, SSE5, SSE6, SSE7)
__asm SSE_MOVE [edi + 1], SSE1
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE1, [esi + ebx + 1]
__asm SSE3_MOVE SSE3, [esi + ebx]
neighbourdiff(SSE4, SSE5, SSE2, SSE1, SSE3, SSE0)
__asm SSE3_MOVE SSE3, [esi + ebx + 2]
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi]
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 1]
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2]
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2*ebx]
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2*ebx + 1]
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2*ebx + 2]
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm add esi, SSE_INCREMENT
sharpen(SSE1, SSE4, SSE5, SSE6, SSE7)
__asm SSE_MOVE [edi], SSE1
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
__asm SSE3_MOVE SSE1, [esi + ebx + 1]
__asm SSE3_MOVE SSE3, [esi + ebx]
neighbourdiff(SSE4, SSE5, SSE2, SSE1, SSE3, SSE0)
__asm SSE3_MOVE SSE3, [esi + ebx + 2]
neighbourdiff_w(SSE6, SSE7, SSE1, SSE2, [edi + 1], SSE3, SSE0, SSE_MOVE)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi]
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 1]
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2]
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2*ebx]
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2*ebx + 1]
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm SSE3_MOVE SSE3, [esi + 2*ebx + 2]
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0)
__asm pminub SSE4, SSE6
__asm pminub SSE5, SSE7
__asm add esi, eax
sharpen(SSE1, SSE4, SSE5, SSE6, SSE7)
__asm SSE_MOVE [edi], SSE1
__asm add edi, dpitch
__asm dec height
__asm jnz column_loop
}
#endif // !(defined(MODIFYPLUGIN) || defined(SHLUR))
#ifndef MODIFYPLUGIN
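// convolution implements the 3x3 kernel
//    1 2 1
//    2 4 2    (sum 16, rounding bias 8)
//    1 2 1
// by unpacking the bytes to words, summing, adding the bias and shifting
// right by 4; the _w1/_w2 variants additionally copy the untouched border
// pixel of the first/last block. Used by SSE_RemoveGrain11 (mode 11).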
#define convolution(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr + spitch + 1] \
__asm SSE_MOVE reg1, reg0 \
__asm punpcklbw reg0, nullreg \
__asm punpckhbw reg1, nullreg \
__asm SSE3_MOVE reg2, [saddr + spitch] \
__asm paddusw reg0, reg0 \
__asm SSE_MOVE reg3, reg2 \
__asm paddusw reg1, reg1 \
__asm punpcklbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + spitch + 2] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm SSE3_MOVE reg2, [saddr + 1] \
__asm paddusw reg0, reg4 \
__asm SSE_MOVE reg3, reg2 \
__asm paddusw reg1, reg5 \
__asm punpcklbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 1] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm SSE3_MOVE reg2, [saddr] \
__asm paddusw reg0, reg4 \
__asm SSE_MOVE reg3, reg2 \
__asm paddusw reg1, reg5 \
__asm punpcklbw reg2, nullreg \
__asm paddusw reg0, reg0 \
__asm paddusw reg1, reg1 \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + 2] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \
__asm paddusw reg0, reg4 \
__asm SSE_MOVE reg3, reg2 \
__asm paddusw reg1, reg5 \
__asm punpcklbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 2] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg1, reg5 \
__asm paddusw reg0, bias_correction \
__asm paddusw reg1, bias_correction \
__asm psraw reg0, 4 \
__asm psraw reg1, 4 \
__asm packuswb reg0, reg1 \
__asm SSE_MOVE [daddr], reg0
#define convolution_w1(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr + spitch + 1] \
__asm SSE_MOVE reg1, reg0 \
__asm punpcklbw reg0, nullreg \
__asm punpckhbw reg1, nullreg \
__asm SSE3_MOVE reg2, [saddr + spitch] \
__asm paddusw reg0, reg0 \
__asm SSE_MOVE reg3, reg2 \
__asm movd [daddr], reg2 \
__asm paddusw reg1, reg1 \
__asm punpcklbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + spitch + 2] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm SSE3_MOVE reg2, [saddr + 1] \
__asm paddusw reg0, reg4 \
__asm SSE_MOVE reg3, reg2 \
__asm paddusw reg1, reg5 \
__asm punpcklbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 1] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm SSE3_MOVE reg2, [saddr] \
__asm paddusw reg0, reg4 \
__asm SSE_MOVE reg3, reg2 \
__asm paddusw reg1, reg5 \
__asm punpcklbw reg2, nullreg \
__asm paddusw reg0, reg0 \
__asm paddusw reg1, reg1 \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + 2] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \
__asm paddusw reg0, reg4 \
__asm SSE_MOVE reg3, reg2 \
__asm paddusw reg1, reg5 \
__asm punpcklbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 2] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg1, reg5 \
__asm paddusw reg0, bias_correction \
__asm paddusw reg1, bias_correction \
__asm psraw reg0, 4 \
__asm psraw reg1, 4 \
__asm packuswb reg0, reg1 \
__asm SSE_MOVE [daddr + 1], reg0
#define convolution_w2(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr + spitch + 1] \
__asm SSE_MOVE reg1, reg0 \
__asm punpcklbw reg0, nullreg \
__asm punpckhbw reg1, nullreg \
__asm SSE3_MOVE reg2, [saddr + spitch] \
__asm paddusw reg0, reg0 \
__asm SSE_MOVE reg3, reg2 \
__asm paddusw reg1, reg1 \
__asm punpcklbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + spitch + 2] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm SSE_MOVE [daddr + 1], reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm SSE3_MOVE reg2, [saddr + 1] \
__asm paddusw reg0, reg4 \
__asm SSE_MOVE reg3, reg2 \
__asm paddusw reg1, reg5 \
__asm punpcklbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 1] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm SSE3_MOVE reg2, [saddr] \
__asm paddusw reg0, reg4 \
__asm SSE_MOVE reg3, reg2 \
__asm paddusw reg1, reg5 \
__asm punpcklbw reg2, nullreg \
__asm paddusw reg0, reg0 \
__asm paddusw reg1, reg1 \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + 2] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \
__asm paddusw reg0, reg4 \
__asm SSE_MOVE reg3, reg2 \
__asm paddusw reg1, reg5 \
__asm punpcklbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 2] \
__asm paddusw reg0, reg2 \
__asm SSE_MOVE reg5, reg4 \
__asm paddusw reg1, reg3 \
__asm punpcklbw reg4, nullreg \
__asm punpckhbw reg5, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg1, reg5 \
__asm paddusw reg0, bias_correction \
__asm paddusw reg1, bias_correction \
__asm psraw reg0, 4 \
__asm psraw reg1, 4 \
__asm packuswb reg0, reg1 \
__asm SSE_MOVE [daddr], reg0
static const __declspec(align(SSE_INCREMENT)) unsigned short convolution_bias[SSE_INCREMENT/2] =
{
8,8,8,8
#if SSE_INCREMENT == 16
,8,8,8,8
#endif
};
void SSE_RemoveGrain11(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
#ifdef CVERSION
_sp -= spitch;
int width = (hblocks + 2) * SSE_INCREMENT + remainder;
int spitch2 = spitch - width;
dpitch -= width;
do
{
int w = width;
dp[0] = _sp[spitch];
do
{
*++dp = (2*(_sp[spitch] + 2 * _sp[spitch + 1] + _sp[spitch + 2] + _sp[1] + _sp[2 * spitch + 1])
+ _sp[0] + _sp[2] + _sp[2 * spitch] + _sp[2 * spitch + 2] + 8) / 16;
++_sp;
} while( --w );
dp[1] = _sp[spitch + 1];
dp += dpitch;
_sp += spitch2;
} while( --height );
#else
__asm SSE_RMOVE SSE7, convolution_bias
__asm mov eax, hblocks
__asm mov ebx, spitch
__asm pxor SSE6, SSE6
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
convolution_w1(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5)
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
convolution(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5)
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
convolution_w2(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5)
__asm add esi, eax
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
#endif
}
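// flatconvolution takes the plain arithmetic mean of the 3x3 neighbourhood
// (mode 20): the nine pixels are summed as words, FLATBIAS is added for
// rounding, and the division by 9 is done with pmulhuw against ONENINETH,
// a scaled reciprocal of 9 (about 65536/9).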
#define flatconvolution(daddr, saddr, spitch, nullreg, onenineth, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg1, [saddr + 1] \
__asm SSE_RMOVE reg2, reg0 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg0, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg4, [saddr + 2] \
__asm SSE3_MOVE reg1, [saddr + ebx] \
__asm SSE_RMOVE reg5, reg4 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg4, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg5, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg2, reg5 \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg4, [saddr + ebx + 1] \
__asm SSE3_MOVE reg1, [saddr + ebx + 2] \
__asm SSE_RMOVE reg5, reg4 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg4, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg5, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg2, reg5 \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg4, [saddr + 2*ebx] \
__asm SSE3_MOVE reg1, [saddr + 2*ebx + 1] \
__asm SSE_RMOVE reg5, reg4 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg4, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg5, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg2, reg5 \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg1, [saddr + 2*ebx + 2] \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm paddusw reg0, flatconvolution_bias \
__asm paddusw reg2, flatconvolution_bias \
__asm pmulhuw reg0, onenineth \
__asm pmulhuw reg2, onenineth \
__asm packuswb reg0, reg2 \
__asm SSE_MOVE [daddr], reg0
#define flatconvolution_w1(daddr, saddr, spitch, nullreg, onenineth, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg1, [saddr + 1] \
__asm SSE_RMOVE reg2, reg0 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg0, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg4, [saddr + 2] \
__asm SSE3_MOVE reg1, [saddr + ebx] \
__asm SSE_RMOVE reg5, reg4 \
__asm movd [daddr], reg1 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg4, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg5, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg2, reg5 \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg4, [saddr + ebx + 1] \
__asm SSE3_MOVE reg1, [saddr + ebx + 2] \
__asm SSE_RMOVE reg5, reg4 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg4, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg5, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg2, reg5 \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg4, [saddr + 2*ebx] \
__asm SSE3_MOVE reg1, [saddr + 2*ebx + 1] \
__asm SSE_RMOVE reg5, reg4 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg4, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg5, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg2, reg5 \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg1, [saddr + 2*ebx + 2] \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm paddusw reg0, flatconvolution_bias \
__asm paddusw reg2, flatconvolution_bias \
__asm pmulhuw reg0, onenineth \
__asm pmulhuw reg2, onenineth \
__asm packuswb reg0, reg2 \
__asm SSE_MOVE [daddr + 1], reg0
#define flatconvolution_w2(daddr, saddr, spitch, nullreg, onenineth, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg1, [saddr + 1] \
__asm SSE_RMOVE reg2, reg0 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg0, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg2, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg4, [saddr + 2] \
__asm SSE3_MOVE reg1, [saddr + ebx] \
__asm SSE_RMOVE reg5, reg4 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg4, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg5, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg2, reg5 \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg4, [saddr + ebx + 1] \
__asm SSE3_MOVE reg1, [saddr + ebx + 2] \
__asm SSE_RMOVE reg5, reg4 \
__asm SSE_MOVE [daddr + 1], reg1 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg4, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg5, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg2, reg5 \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg4, [saddr + 2*ebx] \
__asm SSE3_MOVE reg1, [saddr + 2*ebx + 1] \
__asm SSE_RMOVE reg5, reg4 \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg4, nullreg \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg5, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg4 \
__asm paddusw reg2, reg5 \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm SSE3_MOVE reg1, [saddr + 2*ebx + 2] \
__asm SSE_RMOVE reg3, reg1 \
__asm punpcklbw reg1, nullreg \
__asm punpckhbw reg3, nullreg \
__asm paddusw reg0, reg1 \
__asm paddusw reg2, reg3 \
__asm paddusw reg0, flatconvolution_bias \
__asm paddusw reg2, flatconvolution_bias \
__asm pmulhuw reg0, onenineth \
__asm pmulhuw reg2, onenineth \
__asm packuswb reg0, reg2 \
__asm SSE_MOVE [daddr], reg0
#define FLATBIAS 4
static const __declspec(align(SSE_INCREMENT)) unsigned short flatconvolution_bias[SSE_INCREMENT/2] =
{
FLATBIAS, FLATBIAS, FLATBIAS, FLATBIAS
#if SSE_INCREMENT == 16
, FLATBIAS, FLATBIAS, FLATBIAS, FLATBIAS
#endif
};
#define ONENINETH (unsigned short)(((1u << 16) + 4) / 9)
static const __declspec(align(SSE_INCREMENT)) unsigned short onenineth[SSE_INCREMENT/2] =
{
ONENINETH, ONENINETH, ONENINETH, ONENINETH
#if SSE_INCREMENT == 16
, ONENINETH, ONENINETH, ONENINETH, ONENINETH
#endif
};
void SSE_RemoveGrain20(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
#ifdef CVERSION
_sp -= spitch;
int width = (hblocks + 2) * SSE_INCREMENT + remainder;
int spitch2 = spitch - width;
dpitch -= width;
do
{
int w = width;
dp[0] = _sp[spitch];
do
{
*++dp = (BYTE)((_sp[0] + _sp[1] + _sp[2] + _sp[spitch] + _sp[spitch + 1] + _sp[spitch + 2] + _sp[2 * spitch]
+ _sp[2 * spitch + 1] + _sp[2 * spitch + 2] + 4) / 9);
++_sp;
} while( --w );
dp[1] = _sp[spitch + 1];
dp += dpitch;
_sp += spitch2;
} while( --height );
#else
__asm SSE_RMOVE SSE7, onenineth
__asm mov eax, hblocks
__asm mov ebx, spitch
__asm pxor SSE6, SSE6
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
flatconvolution_w1(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5)
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
flatconvolution(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5)
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
flatconvolution_w2(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5)
__asm add esi, eax
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
#endif
}
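// fconvolution approximates the same 1-2-1 kernel as mode 11 with nested
// pavgb byte averages instead of word arithmetic (mode 12); subtracting
// fconvolution_bias (1) counters the upward rounding that pavgb accumulates.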
#if ISSE > 1
#define fconvolution(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5)\
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg1, [saddr + 2] \
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 2]\
__asm pavgb reg0, reg1 \
__asm pavgb reg2, reg3 \
__asm SSE3_MOVE reg1, [saddr + 1] \
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 1]\
__asm pavgb reg0, reg1 \
__asm SSE3_MOVE reg4, [saddr + spitch] \
__asm SSE3_MOVE reg5, [saddr + spitch + 2]\
__asm pavgb reg2, reg3 \
__asm SSE3_MOVE reg1, [saddr + spitch + 1]\
__asm pavgb reg0, reg2 \
__asm pavgb reg4, reg5 \
__asm psubusb reg0, bias_correction \
__asm pavgb reg1, reg4 \
__asm pavgb reg0, reg1 \
__asm SSE_MOVE [daddr], reg0
#define fconvolution_w1(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5)\
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg1, [saddr + 2] \
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 2]\
__asm pavgb reg0, reg1 \
__asm pavgb reg2, reg3 \
__asm SSE3_MOVE reg1, [saddr + 1] \
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 1]\
__asm pavgb reg0, reg1 \
__asm SSE3_MOVE reg4, [saddr + spitch] \
__asm SSE3_MOVE reg5, [saddr + spitch + 2]\
__asm pavgb reg2, reg3 \
__asm movd [daddr], reg4 \
__asm SSE3_MOVE reg1, [saddr + spitch + 1]\
__asm pavgb reg0, reg2 \
__asm pavgb reg4, reg5 \
__asm psubusb reg0, bias_correction \
__asm pavgb reg1, reg4 \
__asm pavgb reg0, reg1 \
__asm SSE_MOVE [daddr + 1], reg0
#define fconvolution_w2(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5)\
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg1, [saddr + 2] \
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 2]\
__asm pavgb reg0, reg1 \
__asm pavgb reg2, reg3 \
__asm SSE3_MOVE reg1, [saddr + 1] \
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 1]\
__asm pavgb reg0, reg1 \
__asm SSE3_MOVE reg4, [saddr + spitch] \
__asm SSE3_MOVE reg5, [saddr + spitch + 2]\
__asm pavgb reg2, reg3 \
__asm SSE3_MOVE reg1, [saddr + spitch + 1]\
__asm pavgb reg0, reg2 \
__asm pavgb reg4, reg5 \
__asm psubusb reg0, bias_correction \
__asm pavgb reg1, reg4 \
__asm SSE_MOVE [daddr + 1], reg5 \
__asm pavgb reg0, reg1 \
__asm SSE_MOVE [daddr], reg0
#else
#define fconvolution(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \
__asm pavgb reg0, [saddr + 2] \
__asm pavgb reg2, [saddr + 2*spitch + 2]\
__asm pavgb reg0, [saddr + 1] \
__asm pavgb reg2, [saddr + 2*spitch + 1]\
__asm SSE3_MOVE reg4, [saddr + spitch] \
__asm pavgb reg0, reg2 \
__asm pavgb reg4, [saddr + spitch + 2]\
__asm psubusb reg0, bias_correction \
__asm pavgb reg4, [saddr + spitch + 1]\
__asm pavgb reg0, reg4 \
__asm SSE_MOVE [daddr], reg0
#define fconvolution_w1(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \
__asm pavgb reg0, [saddr + 2] \
__asm pavgb reg2, [saddr + 2*spitch + 2]\
__asm pavgb reg0, [saddr + 1] \
__asm pavgb reg2, [saddr + 2*spitch + 1]\
__asm SSE3_MOVE reg4, [saddr + spitch] \
__asm pavgb reg0, reg2 \
__asm movd [daddr], reg4 \
__asm pavgb reg4, [saddr + spitch + 2]\
__asm psubusb reg0, bias_correction \
__asm pavgb reg4, [saddr + spitch + 1]\
__asm pavgb reg0, reg4 \
__asm SSE_MOVE [daddr + 1], reg0
#define fconvolution_w2(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \
__asm pavgb reg0, [saddr + 2] \
__asm pavgb reg2, [saddr + 2*spitch + 2]\
__asm pavgb reg0, [saddr + 1] \
__asm pavgb reg2, [saddr + 2*spitch + 1]\
__asm SSE3_MOVE reg4, [saddr + spitch + 2]\
__asm pavgb reg0, reg2 \
__asm SSE_MOVE [daddr + 1], reg4 \
__asm pavgb reg4, [saddr + spitch] \
__asm psubusb reg0, bias_correction \
__asm pavgb reg4, [saddr + spitch + 1]\
__asm pavgb reg0, reg4 \
__asm SSE_MOVE [daddr], reg0
#endif // ISSE
static const __declspec(align(SSE_INCREMENT)) unsigned char fconvolution_bias[SSE_INCREMENT] =
{
1,1,1,1,1,1,1,1
#if SSE_INCREMENT == 16
,1,1,1,1,1,1,1,1
#endif
};
void SSE_RemoveGrain12(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
#ifdef CVERSION
_sp -= spitch;
int width = (hblocks + 2) * SSE_INCREMENT + remainder;
int spitch2 = spitch - width;
dpitch -= width;
do
{
int w = width;
dp[0] = _sp[spitch];
do
{
*++dp = ((((_sp[0] + _sp[2] + 1) / 2 + _sp[1] + 1) / 2 + ((_sp[2*spitch] + _sp[2*spitch + 2] + 1) / 2 + _sp[2*spitch + 1] + 1) / 2 + 1)/2
+ ((_sp[spitch] + _sp[spitch + 2] + 1) / 2 + _sp[spitch + 1] + 1) / 2) / 2;
++_sp;
} while( --w );
dp[1] = _sp[spitch + 1];
dp += dpitch;
_sp += spitch2;
} while( --height );
#else
__asm SSE_RMOVE SSE7, fconvolution_bias
__asm mov eax, hblocks
__asm mov ebx, spitch
__asm pxor SSE6, SSE6
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
fconvolution_w1(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5)
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
fconvolution(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5)
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
fconvolution_w2(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5)
__asm add esi, eax
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
#endif
}
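// rconvolution averages the eight surrounding pixels, ignoring the centre
// (mode 19), again with nested pavgb averages and a bias of 1 to counter
// the accumulated upward rounding.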
#if ISSE > 1
#define rconvolution(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg2, [saddr + 2] \
__asm SSE3_MOVE reg1, [saddr + 2*spitch] \
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 2]\
__asm pavgb reg0, reg2 \
__asm pavgb reg1, reg3 \
__asm SSE3_MOVE reg2, [saddr + spitch] \
__asm SSE3_MOVE reg3, [saddr + 1] \
__asm pavgb reg0, reg1 \
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 1]\
__asm SSE3_MOVE reg5, [saddr + spitch + 2]\
__asm pavgb reg2, reg4 \
__asm pavgb reg3, reg5 \
__asm psubusb reg0, bias_correction \
__asm pavgb reg2, reg3 \
__asm pavgb reg0, reg2 \
__asm SSE_MOVE [daddr], reg0
#define rconvolution_w1(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg2, [saddr + 2] \
__asm SSE3_MOVE reg1, [saddr + 2*spitch] \
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 2]\
__asm pavgb reg0, reg2 \
__asm pavgb reg1, reg3 \
__asm SSE3_MOVE reg2, [saddr + spitch] \
__asm SSE3_MOVE reg3, [saddr + 1] \
__asm pavgb reg0, reg1 \
__asm movd [daddr], reg2 \
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 1]\
__asm SSE3_MOVE reg5, [saddr + spitch + 2]\
__asm pavgb reg2, reg4 \
__asm pavgb reg3, reg5 \
__asm psubusb reg0, bias_correction \
__asm pavgb reg2, reg3 \
__asm pavgb reg0, reg2 \
__asm SSE_MOVE [daddr + 1], reg0
#define rconvolution_w2(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg2, [saddr + 2] \
__asm SSE3_MOVE reg1, [saddr + 2*spitch] \
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 2]\
__asm pavgb reg0, reg2 \
__asm pavgb reg1, reg3 \
__asm SSE3_MOVE reg2, [saddr + spitch] \
__asm SSE3_MOVE reg3, [saddr + 1] \
__asm pavgb reg0, reg1 \
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 1]\
__asm SSE3_MOVE reg5, [saddr + spitch + 2]\
__asm pavgb reg2, reg4 \
__asm pavgb reg3, reg5 \
__asm SSE_MOVE [daddr + 1], reg5 \
__asm pavgb reg2, reg3 \
__asm psubusb reg0, bias_correction \
__asm pavgb reg0, reg2 \
__asm SSE_MOVE [daddr], reg0
#else
#define rconvolution(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5)\
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg1, [saddr + 2*spitch] \
__asm pavgb reg0, [saddr + 2] \
__asm pavgb reg1, [saddr + 2*spitch + 2]\
__asm SSE3_MOVE reg2, [saddr + spitch] \
__asm SSE3_MOVE reg3, [saddr + 1] \
__asm pavgb reg0, reg1 \
__asm pavgb reg2, [saddr + 2*spitch + 1]\
__asm pavgb reg3, [saddr + spitch + 2]\
__asm psubusb reg0, bias_correction \
__asm pavgb reg2, reg3 \
__asm pavgb reg0, reg2 \
__asm SSE_MOVE [daddr], reg0
#define rconvolution_w1(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5)\
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg1, [saddr + 2*spitch] \
__asm pavgb reg0, [saddr + 2] \
__asm pavgb reg1, [saddr + 2*spitch + 2]\
__asm SSE3_MOVE reg2, [saddr + spitch] \
__asm SSE3_MOVE reg3, [saddr + 1] \
__asm movd [daddr], reg2 \
__asm pavgb reg0, reg1 \
__asm pavgb reg2, [saddr + 2*spitch + 1]\
__asm pavgb reg3, [saddr + spitch + 2]\
__asm psubusb reg0, bias_correction \
__asm pavgb reg2, reg3 \
__asm pavgb reg0, reg2 \
__asm SSE_MOVE [daddr + 1], reg0
#define rconvolution_w2(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5)\
__asm SSE3_MOVE reg0, [saddr] \
__asm SSE3_MOVE reg1, [saddr + 2*spitch] \
__asm pavgb reg0, [saddr + 2] \
__asm pavgb reg1, [saddr + 2*spitch + 2]\
__asm SSE3_MOVE reg2, [saddr + spitch] \
__asm SSE3_MOVE reg3, [saddr + spitch + 2]\
__asm pavgb reg0, reg1 \
__asm SSE_MOVE [daddr + 1], reg3 \
__asm pavgb reg2, [saddr + 2*spitch + 1]\
__asm pavgb reg3, [saddr + 1] \
__asm psubusb reg0, bias_correction \
__asm pavgb reg2, reg3 \
__asm pavgb reg0, reg2 \
__asm SSE_MOVE [daddr], reg0
#endif
#define rbias 1
static const __declspec(align(SSE_INCREMENT)) unsigned char rconvolution_bias[SSE_INCREMENT] =
{
rbias, rbias, rbias, rbias, rbias, rbias, rbias, rbias
#if SSE_INCREMENT == 16
, rbias, rbias, rbias, rbias, rbias, rbias, rbias, rbias
#endif
};
void SSE_RemoveGrain19(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
#ifdef CVERSION
_sp -= spitch;
int width = (hblocks + 2) * SSE_INCREMENT + remainder;
int spitch2 = spitch - width;
dpitch -= width;
do
{
int w = width;
dp[0] = _sp[spitch];
do
{
*++dp = (_sp[0] + _sp[1] + _sp[2] + _sp[spitch] + _sp[spitch + 2] + _sp[2*spitch] + _sp[2*spitch + 1] + _sp[2*spitch + 2] + 4) / 8;
++_sp;
} while( --w );
dp[1] = _sp[spitch + 1];
dp += dpitch;
_sp += spitch2;
} while( --height );
#else
__asm SSE_RMOVE SSE7, rconvolution_bias
__asm mov eax, hblocks
__asm mov ebx, spitch
__asm pxor SSE6, SSE6
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
rconvolution_w1(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5)
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
__asm align 16
__asm middle_loop:
rconvolution(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5)
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
rconvolution_w2(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5)
__asm add esi, eax
__asm add edi, dpitch
__asm dec height
__asm jnz column_loop
#endif
}
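// DeringA examines one opposing neighbour pair (p1, p2) of the 3x3 window:
//   udiff = how far the centre lies above max(p1, p2), clipped to the pair's spread
//   ldiff = how far the centre lies below min(p1, p2), clipped to the pair's spread
// SSE_RemoveGrain23 takes the largest udiff/ldiff over the four pairs and
// DeringM then subtracts that excess from / adds that shortfall to the centre.
// Illustrative only: a plain-C per-pixel reading of mode 23 (not compiled in;
// the function name and the pitch-addressed layout below are ours).
#if 0
static BYTE dering23_reference(const BYTE *p, int pitch)    // p = top-left of the 3x3 window
{
    const int c = p[pitch + 1];
    const int a1[4] = { p[0],            p[2],          p[pitch],     p[1] };
    const int a2[4] = { p[2*pitch + 2],  p[2*pitch],    p[pitch + 2], p[2*pitch + 1] };
    int udiff = 0, ldiff = 0;
    for (int i = 0; i < 4; ++i)
    {
        int lo = a1[i] < a2[i] ? a1[i] : a2[i];
        int hi = a1[i] < a2[i] ? a2[i] : a1[i];
        int range = hi - lo;
        int u = c - hi; if (u < 0) u = 0; if (u > range) u = range;
        int l = lo - c; if (l < 0) l = 0; if (l > range) l = range;
        if (u > udiff) udiff = u;
        if (l > ldiff) ldiff = l;
    }
    return (BYTE)(c - udiff + ldiff);    // DeringM: psubusb then paddusb
}
#endif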
#define DeringA(ldiff, udiff, center, p1, p2, reg1, reg2) \
__asm SSE3_MOVE reg2, p1 \
__asm SSE3_MOVE reg1, p2 \
__asm SSE_RMOVE ldiff, reg2 \
__asm SSE_RMOVE udiff, center \
__asm pmaxub reg2, reg1 \
__asm pminub ldiff, reg1 \
__asm psubusb udiff, reg2 \
__asm psubusb reg2, ldiff \
__asm psubusb ldiff, center \
__asm pminub udiff, reg2 \
__asm pminub ldiff, reg2
#define DeringA_w(ldiff, udiff, center, p1, p2, mwrite, wmem, reg1, reg2) \
__asm SSE3_MOVE reg2, p1 \
__asm SSE3_MOVE reg1, p2 \
__asm SSE_RMOVE ldiff, reg2 \
__asm SSE_RMOVE udiff, center \
__asm pmaxub reg2, reg1 \
__asm pminub ldiff, reg1 \
__asm psubusb udiff, reg2 \
__asm mwrite wmem, reg1 \
__asm psubusb reg2, ldiff \
__asm psubusb ldiff, center \
__asm pminub udiff, reg2 \
__asm pminub ldiff, reg2
#define DeringM(ldiff, udiff, center, reg1) \
__asm psubusb center, udiff \
__asm paddusb center, ldiff
void SSE_RemoveGrain23(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
__asm mov eax, hblocks
__asm mov ebx, spitch
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
DeringA(SSE1, SSE2, SSE0, [esi], [esi + 2*ebx + 2], SSE5, SSE6)
DeringA(SSE3, SSE4, SSE0, [esi + 2], [esi + 2*ebx], SSE5, SSE6)
__asm pmaxub SSE2, SSE4
__asm pmaxub SSE1, SSE3
DeringA_w(SSE3, SSE4, SSE0, [esi + ebx + 2], [esi + ebx], movd, [edi], SSE5, SSE6)
__asm pmaxub SSE2, SSE4
__asm pmaxub SSE1, SSE3
DeringA(SSE3, SSE4, SSE0, [esi + 1], [esi + 2*ebx + 1], SSE5, SSE6)
__asm pmaxub SSE2, SSE4
__asm pmaxub SSE1, SSE3
DeringM(SSE1, SSE2, SSE0, SSE5)
__asm SSE_MOVE [edi + 1], SSE0
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
DeringA(SSE1, SSE2, SSE0, [esi], [esi + 2*ebx + 2], SSE5, SSE6)
DeringA(SSE3, SSE4, SSE0, [esi + 2], [esi + 2*ebx], SSE5, SSE6)
__asm pmaxub SSE2, SSE4
__asm pmaxub SSE1, SSE3
DeringA(SSE3, SSE4, SSE0, [esi + ebx], [esi + ebx + 2], SSE5, SSE6)
__asm pmaxub SSE2, SSE4
__asm pmaxub SSE1, SSE3
DeringA(SSE3, SSE4, SSE0, [esi + 1], [esi + 2*ebx + 1], SSE5, SSE6)
__asm pmaxub SSE2, SSE4
__asm pmaxub SSE1, SSE3
DeringM(SSE1, SSE2, SSE0, SSE5)
__asm SSE_MOVE [edi], SSE0
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
DeringA(SSE1, SSE2, SSE0, [esi], [esi + 2*ebx + 2], SSE5, SSE6)
DeringA(SSE3, SSE4, SSE0, [esi + 2], [esi + 2*ebx], SSE5, SSE6)
__asm pmaxub SSE2, SSE4
__asm pmaxub SSE1, SSE3
DeringA_w(SSE3, SSE4, SSE0, [esi + ebx], [esi + ebx + 2], SSE_MOVE, [edi + 1], SSE5, SSE6)
__asm pmaxub SSE2, SSE4
__asm pmaxub SSE1, SSE3
DeringA(SSE3, SSE4, SSE0, [esi + 1], [esi + 2*ebx + 1], SSE5, SSE6)
__asm pmaxub SSE2, SSE4
__asm pmaxub SSE1, SSE3
DeringM(SSE1, SSE2, SSE0, SSE5)
__asm SSE_MOVE [edi], SSE0
__asm add esi, eax
__asm add edi, dpitch
__asm dec height
__asm jnz column_loop
}
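// DeringC is the mode 24 variant of DeringA: each correction d is additionally
// limited to min(d, spread - d) of the pair, so the centre is pulled back more
// cautiously the further it already overshoots the pair's range.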
#define DeringC(ldiff, udiff, center, p1, p2, reg1, reg2) \
__asm SSE3_MOVE reg2, p1 \
__asm SSE3_MOVE reg1, p2 \
__asm SSE_RMOVE ldiff, reg2 \
__asm SSE_RMOVE udiff, center \
__asm pmaxub reg2, reg1 \
__asm pminub ldiff, reg1 \
__asm psubusb udiff, reg2 \
__asm psubusb reg2, ldiff \
__asm psubusb ldiff, center \
__asm SSE_RMOVE reg1, reg2 \
__asm psubusb reg2, udiff \
__asm psubusb reg1, ldiff \
__asm pminub udiff, reg2 \
__asm pminub ldiff, reg1
#define DeringC_w(ldiff, udiff, center, p1, p2, mwrite, wmem, reg1, reg2) \
__asm SSE3_MOVE reg2, p1 \
__asm SSE3_MOVE reg1, p2 \
__asm SSE_RMOVE ldiff, reg2 \
__asm SSE_RMOVE udiff, center \
__asm pmaxub reg2, reg1 \
__asm pminub ldiff, reg1 \
__asm psubusb udiff, reg2 \
__asm mwrite wmem, reg1 \
__asm psubusb reg2, ldiff \
__asm psubusb ldiff, center \
__asm SSE_RMOVE reg1, reg2 \
__asm psubusb reg2, udiff \
__asm psubusb reg1, ldiff \
__asm pminub udiff, reg2 \
__asm pminub ldiff, reg1
void SSE_RemoveGrain24(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
__asm mov eax, hblocks
__asm mov ebx, spitch
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
DeringC(SSE1, SSE2, SSE0, [esi], [esi + 2*ebx + 2], SSE5, SSE6)
DeringC(SSE3, SSE4, SSE0, [esi + 2], [esi + 2*ebx], SSE5, SSE6)
__asm pmaxub SSE2, SSE4
__asm pmaxub SSE1, SSE3
DeringC_w(SSE3, SSE4, SSE0, [esi + ebx + 2], [esi + ebx], movd, [edi], SSE5, SSE6)
__asm pmaxub SSE2, SSE4
__asm pmaxub SSE1, SSE3
DeringC(SSE3, SSE4, SSE0, [esi + 1], [esi + 2*ebx + 1], SSE5, SSE6)
__asm pmaxub SSE2, SSE4
__asm pmaxub SSE1, SSE3
DeringM(SSE1, SSE2, SSE0, SSE5)
__asm SSE_MOVE [edi + 1], SSE0
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
DeringC(SSE1, SSE2, SSE0, [esi], [esi + 2*ebx + 2], SSE5, SSE6)
DeringC(SSE3, SSE4, SSE0, [esi + 2], [esi + 2*ebx], SSE5, SSE6)
__asm pmaxub SSE2, SSE4
__asm pmaxub SSE1, SSE3
DeringC(SSE3, SSE4, SSE0, [esi + ebx], [esi + ebx + 2], SSE5, SSE6)
__asm pmaxub SSE2, SSE4
__asm pmaxub SSE1, SSE3
DeringC(SSE3, SSE4, SSE0, [esi + 1], [esi + 2*ebx + 1], SSE5, SSE6)
__asm pmaxub SSE2, SSE4
__asm pmaxub SSE1, SSE3
DeringM(SSE1, SSE2, SSE0, SSE5)
__asm SSE_MOVE [edi], SSE0
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
DeringC(SSE1, SSE2, SSE0, [esi], [esi + 2*ebx + 2], SSE5, SSE6)
DeringC(SSE3, SSE4, SSE0, [esi + 2], [esi + 2*ebx], SSE5, SSE6)
__asm pmaxub SSE2, SSE4
__asm pmaxub SSE1, SSE3
DeringC_w(SSE3, SSE4, SSE0, [esi + ebx], [esi + ebx + 2], SSE_MOVE, [edi + 1], SSE5, SSE6)
__asm pmaxub SSE2, SSE4
__asm pmaxub SSE1, SSE3
DeringC(SSE3, SSE4, SSE0, [esi + 1], [esi + 2*ebx + 1], SSE5, SSE6)
__asm pmaxub SSE2, SSE4
__asm pmaxub SSE1, SSE3
DeringM(SSE1, SSE2, SSE0, SSE5)
__asm SSE_MOVE [edi], SSE0
__asm add esi, eax
__asm add edi, dpitch
__asm dec height
__asm jnz column_loop
}
#define get_min_weightw1(min, weight, mem1, mem2, wmem, reg) \
__asm SSE3_MOVE min, mem1 \
__asm SSE3_MOVE reg, mem2 \
__asm movd wmem, min \
__asm SSE_RMOVE weight, min \
__asm pminub min, reg \
__asm pmaxub weight, reg \
__asm psubusb weight, min
void WeirdBob(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
#ifdef CVERSION
int width = (hblocks + 2) * SSE_INCREMENT + remainder;
int spitch2 = 2*spitch - width - 1, dpitch2 = 2*dpitch - width - 1;
_sp -= spitch;
do
{
dp[0] = (BYTE)(((unsigned)_sp[0] + (unsigned)(dp[dpitch] = _sp[2*spitch]) + 1) / 2);
++dp;
int w = width;
do
{
unsigned weight1, min1, weight2, min2;
min1 = _sp[0];
weight1 = _sp[2*spitch + 2];
if( weight1 < min1 )
{
min1 = weight1;
weight1 = _sp[0];
}
weight1 -= min1;
min2 = _sp[2];
weight2 = _sp[2*spitch];
if( weight2 < min2 )
{
min2 = weight2;
weight2 = _sp[2];
}
weight2 -= min2;
if( weight2 <= weight1 )
{
weight1 = weight2;
min1 = min2;
}
++_sp;
min2 = _sp[0];
weight2 = dp[dpitch] = _sp[2*spitch];
if( weight2 < min2 )
{
min2 = weight2;
weight2 = _sp[0];
}
weight2 -= min2;
if( weight2 <= weight1 )
{
weight1 = weight2;
min1 = min2;
}
dp[0] = (BYTE) (min1 + (weight1 + 1)/2);
++dp;
} while( --w );
++_sp;
dp[0] = (BYTE)(((unsigned)_sp[0] + (unsigned)(dp[dpitch] = _sp[2*spitch]) + 1)/2);
dp += dpitch2;
_sp += spitch2;
} while( --height );
#else // CVERSION
__asm mov ecx, incpitch
__asm mov eax, dpitch
__asm lea ebx, [2*eax + ecx]
__asm mov edx, remainder
__asm mov dpitch, ebx
__asm mov esi, _sp
__asm mov ebx, spitch
__asm mov edi, dp
__asm sub esi, ebx
__asm add ebx, ebx
__asm add ecx, ebx
__asm mov spitch, ecx
__asm mov ecx, hblocks
__asm align 16
__asm column_loop:
get_min_weight(SSE0, SSE1, SSE7, [esi + ebx + 2], [esi], SSE7)
get_min_weightw(SSE2, SSE3, SSE7, [esi + 2], [esi + ebx], [edi + eax], SSE6)
__asm pavgb SSE6, SSE7
mergeweighted(SSE0, SSE1, SSE2, SSE3)
__asm movd [edi], SSE6
get_min_weight(SSE2, SSE3, SSE7, [esi + ebx + 1], [esi + 1], SSE7)
mergeweighted(SSE0, SSE1, SSE2, SSE3)
__asm paddusb SSE1, SSE0
__asm pavgb SSE0, SSE1
__asm SSE_MOVE [edi + 1], SSE0
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm mov ecx, hblocks
__asm add edi, SSE_INCREMENT
__asm align 16
__asm middle_loop:
get_min_weight(SSE0, SSE1, SSE7, [esi], [esi + ebx + 2], SSE7)
get_min_weightw(SSE2, SSE3, SSE7, [esi + 2], [esi + ebx], [edi + eax], SSE7)
mergeweighted(SSE0, SSE1, SSE2, SSE3)
get_min_weight(SSE2, SSE3, SSE7, [esi + 1], [esi + ebx + 1], SSE7)
mergeweighted(SSE0, SSE1, SSE2, SSE3)
__asm paddusb SSE1, SSE0
__asm pavgb SSE0, SSE1
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi + 1], SSE0
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
get_min_weightw(SSE0, SSE1, SSE7, [esi], [esi + ebx + 2], [edi + eax + 2], SSE7)
get_min_weightw1(SSE2, SSE3, [esi + ebx], [esi + 2], [edi + eax], SSE6)
__asm pavgb SSE6, SSE7
mergeweighted(SSE0, SSE1, SSE2, SSE3)
__asm SSE_MOVE [edi + 2], SSE6
get_min_weight(SSE2, SSE3, SSE7, [esi + ebx + 1], [esi + 1], SSE7)
mergeweighted(SSE0, SSE1, SSE2, SSE3)
__asm paddusb SSE1, SSE0
__asm pavgb SSE0, SSE1
__asm add esi, spitch
__asm SSE_MOVE [edi + 1], SSE0
__asm add edi, dpitch
__asm dec height
__asm jnz column_loop
#endif // CVERSION
}
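// bob_top / bob_bottom: field-parity entry points; bob_top copies the topmost source line
// unchanged and lets WeirdBob interpolate the rest, bob_bottom starts interpolating at once.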
void bob_top(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
memcpy(dp,_sp, (hblocks + 2) * SSE_INCREMENT + remainder + 2);
WeirdBob(dp + dpitch, dpitch, _sp + spitch, spitch, hblocks, remainder, incpitch, height / 2);
}
void bob_bottom(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
WeirdBob(dp, dpitch, _sp, spitch, hblocks, remainder, incpitch, (height + 1)/2);
}
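// get_min_weight_average: like get_min_weight, but also leaves the rounded-up pavgb
// average of mem1/mem2 in 'average'; min and weight receive the per-byte minimum and
// absolute difference as usual.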
#define get_min_weight_average(min, weight, average, mem1, mem2, reg) \
__asm SSE3_MOVE min, mem1 \
__asm SSE3_MOVE reg, mem2 \
__asm SSE_RMOVE weight, min \
__asm SSE_RMOVE average, min \
__asm pmaxub weight, reg \
__asm pminub min, reg \
__asm pavgb average, reg \
__asm psubusb weight, min
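// SmartBob: like WeirdBob, but the result is additionally pulled towards a weighted mean
// of all three candidate pairs (the vertical pair counted twice, (sum + 4)/8 in the
// CVERSION reference) and then clamped between the minimum and maximum of the closest
// pair; the asm path approximates the /8 with chained pavgb plus the fconvolution_bias
// correction.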
void SmartBob(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
#ifdef CVERSION
int width = (hblocks + 2) * SSE_INCREMENT + remainder;
int spitch2 = 2*spitch - width - 1, dpitch2 = 2*dpitch - width - 1;
_sp -= spitch;
do
{
dp[0] = (BYTE)(((unsigned)_sp[0] + (unsigned)(dp[dpitch] = _sp[2*spitch]) + 1) / 2);
++dp;
int w = width;
do
{
#if 1
unsigned weight1, min1, weight2, min2, average;
min1 = _sp[0];
weight1 = _sp[2*spitch + 2];
if( weight1 < min1 )
{
min1 = weight1;
weight1 = _sp[0];
}
average = weight1 + min1;
weight1 -= min1;
min2 = _sp[2];
weight2 = _sp[2*spitch];
if( weight2 < min2 )
{
min2 = weight2;
weight2 = _sp[2];
}
average += min2 + weight2;
weight2 -= min2;
if( weight2 <= weight1 )
{
weight1 = weight2;
min1 = min2;
}
++_sp;
min2 = _sp[0];
weight2 = dp[dpitch] = _sp[2*spitch];
if( weight2 < min2 )
{
min2 = weight2;
weight2 = _sp[0];
}
average += 2*(weight2 + min2);
weight2 -= min2;
if( weight2 <= weight1 )
{
weight1 = weight2;
min1 = min2;
}
average = (average + 4)/8;
weight1 += min1;
if( weight1 < average ) average = weight1;
else if( min1 > average ) average = min1;
dp[0] = (BYTE) average;
++dp;
#else
unsigned weight1, min1, weight2, min2, average;
min1 = _sp[0];
weight1 = _sp[2*spitch + 2];
if( weight1 < min1 )
{
min1 = weight1;
weight1 = _sp[0];
}
average = (weight1 + min1 + 1)/2;
weight1 -= min1;
min2 = _sp[2];
weight2 = _sp[2*spitch];
if( weight2 < min2 )
{
min2 = weight2;
weight2 = _sp[2];
}
average = (average + (min2 + weight2 + 1)/2 + 1)/2 - 1;
if( (int) average < 0 ) average = 0;
weight2 -= min2;
if( weight2 <= weight1 )
{
weight1 = weight2;
min1 = min2;
}
++_sp;
min2 = _sp[0];
weight2 = dp[dpitch] = _sp[2*spitch];
if( weight2 < min2 )
{
min2 = weight2;
weight2 = _sp[0];
}
average = (average + (weight2 + min2 + 1)/2 + 1)/2;
weight2 -= min2;
if( weight2 <= weight1 )
{
weight1 = weight2;
min1 = min2;
}
weight1 += min1;
if( weight1 < average ) average = weight1;
else if( min1 > average ) average = min1;
dp[0] = (BYTE) average;
++dp;
#endif
} while( --w );
++_sp;
dp[0] = (BYTE)(((unsigned)_sp[0] + (unsigned)(dp[dpitch] = _sp[2*spitch]) + 1)/2);
dp += dpitch2;
_sp += spitch2;
} while( --height );
#else // CVERSION
__asm mov ecx, incpitch
__asm mov eax, dpitch
__asm lea ebx, [2*eax + ecx]
__asm mov edx, remainder
__asm mov dpitch, ebx
__asm mov esi, _sp
__asm mov ebx, spitch
__asm mov edi, dp
__asm sub esi, ebx
__asm add ebx, ebx
//__asm pxor SSE6, SSE6
__asm SSE_RMOVE SSE6, fconvolution_bias
__asm add ecx, ebx
__asm mov spitch, ecx
__asm mov ecx, hblocks
__asm align 16
__asm column_loop:
get_min_weight_average(SSE0, SSE1, SSE2, [esi + ebx + 2], [esi], SSE7)
__asm SSE3_MOVE SSE4, [esi + ebx]
__asm SSE3_MOVE SSE5, [esi + 2]
__asm pavgb SSE7, SSE4
__asm SSE_RMOVE SSE3, SSE4
__asm movd [edi], SSE7
__asm SSE_RMOVE SSE7, SSE4
__asm pminub SSE3, SSE5
__asm SSE_MOVE [edi + eax], SSE7
__asm pmaxub SSE4, SSE5
__asm pavgb SSE5, SSE7
__asm psubusb SSE4, SSE3
__asm pavgb SSE2, SSE5
mergeweighted(SSE0, SSE1, SSE3, SSE4)
__asm psubusb SSE2, SSE6 // bias correction
get_min_weight_average(SSE3, SSE4, SSE5, [esi + ebx + 1], [esi + 1], SSE7)
__asm pavgb SSE2, SSE5
mergeweighted(SSE0, SSE1, SSE3, SSE4)
__asm pmaxub SSE2, SSE0
__asm paddusb SSE1, SSE0
__asm pminub SSE2, SSE1
__asm SSE_MOVE [edi + 1], SSE2
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm mov ecx, hblocks
__asm add edi, SSE_INCREMENT
__asm align 16
__asm middle_loop:
get_min_weight_average(SSE0, SSE1, SSE2, [esi + ebx + 2], [esi], SSE7)
get_min_weight_average(SSE3, SSE4, SSE5, [esi + 2], [esi + ebx], SSE7)
__asm SSE_MOVE [edi + eax], SSE7
__asm pavgb SSE2, SSE5
mergeweighted(SSE0, SSE1, SSE3, SSE4)
__asm psubusb SSE2, SSE6 // bias correction
get_min_weight_average(SSE3, SSE4, SSE5, [esi + 1], [esi + ebx + 1], SSE7)
__asm pavgb SSE2, SSE5
mergeweighted(SSE0, SSE1, SSE3, SSE4)
__asm pmaxub SSE2, SSE0
__asm paddusb SSE1, SSE0
__asm pminub SSE2, SSE1
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi + 1], SSE2
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
get_min_weight_average(SSE0, SSE1, SSE2, [esi], [esi + ebx + 2], SSE7)
__asm SSE_MOVE [edi + eax + 2], SSE7
__asm SSE3_MOVE SSE4, [esi + 2]
__asm SSE3_MOVE SSE5, [esi + ebx]
__asm pavgb SSE7, SSE4
__asm SSE_RMOVE SSE3, SSE4
__asm SSE_MOVE [edi + 2], SSE7
__asm SSE_RMOVE SSE7, SSE4
__asm pminub SSE3, SSE5
__asm movd [edi + eax], SSE5
__asm pmaxub SSE4, SSE5
__asm pavgb SSE5, SSE7
__asm psubusb SSE4, SSE3
__asm pavgb SSE2, SSE5
mergeweighted(SSE0, SSE1, SSE3, SSE4)
__asm psubusb SSE2, SSE6 // bias correction
get_min_weight_average(SSE3, SSE4, SSE5, [esi + ebx + 1], [esi + 1], SSE7)
__asm pavgb SSE2, SSE5
mergeweighted(SSE0, SSE1, SSE3, SSE4)
__asm pmaxub SSE2, SSE0
__asm paddusb SSE1, SSE0
__asm pminub SSE2, SSE1
__asm add esi, spitch
__asm SSE_MOVE [edi + 1], SSE2
__asm add edi, dpitch
__asm dec height
__asm jnz column_loop
#endif // CVERSION
}
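// smartbob_top / smartbob_bottom: field-parity wrappers for SmartBob, analogous to
// bob_top / bob_bottom above.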
void smartbob_top(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
memcpy(dp,_sp, (hblocks + 2) * SSE_INCREMENT + remainder + 2);
SmartBob(dp + dpitch, dpitch, _sp + spitch, spitch, hblocks, remainder, incpitch, height / 2);
}
void smartbob_bottom(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
SmartBob(dp, dpitch, _sp, spitch, hblocks, remainder, incpitch, (height + 1)/2);
}
#endif // #ifndef MODIFYPLUGIN
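// SmartRG: clips the centre pixel of each 3x3 window against its four opposite neighbour
// pairs. Reading from the address offsets below ([esi] is the pixel above-left of the one
// being written), the plain path per pixel is roughly
//   lower = max( min(tl,br), min(tr,bl), min(t,b), min(l,r) )   // largest pair minimum
//   upper = min( max(tl,br), max(tr,bl), max(t,b), max(l,r) )   // smallest pair maximum
//   out   = clamp( centre, min(lower,upper), max(lower,upper) )
// In the MODIFYPLUGIN (Repair) build the pixel loaded from [edi] is clamped instead and,
// for MODIFYPLUGIN > 0, the source centre pixel is folded into the bounds first; under
// SHLUR the final clamp is replaced by the sharpen() kernel.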
#ifdef SHARPEN
void SmartRG(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
#else
void SmartRG(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
#endif
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
#ifdef SHARPEN
__asm mov spitch, eax
__asm mov eax, strength
#endif
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2]
__asm SSE_RMOVE SSE1, SSE0
__asm pminub SSE0, SSE7
__asm pmaxub SSE1, SSE7
__asm SSE3_MOVE SSE4, [esi + 2]
__asm SSE3_MOVE SSE7, [esi + 2*ebx]
__asm SSE_RMOVE SSE5, SSE4
__asm SSE3_MOVE SSE2, [esi + 1]
__asm pminub SSE4, SSE7
__asm pmaxub SSE5, SSE7
__asm pmaxub SSE0, SSE4
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
__asm SSE_RMOVE SSE3, SSE2
__asm pminub SSE1, SSE5
__asm pminub SSE2, SSE7
__asm SSE3_MOVE SSE4, [esi + ebx]
__asm pmaxub SSE3, SSE7
__asm movd [edi], SSE4
__asm pmaxub SSE0, SSE2
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
__asm SSE_RMOVE SSE5, SSE4
__asm pminub SSE1, SSE3
__asm pminub SSE4, SSE7
__asm pmaxub SSE5, SSE7
__asm pmaxub SSE0, SSE4
__asm pminub SSE1, SSE5
__asm SSE_RMOVE SSE2, SSE0
#if (ISSE > 1) || defined(SHLUR)
__asm SSE3_MOVE SSE4, [esi + ebx + 1]
#endif
__asm pminub SSE0, SSE1
__asm pmaxub SSE2, SSE1
#ifdef SHLUR
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
__asm SSE_MOVE [edi + 1], SSE4
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE4
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
__asm pminub SSE0, SSE2
__asm SSE_MOVE [edi + 1], SSE0
#endif // SHLUR
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2]
__asm SSE_RMOVE SSE1, SSE0
__asm SSE3_MOVE SSE4, [esi + 2]
__asm pminub SSE0, SSE7
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
__asm SSE_RMOVE SSE5, SSE4
__asm pmaxub SSE1, SSE7
__asm pminub SSE4, SSE6
__asm SSE3_MOVE SSE2, [esi + 1]
__asm pmaxub SSE5, SSE6
__asm pmaxub SSE0, SSE4
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
__asm SSE_RMOVE SSE3, SSE2
__asm pminub SSE1, SSE5
__asm pminub SSE2, SSE7
__asm SSE3_MOVE SSE4, [esi + ebx]
__asm pmaxub SSE3, SSE7
__asm pmaxub SSE0, SSE2
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
__asm SSE_RMOVE SSE5, SSE4
__asm pminub SSE1, SSE3
__asm pminub SSE4, SSE7
__asm pmaxub SSE5, SSE7
__asm pmaxub SSE0, SSE4
__asm pminub SSE1, SSE5
__asm SSE_RMOVE SSE2, SSE0
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE4, [edi]
#else
__asm SSE3_MOVE SSE4, [esi + ebx + 1]
#endif
#endif
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
__asm pminub SSE0, SSE1
__asm pmaxub SSE2, SSE1
#if MODIFYPLUGIN > 0
__asm pminub SSE0, SSE5
__asm pmaxub SSE2, SSE5
#endif
#ifdef SHLUR
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE4
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE4
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE0, [edi]
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
#endif
__asm pminub SSE0, SSE2
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE0
#endif // SHLUR
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm SSE3_MOVE SSE0, [esi]
__asm add edi, edx
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2]
__asm SSE_RMOVE SSE1, SSE0
__asm SSE3_MOVE SSE4, [esi + 2]
__asm pminub SSE0, SSE7
__asm pmaxub SSE1, SSE7
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
__asm SSE_RMOVE SSE5, SSE4
__asm SSE3_MOVE SSE2, [esi + 1]
__asm pminub SSE4, SSE6
__asm pmaxub SSE5, SSE6
__asm pmaxub SSE0, SSE4
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
__asm SSE_RMOVE SSE3, SSE2
__asm pminub SSE1, SSE5
__asm pminub SSE2, SSE7
__asm SSE3_MOVE SSE4, [esi + ebx]
__asm pmaxub SSE3, SSE7
__asm pmaxub SSE0, SSE2
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
__asm SSE_RMOVE SSE5, SSE4
#ifndef MODIFYPLUGIN
__asm SSE_MOVE [edi + 1], SSE7
#endif
__asm pminub SSE1, SSE3
__asm pminub SSE4, SSE7
__asm pmaxub SSE5, SSE7
__asm pmaxub SSE0, SSE4
__asm pminub SSE1, SSE5
__asm SSE_RMOVE SSE2, SSE0
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE4, [edi]
#else
__asm SSE3_MOVE SSE4, [esi + ebx + 1]
#endif
#endif
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
__asm pminub SSE0, SSE1
__asm pmaxub SSE2, SSE1
#if MODIFYPLUGIN > 0
__asm pminub SSE0, SSE5
__asm pmaxub SSE2, SSE5
#endif
#ifdef SHLUR
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
#ifdef SHARPEN
__asm add esi, spitch
#else
__asm add esi, eax
#endif
__asm SSE_MOVE [edi], SSE4
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE4
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE0, [edi]
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
#endif
__asm pminub SSE0, SSE2
__asm add esi, eax
__asm SSE_MOVE [edi], SSE0
#endif // SHLUR
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
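// SmartRGC: same max-of-pair-minima / min-of-pair-maxima clipping as SmartRG, but over the
// eight pairs of neighbours that are adjacent on the ring around the centre (top-left/top,
// top/top-right, ..., left/top-left), as read from the load offsets below.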
#ifdef SHARPEN
void SmartRGC(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
#else
void SmartRGC(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
#endif
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
#ifdef SHARPEN
__asm mov spitch, eax
__asm mov eax, strength
#endif
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE7, [esi]
__asm SSE3_MOVE SSE6, [esi + 1]
__asm SSE_RMOVE SSE0, SSE7
__asm SSE_RMOVE SSE1, SSE6
__asm SSE3_MOVE SSE5, [esi + 2]
__asm pminub SSE0, SSE6
__asm SSE_RMOVE SSE2, SSE5
__asm pmaxub SSE1, SSE7
__asm pminub SSE2, SSE6
__asm SSE3_MOVE SSE4, [esi + ebx + 2]
__asm pmaxub SSE6, SSE5
__asm pmaxub SSE0, SSE2
__asm SSE_RMOVE SSE3, SSE4
__asm pminub SSE1, SSE6
__asm pminub SSE3, SSE5
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
__asm pmaxub SSE5, SSE4
__asm pmaxub SSE0, SSE3
__asm SSE_RMOVE SSE2, SSE6
__asm pminub SSE1, SSE5
__asm pminub SSE2, SSE4
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1]
__asm pmaxub SSE4, SSE6
__asm pmaxub SSE0, SSE2
__asm SSE_RMOVE SSE3, SSE5
__asm pminub SSE1, SSE4
__asm pminub SSE3, SSE6
__asm SSE3_MOVE SSE4, [esi + 2*ebx]
__asm pmaxub SSE6, SSE5
__asm pmaxub SSE0, SSE3
__asm SSE_RMOVE SSE2, SSE4
__asm pminub SSE1, SSE6
__asm pminub SSE2, SSE5
__asm SSE3_MOVE SSE6, [esi + ebx]
__asm pmaxub SSE5, SSE4
__asm pmaxub SSE0, SSE2
__asm SSE_RMOVE SSE3, SSE6
__asm pminub SSE1, SSE5
__asm movd [edi], SSE6
__asm pminub SSE3, SSE4
__asm SSE_RMOVE SSE2, SSE7
__asm pmaxub SSE4, SSE6
__asm pmaxub SSE0, SSE3
__asm pminub SSE1, SSE4
__asm pminub SSE2, SSE6
__asm pmaxub SSE7, SSE6
__asm pmaxub SSE0, SSE2
__asm pminub SSE1, SSE7
__asm SSE_RMOVE SSE2, SSE0
#if (ISSE > 1) || defined(SHLUR)
__asm SSE3_MOVE SSE4, [esi + ebx + 1]
#endif
__asm pminub SSE0, SSE1
__asm pmaxub SSE2, SSE1
#ifdef SHLUR
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
__asm SSE_MOVE [edi + 1], SSE4
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE4
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
__asm pminub SSE0, SSE2
__asm SSE_MOVE [edi + 1], SSE0
#endif // SHLUR
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE7, [esi]
__asm SSE3_MOVE SSE6, [esi + 1]
__asm SSE_RMOVE SSE0, SSE7
__asm SSE_RMOVE SSE1, SSE6
__asm SSE3_MOVE SSE5, [esi + 2]
__asm pminub SSE0, SSE6
__asm SSE_RMOVE SSE2, SSE5
__asm pmaxub SSE1, SSE7
__asm pminub SSE2, SSE6
__asm SSE3_MOVE SSE4, [esi + ebx + 2]
__asm pmaxub SSE6, SSE5
__asm pmaxub SSE0, SSE2
__asm SSE_RMOVE SSE3, SSE4
__asm pminub SSE1, SSE6
__asm pminub SSE3, SSE5
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
__asm pmaxub SSE5, SSE4
__asm pmaxub SSE0, SSE3
__asm SSE_RMOVE SSE2, SSE6
__asm pminub SSE1, SSE5
__asm pminub SSE2, SSE4
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1]
__asm pmaxub SSE4, SSE6
__asm pmaxub SSE0, SSE2
__asm SSE_RMOVE SSE3, SSE5
__asm pminub SSE1, SSE4
__asm pminub SSE3, SSE6
__asm SSE3_MOVE SSE4, [esi + 2*ebx]
__asm pmaxub SSE6, SSE5
__asm pmaxub SSE0, SSE3
__asm SSE_RMOVE SSE2, SSE4
__asm pminub SSE1, SSE6
__asm pminub SSE2, SSE5
__asm SSE3_MOVE SSE6, [esi + ebx]
__asm pmaxub SSE5, SSE4
__asm pmaxub SSE0, SSE2
__asm SSE_RMOVE SSE3, SSE6
__asm pminub SSE1, SSE5
__asm pminub SSE3, SSE4
__asm SSE_RMOVE SSE2, SSE7
__asm pmaxub SSE4, SSE6
__asm pmaxub SSE0, SSE3
__asm pminub SSE1, SSE4
__asm pminub SSE2, SSE6
__asm pmaxub SSE7, SSE6
__asm pmaxub SSE0, SSE2
__asm pminub SSE1, SSE7
__asm SSE_RMOVE SSE2, SSE0
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE4, [edi]
#else
__asm SSE3_MOVE SSE4, [esi + ebx + 1]
#endif
#endif
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
__asm pminub SSE0, SSE1
__asm pmaxub SSE2, SSE1
#if MODIFYPLUGIN > 0
__asm pminub SSE0, SSE5
__asm pmaxub SSE2, SSE5
#endif
#ifdef SHLUR
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE4
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE4
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE0, [edi]
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
#endif
__asm pminub SSE0, SSE2
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE0
#endif // SHLUR
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm SSE3_MOVE SSE7, [esi]
__asm add edi, edx
__asm SSE3_MOVE SSE6, [esi + 1]
__asm SSE_RMOVE SSE0, SSE7
__asm SSE_RMOVE SSE1, SSE6
__asm SSE3_MOVE SSE5, [esi + 2]
__asm pminub SSE0, SSE6
__asm SSE_RMOVE SSE2, SSE5
__asm pmaxub SSE1, SSE7
__asm pminub SSE2, SSE6
__asm SSE3_MOVE SSE4, [esi + ebx + 2]
__asm pmaxub SSE6, SSE5
__asm pmaxub SSE0, SSE2
__asm SSE_RMOVE SSE3, SSE4
__asm pminub SSE1, SSE6
__asm SSE_MOVE [edi + 1], SSE4
__asm pminub SSE3, SSE5
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
__asm pmaxub SSE5, SSE4
__asm pmaxub SSE0, SSE3
__asm SSE_RMOVE SSE2, SSE6
__asm pminub SSE1, SSE5
__asm pminub SSE2, SSE4
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1]
__asm pmaxub SSE4, SSE6
__asm pmaxub SSE0, SSE2
__asm SSE_RMOVE SSE3, SSE5
__asm pminub SSE1, SSE4
__asm pminub SSE3, SSE6
__asm SSE3_MOVE SSE4, [esi + 2*ebx]
__asm pmaxub SSE6, SSE5
__asm pmaxub SSE0, SSE3
__asm SSE_RMOVE SSE2, SSE4
__asm pminub SSE1, SSE6
__asm pminub SSE2, SSE5
__asm SSE3_MOVE SSE6, [esi + ebx]
__asm pmaxub SSE5, SSE4
__asm pmaxub SSE0, SSE2
__asm SSE_RMOVE SSE3, SSE6
__asm pminub SSE1, SSE5
__asm pminub SSE3, SSE4
__asm SSE_RMOVE SSE2, SSE7
__asm pmaxub SSE4, SSE6
__asm pmaxub SSE0, SSE3
__asm pminub SSE1, SSE4
__asm pminub SSE2, SSE6
__asm pmaxub SSE7, SSE6
__asm pmaxub SSE0, SSE2
__asm pminub SSE1, SSE7
__asm SSE_RMOVE SSE2, SSE0
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE4, [edi]
#else
__asm SSE3_MOVE SSE4, [esi + ebx + 1]
#endif
#endif
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
__asm pminub SSE0, SSE1
__asm pmaxub SSE2, SSE1
#if MODIFYPLUGIN > 0
__asm pminub SSE0, SSE5
__asm pmaxub SSE2, SSE5
#endif
#ifdef SHLUR
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
#ifdef SHARPEN
__asm add esi, spitch
#else
__asm add esi, eax
#endif
__asm SSE_MOVE [edi], SSE4
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE4
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE0, [edi]
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
#endif
__asm pminub SSE0, SSE2
__asm add esi, eax
__asm SSE_MOVE [edi], SSE0
#endif // SHLUR
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
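// SmartRGCL: folds in all twelve neighbour pairs - the four opposite pairs used by SmartRG
// plus the eight adjacent ring pairs used by SmartRGC - before the usual clamp of the
// centre pixel (a reading of the load offsets).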
#ifdef SHARPEN
void SmartRGCL(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
#else
void SmartRGCL(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
#endif
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
#ifdef SHARPEN
__asm mov spitch, eax
__asm mov eax, strength
#endif
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE4, [esi]
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 2]
__asm SSE_RMOVE SSE0, SSE4
__asm SSE_RMOVE SSE1, SSE5
__asm SSE3_MOVE SSE6, [esi + 1]
__asm pminub SSE0, SSE5
__asm SSE_RMOVE SSE2, SSE6
__asm pmaxub SSE1, SSE4
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
__asm pminub SSE2, SSE4
__asm SSE_RMOVE SSE3, SSE7
__asm pmaxub SSE4, SSE6
__asm pminub SSE3, SSE5
__asm pmaxub SSE0, SSE2
__asm pmaxub SSE5, SSE7
__asm pminub SSE1, SSE4
__asm pmaxub SSE0, SSE3
__asm pminub SSE1, SSE5
__asm SSE3_MOVE SSE4, [esi + 2]
__asm SSE_RMOVE SSE2, SSE6
__asm SSE_RMOVE SSE3, SSE7
__asm SSE_RMOVE SSE5, SSE4
__asm pminub SSE2, SSE7
__asm pmaxub SSE3, SSE6
__asm pmaxub SSE0, SSE2
__asm pminub SSE5, SSE6
__asm pminub SSE1, SSE3
__asm pmaxub SSE6, SSE4
__asm pmaxub SSE0, SSE5
__asm SSE3_MOVE SSE5, [esi + 2*ebx]
__asm pminub SSE1, SSE6
__asm SSE_RMOVE SSE2, SSE5
__asm pminub SSE5, SSE7
__asm pmaxub SSE7, SSE2
__asm pmaxub SSE0, SSE5
__asm pminub SSE1, SSE7
__asm SSE_RMOVE SSE5, SSE2
__asm SSE_RMOVE SSE3, SSE4
__asm SSE3_MOVE SSE6, [esi + ebx + 2]
__asm pminub SSE2, SSE4
__asm pmaxub SSE3, SSE5
__asm SSE_RMOVE SSE7, SSE6
__asm pmaxub SSE0, SSE2
__asm pminub SSE7, SSE4
__asm SSE3_MOVE SSE2, [esi + ebx]
__asm pminub SSE1, SSE3
__asm pmaxub SSE4, SSE6
__asm SSE_RMOVE SSE3, SSE2
__asm pmaxub SSE0, SSE7
__asm movd [edi], SSE2
__asm pminub SSE3, SSE5
__asm pminub SSE1, SSE4
__asm pmaxub SSE5, SSE2
__asm pmaxub SSE0, SSE3
__asm pminub SSE1, SSE5
__asm SSE_RMOVE SSE7, SSE2
__asm SSE_RMOVE SSE3, SSE6
__asm SSE3_MOVE SSE4, [esi + 2*ebx + 2]
__asm pminub SSE2, SSE6
__asm pmaxub SSE3, SSE7
__asm SSE_RMOVE SSE5, SSE4
__asm pmaxub SSE0, SSE2
__asm pminub SSE5, SSE6
__asm SSE3_MOVE SSE2, [esi]
__asm pminub SSE1, SSE3
__asm pmaxub SSE6, SSE4
__asm pmaxub SSE0, SSE5
__asm SSE_RMOVE SSE3, SSE2
__asm pminub SSE1, SSE6
__asm pminub SSE2, SSE7
__asm pmaxub SSE3, SSE7
__asm pmaxub SSE0, SSE2
__asm pminub SSE1, SSE3
__asm SSE_RMOVE SSE2, SSE0
#if (ISSE > 1) || defined(SHLUR)
__asm SSE3_MOVE SSE4, [esi + ebx + 1]
#endif
__asm pminub SSE0, SSE1
__asm pmaxub SSE2, SSE1
#ifdef SHLUR
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
__asm SSE_MOVE [edi + 1], SSE4
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE4
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
__asm pminub SSE0, SSE2
__asm SSE_MOVE [edi + 1], SSE0
#endif // SHLUR
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE4, [esi]
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 2]
__asm SSE_RMOVE SSE0, SSE4
__asm SSE_RMOVE SSE1, SSE5
__asm SSE3_MOVE SSE6, [esi + 1]
__asm pminub SSE0, SSE5
__asm SSE_RMOVE SSE2, SSE6
__asm pmaxub SSE1, SSE4
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
__asm pminub SSE2, SSE4
__asm SSE_RMOVE SSE3, SSE7
__asm pmaxub SSE4, SSE6
__asm pminub SSE3, SSE5
__asm pmaxub SSE0, SSE2
__asm pmaxub SSE5, SSE7
__asm pminub SSE1, SSE4
__asm pmaxub SSE0, SSE3
__asm pminub SSE1, SSE5
__asm SSE3_MOVE SSE4, [esi + 2]
__asm SSE_RMOVE SSE2, SSE6
__asm SSE_RMOVE SSE3, SSE7
__asm SSE_RMOVE SSE5, SSE4
__asm pminub SSE2, SSE7
__asm pmaxub SSE3, SSE6
__asm pmaxub SSE0, SSE2
__asm pminub SSE5, SSE6
__asm pminub SSE1, SSE3
__asm pmaxub SSE6, SSE4
__asm pmaxub SSE0, SSE5
__asm SSE3_MOVE SSE5, [esi + 2*ebx]
__asm pminub SSE1, SSE6
__asm SSE_RMOVE SSE2, SSE5
__asm pminub SSE5, SSE7
__asm pmaxub SSE7, SSE2
__asm pmaxub SSE0, SSE5
__asm pminub SSE1, SSE7
__asm SSE_RMOVE SSE5, SSE2
__asm SSE_RMOVE SSE3, SSE4
__asm SSE3_MOVE SSE6, [esi + ebx + 2]
__asm pminub SSE2, SSE4
__asm pmaxub SSE3, SSE5
__asm SSE_RMOVE SSE7, SSE6
__asm pmaxub SSE0, SSE2
__asm pminub SSE7, SSE4
__asm SSE3_MOVE SSE2, [esi + ebx]
__asm pminub SSE1, SSE3
__asm pmaxub SSE4, SSE6
__asm SSE_RMOVE SSE3, SSE2
__asm pmaxub SSE0, SSE7
__asm pminub SSE3, SSE5
__asm pminub SSE1, SSE4
__asm pmaxub SSE5, SSE2
__asm pmaxub SSE0, SSE3
__asm pminub SSE1, SSE5
__asm SSE_RMOVE SSE7, SSE2
__asm SSE_RMOVE SSE3, SSE6
__asm SSE3_MOVE SSE4, [esi + 2*ebx + 2]
__asm pminub SSE2, SSE6
__asm pmaxub SSE3, SSE7
__asm SSE_RMOVE SSE5, SSE4
__asm pmaxub SSE0, SSE2
__asm pminub SSE5, SSE6
__asm SSE3_MOVE SSE2, [esi]
__asm pminub SSE1, SSE3
__asm pmaxub SSE6, SSE4
__asm pmaxub SSE0, SSE5
__asm SSE_RMOVE SSE3, SSE2
__asm pminub SSE1, SSE6
__asm pminub SSE2, SSE7
__asm pmaxub SSE3, SSE7
__asm pmaxub SSE0, SSE2
__asm pminub SSE1, SSE3
__asm SSE_RMOVE SSE2, SSE0
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE4, [edi]
#else
__asm SSE3_MOVE SSE4, [esi + ebx + 1]
#endif
#endif
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
__asm pminub SSE0, SSE1
__asm pmaxub SSE2, SSE1
#if MODIFYPLUGIN > 0
__asm pminub SSE0, SSE5
__asm pmaxub SSE2, SSE5
#endif
#ifdef SHLUR
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE4
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE4
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE0, [edi]
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
#endif
__asm pminub SSE0, SSE2
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE0
#endif // SHLUR
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm SSE3_MOVE SSE4, [esi]
__asm add edi, edx
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 2]
__asm SSE_RMOVE SSE0, SSE4
__asm SSE_RMOVE SSE1, SSE5
__asm SSE3_MOVE SSE6, [esi + 1]
__asm pminub SSE0, SSE5
__asm SSE_RMOVE SSE2, SSE6
__asm pmaxub SSE1, SSE4
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
__asm pminub SSE2, SSE4
__asm SSE_RMOVE SSE3, SSE7
__asm pmaxub SSE4, SSE6
__asm pminub SSE3, SSE5
__asm pmaxub SSE0, SSE2
__asm pmaxub SSE5, SSE7
__asm pminub SSE1, SSE4
__asm pmaxub SSE0, SSE3
__asm pminub SSE1, SSE5
__asm SSE3_MOVE SSE4, [esi + 2]
__asm SSE_RMOVE SSE2, SSE6
__asm SSE_RMOVE SSE3, SSE7
__asm SSE_RMOVE SSE5, SSE4
__asm pminub SSE2, SSE7
__asm pmaxub SSE3, SSE6
__asm pmaxub SSE0, SSE2
__asm pminub SSE5, SSE6
__asm pminub SSE1, SSE3
__asm pmaxub SSE6, SSE4
__asm pmaxub SSE0, SSE5
__asm SSE3_MOVE SSE5, [esi + 2*ebx]
__asm pminub SSE1, SSE6
__asm SSE_RMOVE SSE2, SSE5
__asm pminub SSE5, SSE7
__asm pmaxub SSE7, SSE2
__asm pmaxub SSE0, SSE5
__asm pminub SSE1, SSE7
__asm SSE_RMOVE SSE5, SSE2
__asm SSE_RMOVE SSE3, SSE4
__asm SSE3_MOVE SSE6, [esi + ebx + 2]
__asm pminub SSE2, SSE4
__asm pmaxub SSE3, SSE5
__asm SSE_RMOVE SSE7, SSE6
__asm pmaxub SSE0, SSE2
__asm SSE_MOVE [edi + 1], SSE6
__asm pminub SSE7, SSE4
__asm SSE3_MOVE SSE2, [esi + ebx]
__asm pminub SSE1, SSE3
__asm pmaxub SSE4, SSE6
__asm SSE_RMOVE SSE3, SSE2
__asm pmaxub SSE0, SSE7
__asm pminub SSE3, SSE5
__asm pminub SSE1, SSE4
__asm pmaxub SSE5, SSE2
__asm pmaxub SSE0, SSE3
__asm pminub SSE1, SSE5
__asm SSE_RMOVE SSE7, SSE2
__asm SSE_RMOVE SSE3, SSE6
__asm SSE3_MOVE SSE4, [esi + 2*ebx + 2]
__asm pminub SSE2, SSE6
__asm pmaxub SSE3, SSE7
__asm SSE_RMOVE SSE5, SSE4
__asm pmaxub SSE0, SSE2
__asm pminub SSE5, SSE6
__asm SSE3_MOVE SSE2, [esi]
__asm pminub SSE1, SSE3
__asm pmaxub SSE6, SSE4
__asm pmaxub SSE0, SSE5
__asm SSE_RMOVE SSE3, SSE2
__asm pminub SSE1, SSE6
__asm pminub SSE2, SSE7
__asm pmaxub SSE3, SSE7
__asm pmaxub SSE0, SSE2
__asm pminub SSE1, SSE3
__asm SSE_RMOVE SSE2, SSE0
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE4, [edi]
#else
__asm SSE3_MOVE SSE4, [esi + ebx + 1]
#endif
#endif
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
__asm pminub SSE0, SSE1
__asm pmaxub SSE2, SSE1
#if MODIFYPLUGIN > 0
__asm pminub SSE0, SSE5
__asm pmaxub SSE2, SSE5
#endif
#ifdef SHLUR
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
#ifdef SHARPEN
__asm add esi, spitch
#else
__asm add esi, eax
#endif
__asm SSE_MOVE [edi], SSE4
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE4
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE0, [edi]
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
#endif
__asm pminub SSE0, SSE2
__asm add esi, eax
__asm SSE_MOVE [edi], SSE0
#endif // SHLUR
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
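// SmartRGCL2: appears to compute the same twelve-pair bounds as SmartRGCL, but as one pass
// over the eight adjacent ring pairs followed by one over the four opposite pairs,
// re-loading the window for the second pass.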
#ifdef SHARPEN
void SmartRGCL2(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
#else
void SmartRGCL2(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
#endif
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
#ifdef SHARPEN
__asm mov spitch, eax
__asm mov eax, strength
#endif
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE7, [esi]
__asm SSE3_MOVE SSE6, [esi + 1]
__asm SSE_RMOVE SSE0, SSE7
__asm SSE_RMOVE SSE1, SSE6
__asm SSE3_MOVE SSE5, [esi + 2]
__asm pminub SSE0, SSE6
__asm SSE_RMOVE SSE2, SSE5
__asm pmaxub SSE1, SSE7
__asm pminub SSE2, SSE6
__asm SSE3_MOVE SSE4, [esi + ebx + 2]
__asm pmaxub SSE6, SSE5
__asm pmaxub SSE0, SSE2
__asm SSE_RMOVE SSE3, SSE4
__asm pminub SSE1, SSE6
__asm pminub SSE3, SSE5
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
__asm pmaxub SSE5, SSE4
__asm pmaxub SSE0, SSE3
__asm SSE_RMOVE SSE2, SSE6
__asm pminub SSE1, SSE5
__asm pminub SSE2, SSE4
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1]
__asm pmaxub SSE4, SSE6
__asm pmaxub SSE0, SSE2
__asm SSE_RMOVE SSE3, SSE5
__asm pminub SSE1, SSE4
__asm pminub SSE3, SSE6
__asm SSE3_MOVE SSE4, [esi + 2*ebx]
__asm pmaxub SSE6, SSE5
__asm pmaxub SSE0, SSE3
__asm SSE_RMOVE SSE2, SSE4
__asm pminub SSE1, SSE6
__asm pminub SSE2, SSE5
__asm SSE3_MOVE SSE6, [esi + ebx]
__asm pmaxub SSE5, SSE4
__asm pmaxub SSE0, SSE2
__asm SSE_RMOVE SSE3, SSE6
__asm pminub SSE1, SSE5
__asm movd [edi], SSE6
__asm pminub SSE3, SSE4
__asm SSE_RMOVE SSE2, SSE7
__asm pmaxub SSE4, SSE6
__asm pmaxub SSE0, SSE3
__asm pminub SSE1, SSE4
__asm pminub SSE2, SSE6
__asm pmaxub SSE7, SSE6
__asm pmaxub SSE0, SSE2
__asm pminub SSE1, SSE7
__asm SSE3_MOVE SSE2, [esi]
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2]
__asm SSE_RMOVE SSE3, SSE2
__asm SSE3_MOVE SSE4, [esi + 2]
__asm pminub SSE2, SSE7
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
__asm SSE_RMOVE SSE5, SSE4
__asm pmaxub SSE3, SSE7
__asm pmaxub SSE0, SSE2
__asm pminub SSE1, SSE3
__asm pminub SSE4, SSE6
__asm SSE3_MOVE SSE2, [esi + 1]
__asm pmaxub SSE5, SSE6
__asm pmaxub SSE0, SSE4
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
__asm SSE_RMOVE SSE3, SSE2
__asm pminub SSE1, SSE5
__asm pminub SSE2, SSE7
__asm SSE3_MOVE SSE4, [esi + ebx]
__asm pmaxub SSE3, SSE7
__asm pmaxub SSE0, SSE2
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
__asm SSE_RMOVE SSE5, SSE4
__asm pminub SSE1, SSE3
__asm pminub SSE4, SSE7
__asm pmaxub SSE5, SSE7
__asm pmaxub SSE0, SSE4
__asm pminub SSE1, SSE5
__asm SSE_RMOVE SSE2, SSE0
#if (ISSE > 1) || defined(SHLUR)
__asm SSE3_MOVE SSE4, [esi + ebx + 1]
#endif
__asm pminub SSE0, SSE1
__asm pmaxub SSE2, SSE1
#ifdef SHLUR
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
__asm SSE_MOVE [edi + 1], SSE4
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE4
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
__asm pminub SSE0, SSE2
__asm SSE_MOVE [edi + 1], SSE0
#endif // SHLUR
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE7, [esi]
__asm SSE3_MOVE SSE6, [esi + 1]
__asm SSE_RMOVE SSE0, SSE7
__asm SSE_RMOVE SSE1, SSE6
__asm SSE3_MOVE SSE5, [esi + 2]
__asm pminub SSE0, SSE6
__asm SSE_RMOVE SSE2, SSE5
__asm pmaxub SSE1, SSE7
__asm pminub SSE2, SSE6
__asm SSE3_MOVE SSE4, [esi + ebx + 2]
__asm pmaxub SSE6, SSE5
__asm pmaxub SSE0, SSE2
__asm SSE_RMOVE SSE3, SSE4
__asm pminub SSE1, SSE6
__asm pminub SSE3, SSE5
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
__asm pmaxub SSE5, SSE4
__asm pmaxub SSE0, SSE3
__asm SSE_RMOVE SSE2, SSE6
__asm pminub SSE1, SSE5
__asm pminub SSE2, SSE4
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1]
__asm pmaxub SSE4, SSE6
__asm pmaxub SSE0, SSE2
__asm SSE_RMOVE SSE3, SSE5
__asm pminub SSE1, SSE4
__asm pminub SSE3, SSE6
__asm SSE3_MOVE SSE4, [esi + 2*ebx]
__asm pmaxub SSE6, SSE5
__asm pmaxub SSE0, SSE3
__asm SSE_RMOVE SSE2, SSE4
__asm pminub SSE1, SSE6
__asm pminub SSE2, SSE5
__asm SSE3_MOVE SSE6, [esi + ebx]
__asm pmaxub SSE5, SSE4
__asm pmaxub SSE0, SSE2
__asm SSE_RMOVE SSE3, SSE6
__asm pminub SSE1, SSE5
__asm pminub SSE3, SSE4
__asm SSE_RMOVE SSE2, SSE7
__asm pmaxub SSE4, SSE6
__asm pmaxub SSE0, SSE3
__asm pminub SSE1, SSE4
__asm pminub SSE2, SSE6
__asm pmaxub SSE7, SSE6
__asm pmaxub SSE0, SSE2
__asm pminub SSE1, SSE7
__asm SSE3_MOVE SSE2, [esi]
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2]
__asm SSE_RMOVE SSE3, SSE2
__asm SSE3_MOVE SSE4, [esi + 2]
__asm pminub SSE2, SSE7
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
__asm SSE_RMOVE SSE5, SSE4
__asm pmaxub SSE3, SSE7
__asm pmaxub SSE0, SSE2
__asm pminub SSE1, SSE3
__asm pminub SSE4, SSE6
__asm SSE3_MOVE SSE2, [esi + 1]
__asm pmaxub SSE5, SSE6
__asm pmaxub SSE0, SSE4
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
__asm SSE_RMOVE SSE3, SSE2
__asm pminub SSE1, SSE5
__asm pminub SSE2, SSE7
__asm SSE3_MOVE SSE4, [esi + ebx]
__asm pmaxub SSE3, SSE7
__asm pmaxub SSE0, SSE2
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
__asm SSE_RMOVE SSE5, SSE4
__asm pminub SSE1, SSE3
__asm pminub SSE4, SSE7
__asm pmaxub SSE5, SSE7
__asm pmaxub SSE0, SSE4
__asm pminub SSE1, SSE5
__asm SSE_RMOVE SSE2, SSE0
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE4, [edi]
#else
__asm SSE3_MOVE SSE4, [esi + ebx + 1]
#endif
#endif
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
__asm pminub SSE0, SSE1
__asm pmaxub SSE2, SSE1
#if MODIFYPLUGIN > 0
__asm pminub SSE0, SSE5
__asm pmaxub SSE2, SSE5
#endif
#ifdef SHLUR
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE4
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE4
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE0, [edi]
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
#endif
__asm pminub SSE0, SSE2
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE0
#endif // SHLUR
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
__asm SSE3_MOVE SSE7, [esi]
__asm SSE3_MOVE SSE6, [esi + 1]
__asm SSE_RMOVE SSE0, SSE7
__asm SSE_RMOVE SSE1, SSE6
__asm SSE3_MOVE SSE5, [esi + 2]
__asm pminub SSE0, SSE6
__asm SSE_RMOVE SSE2, SSE5
__asm pmaxub SSE1, SSE7
__asm pminub SSE2, SSE6
__asm SSE3_MOVE SSE4, [esi + ebx + 2]
__asm pmaxub SSE6, SSE5
__asm pmaxub SSE0, SSE2
__asm SSE_RMOVE SSE3, SSE4
__asm pminub SSE1, SSE6
__asm pminub SSE3, SSE5
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
__asm pmaxub SSE5, SSE4
__asm pmaxub SSE0, SSE3
__asm SSE_RMOVE SSE2, SSE6
__asm pminub SSE1, SSE5
__asm pminub SSE2, SSE4
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1]
__asm pmaxub SSE4, SSE6
__asm pmaxub SSE0, SSE2
__asm SSE_RMOVE SSE3, SSE5
__asm pminub SSE1, SSE4
__asm pminub SSE3, SSE6
__asm SSE3_MOVE SSE4, [esi + 2*ebx]
__asm pmaxub SSE6, SSE5
__asm pmaxub SSE0, SSE3
__asm SSE_RMOVE SSE2, SSE4
__asm pminub SSE1, SSE6
__asm pminub SSE2, SSE5
__asm SSE3_MOVE SSE6, [esi + ebx]
__asm pmaxub SSE5, SSE4
__asm pmaxub SSE0, SSE2
__asm SSE_RMOVE SSE3, SSE6
__asm pminub SSE1, SSE5
__asm pminub SSE3, SSE4
__asm SSE_RMOVE SSE2, SSE7
__asm pmaxub SSE4, SSE6
__asm pmaxub SSE0, SSE3
__asm pminub SSE1, SSE4
__asm pminub SSE2, SSE6
__asm pmaxub SSE7, SSE6
__asm pmaxub SSE0, SSE2
__asm pminub SSE1, SSE7
__asm SSE3_MOVE SSE2, [esi]
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2]
__asm SSE_RMOVE SSE3, SSE2
__asm SSE3_MOVE SSE4, [esi + 2]
__asm pminub SSE2, SSE7
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
__asm SSE_RMOVE SSE5, SSE4
__asm pmaxub SSE3, SSE7
__asm pmaxub SSE0, SSE2
__asm pminub SSE1, SSE3
__asm pminub SSE4, SSE6
__asm SSE3_MOVE SSE2, [esi + 1]
__asm pmaxub SSE5, SSE6
__asm pmaxub SSE0, SSE4
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
__asm SSE_RMOVE SSE3, SSE2
__asm pminub SSE1, SSE5
__asm pminub SSE2, SSE7
__asm SSE3_MOVE SSE4, [esi + ebx]
__asm pmaxub SSE3, SSE7
__asm pmaxub SSE0, SSE2
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
__asm SSE_RMOVE SSE5, SSE4
__asm pminub SSE1, SSE3
__asm pminub SSE4, SSE7
__asm pmaxub SSE5, SSE7
__asm pmaxub SSE0, SSE4
__asm SSE_MOVE [edi + 1], SSE7
__asm pminub SSE1, SSE5
__asm SSE_RMOVE SSE2, SSE0
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE4, [edi]
#else
__asm SSE3_MOVE SSE4, [esi + ebx + 1]
#endif
#endif
#if MODIFYPLUGIN > 0
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
__asm pminub SSE0, SSE1
__asm pmaxub SSE2, SSE1
#if MODIFYPLUGIN > 0
__asm pminub SSE0, SSE5
__asm pmaxub SSE2, SSE5
#endif
#ifdef SHLUR
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
#ifdef SHARPEN
__asm add esi, spitch
#else
__asm add esi, eax
#endif
__asm SSE_MOVE [edi], SSE4
#else
#if ISSE > 1
__asm pmaxub SSE0, SSE4
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE0, [edi]
#else
__asm pmaxub SSE0, [esi + ebx + 1]
#endif
#endif
__asm pminub SSE0, SSE2
__asm add esi, eax
__asm SSE_MOVE [edi], SSE0
#endif // SHLUR
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
#ifndef MODIFYPLUGIN
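// SmartAvgs: produces two per-byte averages of mem1 and mem2 - upavg = pavgb(mem1, mem2)
// (rounded up) and downavg = pavgb(mem1 - bias, mem2), which with bias holding 0x01 in
// every byte (fconvolution_bias, defined elsewhere in this file) is the average rounded
// down, except that psubusb saturates when mem1 is 0. SmartAvgsW additionally stores mem2
// to wmem with the given store instruction (border column handling).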
#define SmartAvgs(upavg, downavg, mem1, mem2, bias, reg2)\
__asm SSE3_MOVE downavg, mem1 \
__asm SSE3_MOVE reg2, mem2 \
__asm SSE_RMOVE upavg, downavg \
__asm psubusb downavg, bias \
__asm pavgb upavg, reg2 \
__asm pavgb downavg, reg2
#ifdef MODIFYPLUGIN
#define SmartAvgsW(upavg, downavg, wmem, mem1, mem2, bias, reg2)\
SmartAvgs(upavg, downavg, mem1, mem2, bias, reg2)
#else
#define SmartAvgsW(upavg, downavg, wmem, mem1, mem2, bias, reg2, mwrite)\
__asm SSE3_MOVE downavg, mem1 \
__asm SSE3_MOVE reg2, mem2 \
__asm SSE_RMOVE upavg, downavg \
__asm psubusb downavg, bias \
__asm pavgb upavg, reg2 \
__asm pavgb downavg, reg2 \
__asm mwrite wmem, reg2
#endif
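// SmartAvgRGs: clips the centre pixel to the range spanned by the four opposite
// neighbour-pair averages; per pixel, roughly
//   upper = max over pairs of the rounded-up average,
//   lower = min over pairs of the rounded-down average,
//   out   = max( min(centre, upper), lower ).
// Using the down-rounded averages for the lower bound and the up-rounded ones for the
// upper bound presumably keeps pavgb rounding from tightening the clip range.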
#ifdef SHARPEN
void SmartAvgRGs(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
#else
void SmartAvgRGs(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
#endif
{
#ifdef SHARPEN
// note: SSE5 is also used for a different purpose when MODIFYPLUGIN is defined
__asm mov ebx, strength
__asm SSE_RMOVE SSE4, rshift[ebx]
__asm SSE_RMOVE SSE5, shift_mask[ebx]
#endif
__asm mov eax, hblocks
__asm mov ebx, spitch
__asm SSE_RMOVE SSE7, fconvolution_bias
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
SmartAvgs(SSE0, SSE1, [esi], [esi + 2*ebx + 2], SSE7, SSE6)
SmartAvgs(SSE2, SSE3, [esi + 2], [esi + 2*ebx], SSE7, SSE6)
__asm pmaxub SSE0, SSE2
__asm pminub SSE1, SSE3
SmartAvgs(SSE2, SSE3, [esi + 1], [esi + 2*ebx + 1], SSE7, SSE6)
__asm pmaxub SSE0, SSE2
__asm pminub SSE1, SSE3
SmartAvgsW(SSE2, SSE3, [edi], [esi + ebx + 2], [esi + ebx], SSE7, SSE6, movd)
__asm pmaxub SSE0, SSE2
#if (ISSE > 1) || defined(SHLUR)
__asm SSE3_MOVE SSE6, [esi + ebx + 1]
#endif
__asm pminub SSE1, SSE3
#ifdef SHLUR
sharpen(SSE6, SSE1, SSE0, SSE4, SSE5, SSE3, SSE2)
__asm SSE_MOVE [edi + 1], SSE6
#else
#if ISSE > 1
__asm pminub SSE0, SSE6
#else
__asm pminub SSE0, [esi + ebx + 1]
#endif
__asm pmaxub SSE0, SSE1
__asm SSE_MOVE [edi + 1], SSE0
#endif // SHLUR
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
SmartAvgs(SSE0, SSE1, [esi], [esi + 2*ebx + 2], SSE7, SSE6)
SmartAvgs(SSE2, SSE3, [esi + 2], [esi + 2*ebx], SSE7, SSE6)
__asm pmaxub SSE0, SSE2
__asm pminub SSE1, SSE3
SmartAvgs(SSE2, SSE3, [esi + 1], [esi + 2*ebx + 1], SSE7, SSE6)
__asm pmaxub SSE0, SSE2
__asm pminub SSE1, SSE3
SmartAvgs(SSE2, SSE3, [esi + ebx], [esi + ebx + 2], SSE7, SSE6)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
__asm pmaxub SSE0, SSE2
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE6, [edi]
#else
__asm SSE3_MOVE SSE6, [esi + ebx + 1]
#endif
#endif
__asm pminub SSE1, SSE3
#ifdef MODIFYPLUGIN
__asm pmaxub SSE0, SSE5
__asm pminub SSE1, SSE5
#endif
#ifdef SHLUR
sharpen(SSE6, SSE1, SSE0, SSE4, SSE5, SSE3, SSE2)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE6
#else
#if ISSE > 1
__asm pminub SSE0, SSE6
#else
__asm pminub SSE0, [esi + ebx + 1]
#endif
__asm pmaxub SSE0, SSE1
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE0
#endif // SHLUR
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
SmartAvgs(SSE0, SSE1, [esi], [esi + 2*ebx + 2], SSE7, SSE6)
SmartAvgs(SSE2, SSE3, [esi + 2], [esi + 2*ebx], SSE7, SSE6)
__asm pmaxub SSE0, SSE2
__asm pminub SSE1, SSE3
SmartAvgs(SSE2, SSE3, [esi + 1], [esi + 2*ebx + 1], SSE7, SSE6)
__asm pmaxub SSE0, SSE2
__asm pminub SSE1, SSE3
SmartAvgsW(SSE2, SSE3, [edi + 1], [esi + ebx], [esi + ebx + 2], SSE7, SSE6, SSE_MOVE)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
__asm pmaxub SSE0, SSE2
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE6, [edi]
#else
__asm SSE3_MOVE SSE6, [esi + ebx + 1]
#endif
#endif
__asm pminub SSE1, SSE3
#ifdef MODIFYPLUGIN
__asm pmaxub SSE0, SSE5
__asm pminub SSE1, SSE5
#endif
#ifdef SHLUR
sharpen(SSE6, SSE1, SSE0, SSE4, SSE5, SSE3, SSE2)
__asm add esi, eax
__asm SSE_MOVE [edi], SSE6
#else
#if ISSE > 1
__asm pminub SSE0, SSE6
#else
__asm pminub SSE0, [esi + ebx + 1]
#endif
__asm pmaxub SSE0, SSE1
__asm add esi, eax
__asm SSE_MOVE [edi], SSE0
#endif // SHLUR
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
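// SmartAvgf / SmartAvgfW: the fast counterparts of SmartAvgs - only plain pavgb
// (rounded-up) averages are formed, avg1 from mem1/mem2 and avg2 from mem3/mem4;
// the W variant also stores one of the loaded operands to wmem for the border column.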
#if ISSE > 1
#define SmartAvgf(avg1, avg2, mem1, mem2, mem3, mem4, reg1, reg2)\
__asm SSE3_MOVE avg1, mem1 \
__asm SSE3_MOVE avg2, mem3 \
__asm SSE3_MOVE reg1, mem2 \
__asm SSE3_MOVE reg2, mem4 \
__asm pavgb avg1, reg1 \
__asm pavgb avg2, reg2
#else
#define SmartAvgf(avg1, avg2, mem1, mem2, mem3, mem4, reg1, reg2)\
__asm SSE3_MOVE avg1, mem1 \
__asm SSE3_MOVE avg2, mem3 \
__asm pavgb avg1, mem2 \
__asm pavgb avg2, mem4
#endif
#ifdef MODIFYPLUGIN
#define SmartAvgfW(avg1, avg2, wmem, mem1, mem2, mem3, mem4, reg1, reg2)\
SmartAvgf(avg1, avg2, mem1, mem2, mem3, mem4, reg1, reg2)
#else
#if ISSE > 1
#define SmartAvgfW(avg1, avg2, wmem, mem1, mem2, mem3, mem4, reg1, reg2, mwrite)\
__asm SSE3_MOVE avg1, mem1 \
__asm SSE3_MOVE avg2, mem3 \
__asm SSE3_MOVE reg1, mem2 \
__asm SSE3_MOVE reg2, mem4 \
__asm pavgb avg1, reg1 \
__asm pavgb avg2, reg2 \
__asm mwrite wmem, reg2
#else
#define SmartAvgfW(avg1, avg2, wmem, mem4, mem3, mem2, mem1, reg1, reg2, mwrite)\
__asm SSE3_MOVE avg1, mem1 \
__asm SSE3_MOVE avg2, mem3 \
__asm mwrite wmem, avg1 \
__asm pavgb avg2, mem4 \
__asm pavgb avg1, mem2
#endif
#endif
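// SmartAvgRGf: like SmartAvgRGs, but both clip bounds are built from the rounded-up pavgb
// pair averages only, presumably trading a slightly tighter lower bound for speed.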
#ifdef SHARPEN
void SmartAvgRGf(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
#else
void SmartAvgRGf(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
#endif
{
#ifdef SHARPEN
__asm mov ebx, strength
__asm SSE_RMOVE SSE4, rshift[ebx]
__asm SSE_RMOVE SSE7, shift_mask[ebx]
#endif
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
SmartAvgf(SSE0, SSE3, [esi], [esi + 2*ebx + 2], [esi + 2], [esi + 2*ebx], SSE5, SSE6)
__asm SSE_RMOVE SSE1, SSE0
__asm pmaxub SSE0, SSE3
__asm pminub SSE1, SSE3
SmartAvgfW(SSE2, SSE3, [edi], [esi + 1], [esi + 2*ebx + 1], [esi + ebx + 2], [esi + ebx], SSE5, SSE6, movd)
__asm pmaxub SSE0, SSE2
__asm pminub SSE1, SSE3
#if (ISSE > 1) || defined(SHLUR)
__asm SSE3_MOVE SSE6, [esi + ebx + 1]
#endif
__asm pmaxub SSE0, SSE3
__asm pminub SSE1, SSE2
#ifdef SHLUR
sharpen(SSE6, SSE1, SSE0, SSE4, SSE7, SSE2, SSE3)
__asm SSE_MOVE [edi + 1], SSE6
#else
#if ISSE > 1
__asm pminub SSE0, SSE6
#else
__asm pminub SSE0, [esi + ebx + 1]
#endif
__asm pmaxub SSE0, SSE1
__asm SSE_MOVE [edi + 1], SSE0
#endif // SHLUR
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
SmartAvgf(SSE0, SSE3, [esi], [esi + 2*ebx + 2], [esi + 2], [esi + 2*ebx], SSE5, SSE6)
__asm SSE_RMOVE SSE1, SSE0
__asm pmaxub SSE0, SSE3
__asm pminub SSE1, SSE3
SmartAvgf(SSE2, SSE3, [esi + 1], [esi + 2*ebx + 1], [esi + ebx], [esi + ebx + 2], SSE5, SSE6)
__asm pmaxub SSE0, SSE2
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
__asm pminub SSE1, SSE3
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE6, [edi]
#else
__asm SSE3_MOVE SSE6, [esi + ebx + 1]
#endif
#endif
__asm pmaxub SSE0, SSE3
__asm pminub SSE1, SSE2
#ifdef MODIFYPLUGIN
__asm pmaxub SSE0, SSE5
__asm pminub SSE1, SSE5
#endif
#ifdef SHLUR
sharpen(SSE6, SSE1, SSE0, SSE4, SSE7, SSE2, SSE3)
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE6
#else
#if ISSE > 1
__asm pminub SSE0, SSE6
#else
#ifdef MODIFYPLUGIN
__asm pminub SSE0, [edi]
#else
__asm pminub SSE0, [esi + ebx + 1]
#endif
#endif // SHLUR
__asm pmaxub SSE0, SSE1
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE0
#endif
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
SmartAvgf(SSE0, SSE3, [esi], [esi + 2*ebx + 2], [esi + 2], [esi + 2*ebx], SSE5, SSE6)
__asm SSE_RMOVE SSE1, SSE0
__asm pmaxub SSE0, SSE3
__asm pminub SSE1, SSE3
SmartAvgfW(SSE2, SSE3, [edi + 1], [esi + 1], [esi + 2*ebx + 1], [esi + ebx], [esi + ebx + 2], SSE5, SSE6, SSE_MOVE)
__asm pmaxub SSE0, SSE2
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
__asm pminub SSE1, SSE3
#if (ISSE > 1) || defined(SHLUR)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE6, [edi]
#else
__asm SSE3_MOVE SSE6, [esi + ebx + 1]
#endif
#endif
__asm pmaxub SSE0, SSE3
__asm pminub SSE1, SSE2
#ifdef MODIFYPLUGIN
__asm pmaxub SSE0, SSE5
__asm pminub SSE1, SSE5
#endif
#ifdef SHLUR
sharpen(SSE6, SSE1, SSE0, SSE4, SSE7, SSE2, SSE3)
__asm add esi, eax
__asm SSE_MOVE [edi], SSE6
#else
#if ISSE > 1
__asm pminub SSE0, SSE6
#else
#ifdef MODIFYPLUGIN
__asm pminub SSE0, [edi]
#else
__asm pminub SSE0, [esi + ebx + 1]
#endif
#endif
__asm pmaxub SSE0, SSE1
__asm add esi, eax
__asm SSE_MOVE [edi], SSE0
#endif // SHLUR
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
#endif // #ifndef MODIFYPLUGIN
#ifdef MODIFYPLUGIN
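// SSE_Repair12/13/14 (Repair-only build): the add2..add6, minmax* and sub* macro chains
// (defined earlier in this file) partially sort the 3x3 neighbourhood read from esi so the
// pixel loaded from [edi] can be clamped to rank-order bounds; judging by the names, mode 12
// clips to roughly the 2nd-smallest/2nd-largest of those values, mode 13 to the 3rd and
// mode 14 to the 4th, with the centre pixel [esi + ebx + 1] folded into the bounds as well.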
void SSE_Repair12(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE7, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
__asm movd [edi], SSE7
__asm SSE3_MOVE SSE6, [esi + ebx + 2]
add4(SSE0, SSE1, SSE2, SSE3, SSE7)
__asm SSE3_MOVE SSE5, [esi + 2*ebx]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE6)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE5)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE7)
#if ISSE > 1
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
minmax2sub(SSE0, SSE1, SSE2, SSE3, SSE6)
#if ISSE > 1
__asm pmaxub SSE2, SSE5
#else
__asm pmaxub SSE2, [esi + ebx + 1]
#endif
__asm pminub SSE1, SSE2
__asm SSE_MOVE [edi + 1], SSE1
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE7, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
__asm SSE3_MOVE SSE6, [esi + ebx + 2]
add4(SSE0, SSE1, SSE2, SSE3, SSE7)
__asm SSE3_MOVE SSE5, [esi + 2*ebx]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE6)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE5)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE7)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE4, [esi + ebx + 1]
#endif
#if ISSE > 1
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE5, [edi]
#else
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#endif
minmax2sub(SSE0, SSE1, SSE2, SSE3, SSE6)
#ifdef MODIFYPLUGIN
__asm pminub SSE2, SSE4
__asm pmaxub SSE1, SSE4
#endif
#if ISSE > 1
__asm pmaxub SSE2, SSE5
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE2, [edi]
#else
__asm pmaxub SSE2, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm pminub SSE1, SSE2
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE1
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE7, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
__asm SSE3_MOVE SSE6, [esi + ebx + 2]
add4(SSE0, SSE1, SSE2, SSE3, SSE7)
#ifndef MODIFYPLUGIN
__asm SSE_MOVE [edi + 1], SSE6
#endif
__asm SSE3_MOVE SSE5, [esi + 2*ebx]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE6)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE5)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
minmax2(SSE0, SSE1, SSE2, SSE3, SSE7)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE4, [esi + ebx + 1]
#endif
#if ISSE > 1
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE5, [edi]
#else
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#endif
minmax2sub(SSE0, SSE1, SSE2, SSE3, SSE6)
#ifdef MODIFYPLUGIN
__asm pminub SSE2, SSE4
__asm pmaxub SSE1, SSE4
#endif
#if ISSE > 1
__asm pmaxub SSE2, SSE5
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE2, [edi]
#else
__asm pmaxub SSE2, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm pminub SSE1, SSE2
__asm add esi, eax
__asm SSE_MOVE [edi], SSE1
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
void SSE_Repair13(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE5, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
add4(SSE0, SSE1, SSE2, SSE3, SSE5)
__asm movd [edi], SSE5
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
minmax3sub(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7)
#if ISSE > 1
__asm SSE3_MOVE SSE7, [esi + ebx + 1]
#endif
minmax2sub(SSE1, SSE2, SSE3, SSE4, SSE6)
#if ISSE > 1
__asm pmaxub SSE3, SSE7
#else
__asm pmaxub SSE3, [esi + ebx + 1]
#endif
__asm pminub SSE3, SSE2
__asm SSE_MOVE [edi + 1], SSE3
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE5, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
add4(SSE0, SSE1, SSE2, SSE3, SSE5)
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
minmax3sub(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#if ISSE > 1
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE7, [edi]
#else
__asm SSE3_MOVE SSE7, [esi + ebx + 1]
#endif
#endif
minmax2sub(SSE1, SSE2, SSE3, SSE4, SSE6)
#ifdef MODIFYPLUGIN
__asm pminub SSE3, SSE5
__asm pmaxub SSE2, SSE5
#endif
#if ISSE > 1
__asm pmaxub SSE3, SSE7
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE3, [edi]
#else
__asm pmaxub SSE3, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm pminub SSE3, SSE2
__asm add esi, SSE_INCREMENT
__asm SSE_MOVE [edi], SSE3
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE5, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
add4(SSE0, SSE1, SSE2, SSE3, SSE5)
#ifndef MODIFYPLUGIN
__asm SSE_MOVE [edi + 1], SSE7
#endif
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1]
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2]
minmax3sub(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#if ISSE > 1
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE7, [edi]
#else
__asm SSE3_MOVE SSE7, [esi + ebx + 1]
#endif
#endif // ISSE > 1
minmax2sub(SSE1, SSE2, SSE3, SSE4, SSE6)
#ifdef MODIFYPLUGIN
__asm pminub SSE3, SSE5
__asm pmaxub SSE2, SSE5
#endif
#if ISSE > 1
__asm pmaxub SSE3, SSE7
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE3, [edi]
#else
__asm pmaxub SSE3, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm pminub SSE3, SSE2
__asm add esi, eax
__asm SSE_MOVE [edi], SSE3
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
void SSE_Repair14(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE5, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
add4(SSE0, SSE1, SSE2, SSE3, SSE5)
__asm movd [edi], SSE5
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1]
sub5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE6)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2]
sub4(SSE1, SSE2, SSE3, SSE4, SSE5)
#if ISSE > 1
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
sub3(SSE2, SSE3, SSE4, SSE7)
#if ISSE > 1
__asm pmaxub SSE4, SSE5
#else
__asm pmaxub SSE4, [esi + ebx + 1]
#endif
__asm pminub SSE3, SSE4
__asm SSE_MOVE [edi + 1], SSE3
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE5, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
add4(SSE0, SSE1, SSE2, SSE3, SSE5)
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1]
sub5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE6)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2]
sub4(SSE1, SSE2, SSE3, SSE4, SSE5)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
#endif // MODIFYPLUGIN
#if ISSE > 1
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE5, [edi]
#else
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#endif
sub3(SSE2, SSE3, SSE4, SSE7)
#ifdef MODIFYPLUGIN
__asm pminub SSE4, SSE0
__asm pmaxub SSE3, SSE0
#endif // MODIFYPLUGIN
#if ISSE > 1
__asm pmaxub SSE4, SSE5
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE4, [edi]
#else // ISSE > 1
__asm pmaxub SSE4, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm add esi, SSE_INCREMENT
__asm pminub SSE3, SSE4
__asm SSE_MOVE [edi], SSE3
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
__asm SSE3_MOVE SSE0, [esi]
__asm SSE3_MOVE SSE7, [esi + 1]
add2(SSE0, SSE1, SSE7)
__asm SSE3_MOVE SSE6, [esi + 2]
__asm SSE3_MOVE SSE5, [esi + ebx]
add3(SSE0, SSE1, SSE2, SSE6)
__asm SSE3_MOVE SSE7, [esi + ebx + 2]
add4(SSE0, SSE1, SSE2, SSE3, SSE5)
#ifndef MODIFYPLUGIN
__asm SSE_MOVE [edi + 1], SSE7
#endif
__asm SSE3_MOVE SSE6, [esi + 2*ebx]
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7)
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1]
sub5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE6)
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2]
sub4(SSE1, SSE2, SSE3, SSE4, SSE5)
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
#endif
#if ISSE > 1
#ifdef MODIFYPLUGIN
__asm SSE3_MOVE SSE5, [edi]
#else
__asm SSE3_MOVE SSE5, [esi + ebx + 1]
#endif
#endif
sub3(SSE2, SSE3, SSE4, SSE7)
#ifdef MODIFYPLUGIN
__asm pminub SSE4, SSE0
__asm pmaxub SSE3, SSE0
#endif
#if ISSE > 1
__asm pmaxub SSE4, SSE5
#else
#ifdef MODIFYPLUGIN
__asm pmaxub SSE4, [edi]
#else // ISSE > 1
__asm pmaxub SSE4, [esi + ebx + 1]
#endif
#endif // ISSE > 1
__asm pminub SSE3, SSE4
__asm add esi, eax
__asm SSE_MOVE [edi], SSE3
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
#endif // MODIFYPLUGIN
#if defined(MODIFYPLUGIN) || defined(SHLUR)
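// diagweightr5: min/max of a pair of opposing neighbours (bound1, bound2) and
// a weight measuring how far center lies outside [min, max] (zero if inside),
// all in saturated byte arithmetic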
#define diagweightr5(min, max, weight, center, bound1, bound2, reg) \
__asm SSE3_MOVE min, bound1 \
__asm SSE3_MOVE reg, bound2 \
__asm SSE_RMOVE max, min \
__asm SSE_RMOVE weight, center \
__asm pminub min, reg \
__asm pmaxub max, reg \
__asm SSE_RMOVE reg, min \
__asm psubusb weight, max \
__asm psubusb reg, center \
__asm pmaxub weight, reg
#if defined(MODIFYPLUGIN)
#define diagweightwr5(min, max, weight, center, bound1, bound2, wmem, reg) diagweightr5(min, max, weight, center, bound1, bound2, reg)
#else
// same as diagweightr5, but in addition bound2 is written to wmem
#define diagweightwr5(min, max, weight, center, bound1, bound2, wmem, reg) \
__asm SSE3_MOVE min, bound1 \
__asm SSE3_MOVE reg, bound2 \
__asm SSE_RMOVE max, min \
__asm SSE_MOVE wmem, reg \
__asm SSE_RMOVE weight, center \
__asm pminub min, reg \
__asm pmaxub max, reg \
__asm SSE_RMOVE reg, min \
__asm psubusb weight, max \
__asm psubusb reg, center \
__asm pmaxub weight, reg
#endif // MODIFYPLUGIN
#ifdef SHARPEN
void SSE_Repair15(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
#else
void SSE_Repair15(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
#endif
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
#ifdef SHARPEN
__asm mov spitch, eax
__asm mov eax, strength
#endif
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diagweightr5(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7)
diagweightr5(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightr5(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightwr5(SSE4, SSE5, SSE6, SSE0, [esi + ebx + 2], [esi + ebx], [edi], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
#ifdef SHLUR
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
__asm SSE_MOVE [edi + 1], SSE0
#else
__asm pmaxub SSE1, SSE0
__asm pminub SSE1, SSE2
__asm SSE_MOVE [edi + 1], SSE1
#endif
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diagweightr5(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7)
diagweightr5(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightr5(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightr5(SSE4, SSE5, SSE6, SSE0, [esi + ebx + 2], [esi + ebx], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm add esi, SSE_INCREMENT
#ifdef MODIFYPLUGIN
#if ISSE > 1
__asm SSE3_MOVE SSE7, [edi]
#endif
#if MODIFYPLUGIN > 0
__asm pminub SSE1, SSE0
__asm pmaxub SSE2, SSE0
#endif
#if ISSE > 1
__asm pmaxub SSE1, SSE7
#else
__asm pmaxub SSE1, [edi]
#endif
__asm pminub SSE1, SSE2
__asm SSE_MOVE [edi], SSE1
#else // MODIFYPLUGIN
#ifdef SHLUR
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
__asm SSE_MOVE [edi], SSE0
#else
__asm pmaxub SSE1, SSE0
__asm pminub SSE1, SSE2
__asm SSE_MOVE [edi], SSE1
#endif
#endif // MODIFYPLUGIN
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diagweightr5(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7)
diagweightr5(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightr5(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightwr5(SSE4, SSE5, SSE6, SSE0, [esi + ebx], [esi + ebx + 2], [edi + 1], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
#ifdef MODIFYPLUGIN
#if ISSE > 1
__asm SSE3_MOVE SSE7, [edi]
#endif
#if MODIFYPLUGIN > 0
__asm pminub SSE1, SSE0
__asm pmaxub SSE2, SSE0
#endif
#if ISSE > 1
__asm pmaxub SSE1, SSE7
#else
__asm pmaxub SSE1, [edi]
#endif
__asm pminub SSE1, SSE2
__asm SSE_MOVE [edi], SSE1
#else // MODIFYPLUGIN
#ifdef SHLUR
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
__asm SSE_MOVE [edi], SSE0
#else
__asm pmaxub SSE1, SSE0
__asm pminub SSE1, SSE2
__asm SSE_MOVE [edi], SSE1
#endif
#endif // MODIFYPLUGIN
#ifdef SHARPEN
__asm add esi, spitch
#else
__asm add esi, eax
#endif
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
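// diagweightr6: like diagweightr5, but the pair spread is added as well:
// weight = 2 * (distance of center outside [min, max]) + (max - min)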
#define diagweightr6(min, max, weight, center, bound1, bound2, reg) \
__asm SSE3_MOVE min, bound1 \
__asm SSE3_MOVE reg, bound2 \
__asm SSE_RMOVE max, min \
__asm SSE_RMOVE weight, center \
__asm pminub min, reg \
__asm pmaxub max, reg \
__asm SSE_RMOVE reg, min \
__asm psubusb weight, max \
__asm psubusb reg, center \
__asm pmaxub weight, reg \
__asm SSE_RMOVE reg, max \
__asm paddusb weight, weight \
__asm psubusb reg, min \
__asm paddusb weight, reg
#ifdef MODIFYPLUGIN
#define diagweightwr6(min, max, weight, center, bound1, bound2, wmem, reg) diagweightr6(min, max, weight, center, bound1, bound2, reg)
#else
// same as diagweightr6, but in addition bound2 is written to wmem
#define diagweightwr6(min, max, weight, center, bound1, bound2, wmem, reg) \
__asm SSE3_MOVE min, bound1 \
__asm SSE3_MOVE reg, bound2 \
__asm SSE_RMOVE max, min \
__asm SSE_MOVE wmem, reg \
__asm SSE_RMOVE weight, center \
__asm pminub min, reg \
__asm pmaxub max, reg \
__asm SSE_RMOVE reg, min \
__asm psubusb weight, max \
__asm psubusb reg, center \
__asm pmaxub weight, reg \
__asm SSE_RMOVE reg, max \
__asm paddusb weight, weight \
__asm psubusb reg, min \
__asm paddusb weight, reg
#endif // MODIFYPLUGIN
#ifdef SHARPEN
void SSE_Repair16(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
#else
void SSE_Repair16(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
#endif
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
#ifdef SHARPEN
__asm mov spitch, eax
__asm mov eax, strength
#endif
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diagweightr6(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7)
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightwr6(SSE4, SSE5, SSE6, SSE0, [esi + ebx + 2], [esi + ebx], [edi], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
#ifdef SHLUR
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
__asm SSE_MOVE [edi + 1], SSE0
#else
__asm pmaxub SSE1, SSE0
__asm pminub SSE1, SSE2
__asm SSE_MOVE [edi + 1], SSE1
#endif
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diagweightr6(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7)
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + ebx + 2], [esi + ebx], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm add esi, SSE_INCREMENT
#ifdef MODIFYPLUGIN
#if ISSE > 1
__asm SSE3_MOVE SSE7, [edi]
#endif
#if MODIFYPLUGIN > 0
__asm pminub SSE1, SSE0
__asm pmaxub SSE2, SSE0
#endif
#if ISSE > 1
__asm pmaxub SSE1, SSE7
#else
__asm pmaxub SSE1, [edi]
#endif
__asm pminub SSE1, SSE2
__asm SSE_MOVE [edi], SSE1
#else // MODIFYPLUGIN
#ifdef SHLUR
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
__asm SSE_MOVE [edi], SSE0
#else
__asm pmaxub SSE1, SSE0
__asm pminub SSE1, SSE2
__asm SSE_MOVE [edi], SSE1
#endif
#endif // MODIFYPLUGIN
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diagweightr6(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7)
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightwr6(SSE4, SSE5, SSE6, SSE0, [esi + ebx], [esi + ebx + 2], [edi + 1], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
#ifdef MODIFYPLUGIN
#if ISSE > 1
__asm SSE3_MOVE SSE7, [edi]
#endif
#if MODIFYPLUGIN > 0
__asm pminub SSE1, SSE0
__asm pmaxub SSE2, SSE0
#endif
#if ISSE > 1
__asm pmaxub SSE1, SSE7
#else
__asm pmaxub SSE1, [edi]
#endif
__asm pminub SSE1, SSE2
__asm SSE_MOVE [edi], SSE1
#else // MODIFYPLUGIN
#ifdef SHLUR
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
__asm SSE_MOVE [edi], SSE0
#else
__asm pmaxub SSE1, SSE0
__asm pminub SSE1, SSE2
__asm SSE_MOVE [edi], SSE1
#endif
#endif // MODIFYPLUGIN
#ifdef SHARPEN
__asm add esi, spitch
#else
__asm add esi, eax
#endif
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
#endif // defined(MODIFYPLUGIN) || defined(SHLUR)
#ifdef SHLUR
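// diagweightr7: weight = (distance of center outside [min, max]) + (max - min),
// i.e. out-of-range distance and pair spread are weighted equally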
#define diagweightr7(min, max, weight, center, bound1, bound2, reg) \
__asm SSE3_MOVE min, bound1 \
__asm SSE3_MOVE reg, bound2 \
__asm SSE_RMOVE max, min \
__asm SSE_RMOVE weight, center \
__asm pminub min, reg \
__asm pmaxub max, reg \
__asm SSE_RMOVE reg, min \
__asm psubusb weight, max \
__asm psubusb reg, center \
__asm pmaxub weight, reg \
__asm SSE_RMOVE reg, max \
__asm psubusb reg, min \
__asm paddusb weight, reg
#ifdef MODIFYPLUGIN
#define diagweightwr7(min, max, weight, center, bound1, bound2, wmem, reg) diagweightr7(min, max, weight, center, bound1, bound2, reg)
#else
// same as diagweightr7, but in addition bound2 is written to wmem
#define diagweightwr7(min, max, weight, center, bound1, bound2, wmem, reg) \
__asm SSE3_MOVE min, bound1 \
__asm SSE3_MOVE reg, bound2 \
__asm SSE_RMOVE max, min \
__asm SSE_MOVE wmem, reg \
__asm SSE_RMOVE weight, center \
__asm pminub min, reg \
__asm pmaxub max, reg \
__asm SSE_RMOVE reg, min \
__asm psubusb weight, max \
__asm psubusb reg, center \
__asm pmaxub weight, reg \
__asm SSE_RMOVE reg, max \
__asm psubusb reg, min \
__asm paddusb weight, reg
#endif // MODIFYPLUGIN
#ifdef SHARPEN
void SSE_Repair17(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
#else
void SSE_Repair17(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
#endif
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
#ifdef SHARPEN
__asm mov spitch, eax
__asm mov eax, strength
#endif
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diagweightr7(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7)
diagweightr7(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightr7(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightwr7(SSE4, SSE5, SSE6, SSE0, [esi + ebx + 2], [esi + ebx], [edi], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
#ifdef SHLUR
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
__asm SSE_MOVE [edi + 1], SSE0
#else
__asm pmaxub SSE1, SSE0
__asm pminub SSE1, SSE2
__asm SSE_MOVE [edi + 1], SSE1
#endif
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diagweightr7(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7)
diagweightr7(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightr7(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightr7(SSE4, SSE5, SSE6, SSE0, [esi + ebx + 2], [esi + ebx], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm add esi, SSE_INCREMENT
#ifdef MODIFYPLUGIN
#if ISSE > 1
__asm SSE3_MOVE SSE7, [edi]
#endif
#if MODIFYPLUGIN > 0
__asm pminub SSE1, SSE0
__asm pmaxub SSE2, SSE0
#endif
#if ISSE > 1
__asm pmaxub SSE1, SSE7
#else
__asm pmaxub SSE1, [edi]
#endif
__asm pminub SSE1, SSE2
__asm SSE_MOVE [edi], SSE1
#else // MODIFYPLUGIN
#ifdef SHLUR
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
__asm SSE_MOVE [edi], SSE0
#else
__asm pmaxub SSE1, SSE0
__asm pminub SSE1, SSE2
__asm SSE_MOVE [edi], SSE1
#endif
#endif // MODIFYPLUGIN
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diagweightr7(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7)
diagweightr7(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightr7(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightwr7(SSE4, SSE5, SSE6, SSE0, [esi + ebx], [esi + ebx + 2], [edi + 1], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
#ifdef MODIFYPLUGIN
#if ISSE > 1
__asm SSE3_MOVE SSE7, [edi]
#endif
#if MODIFYPLUGIN > 0
__asm pminub SSE1, SSE0
__asm pmaxub SSE2, SSE0
#endif
#if ISSE > 1
__asm pmaxub SSE1, SSE7
#else
__asm pmaxub SSE1, [edi]
#endif
__asm pminub SSE1, SSE2
__asm SSE_MOVE [edi], SSE1
#else // MODIFYPLUGIN
#ifdef SHLUR
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
__asm SSE_MOVE [edi], SSE0
#else
__asm pmaxub SSE1, SSE0
__asm pminub SSE1, SSE2
__asm SSE_MOVE [edi], SSE1
#endif
#endif // MODIFYPLUGIN
#ifdef SHARPEN
__asm add esi, spitch
#else
__asm add esi, eax
#endif
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
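// diagweightr8: weight = (distance of center outside [min, max]) + 2 * (max - min),
// so the spread of the pair dominates the out-of-range distance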
#define diagweightr8(min, max, weight, center, bound1, bound2, reg) \
__asm SSE3_MOVE min, bound1 \
__asm SSE3_MOVE reg, bound2 \
__asm SSE_RMOVE max, min \
__asm SSE_RMOVE weight, center \
__asm pminub min, reg \
__asm pmaxub max, reg \
__asm SSE_RMOVE reg, min \
__asm psubusb weight, max \
__asm psubusb reg, center \
__asm pmaxub weight, reg \
__asm SSE_RMOVE reg, max \
__asm psubusb reg, min \
__asm paddusb weight, reg \
__asm paddusb weight, reg
#ifdef MODIFYPLUGIN
#define diagweightwr8(min, max, weight, center, bound1, bound2, wmem, reg) diagweightr8(min, max, weight, center, bound1, bound2, reg)
#else
// same as diagweightr8, but in addition bound2 is written to wmem
#define diagweightwr8(min, max, weight, center, bound1, bound2, wmem, reg) \
__asm SSE3_MOVE min, bound1 \
__asm SSE3_MOVE reg, bound2 \
__asm SSE_RMOVE max, min \
__asm SSE_MOVE wmem, reg \
__asm SSE_RMOVE weight, center \
__asm pminub min, reg \
__asm pmaxub max, reg \
__asm SSE_RMOVE reg, min \
__asm psubusb weight, max \
__asm psubusb reg, center \
__asm pmaxub weight, reg \
__asm SSE_RMOVE reg, max \
__asm psubusb reg, min \
__asm paddusb weight, reg \
__asm paddusb weight, reg
#endif // MODIFYPLUGIN
#ifdef SHARPEN
void SSE_Repair18a(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
#else
void SSE_Repair18a(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
#endif
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
#ifdef SHARPEN
__asm mov spitch, eax
__asm mov eax, strength
#endif
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diagweightr6(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7)
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightwr6(SSE4, SSE5, SSE6, SSE0, [esi + ebx + 2], [esi + ebx], [edi], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
#ifdef SHLUR
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
__asm SSE_MOVE [edi + 1], SSE0
#else
__asm pmaxub SSE1, SSE0
__asm pminub SSE1, SSE2
__asm SSE_MOVE [edi + 1], SSE1
#endif
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diagweightr6(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7)
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + ebx + 2], [esi + ebx], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm add esi, SSE_INCREMENT
#ifdef MODIFYPLUGIN
#if ISSE > 1
__asm SSE3_MOVE SSE7, [edi]
#endif
#if MODIFYPLUGIN > 0
__asm pminub SSE1, SSE0
__asm pmaxub SSE2, SSE0
#endif
#if ISSE > 1
__asm pmaxub SSE1, SSE7
#else
__asm pmaxub SSE1, [edi]
#endif
__asm pminub SSE1, SSE2
__asm SSE_MOVE [edi], SSE1
#else // MODIFYPLUGIN
#ifdef SHLUR
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
__asm SSE_MOVE [edi], SSE0
#else
__asm pmaxub SSE1, SSE0
__asm pminub SSE1, SSE2
__asm SSE_MOVE [edi], SSE1
#endif
#endif // MODIFYPLUGIN
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diagweightr6(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7)
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diagweightwr6(SSE4, SSE5, SSE6, SSE0, [esi + ebx], [esi + ebx + 2], [edi + 1], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
#ifdef MODIFYPLUGIN
#if ISSE > 1
__asm SSE3_MOVE SSE7, [edi]
#endif
#if MODIFYPLUGIN > 0
__asm pminub SSE1, SSE0
__asm pmaxub SSE2, SSE0
#endif
#if ISSE > 1
__asm pmaxub SSE1, SSE7
#else
__asm pmaxub SSE1, [edi]
#endif
__asm pminub SSE1, SSE2
__asm SSE_MOVE [edi], SSE1
#else // MODIFYPLUGIN
#ifdef SHLUR
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3)
__asm SSE_MOVE [edi], SSE0
#else
__asm pmaxub SSE1, SSE0
__asm pminub SSE1, SSE2
__asm SSE_MOVE [edi], SSE1
#endif
#endif // MODIFYPLUGIN
#ifdef SHARPEN
__asm add esi, spitch
#else
__asm add esi, eax
#endif
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
#endif // SHLUR
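// diag18: val1/val2 receive the two opposing neighbours; weight is the largest
// saturated difference between the center and either of them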
#define diag18(val1, val2, weight, center, bound1, bound2, reg) \
__asm SSE3_MOVE val1, bound1 \
__asm SSE_RMOVE weight, center \
__asm SSE_RMOVE reg, val1 \
__asm psubusb weight, val1 \
__asm psubusb reg, center \
__asm SSE3_MOVE val2, bound2 \
__asm pmaxub weight, reg \
__asm SSE_RMOVE reg, center \
__asm psubusb reg, val2 \
__asm pmaxub weight, reg \
__asm SSE_RMOVE reg, val2 \
__asm psubusb reg, center \
__asm pmaxub weight, reg
#ifdef SHARPEN
void SSE_Repair18(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
#else
void SSE_Repair18(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height)
#endif
{
__asm mov eax, hblocks
__asm mov ebx, spitch
#ifdef MODIFYPLUGIN
__asm mov ecx, eax
#endif
__asm mov edx, remainder
#if SSE_INCREMENT == 16
__asm add eax, eax
#endif
__asm mov esi, _sp
#ifdef MODIFYPLUGIN
__asm lea eax, [eax * 8 + edx]
#else
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1]
#endif
__asm sub esi, ebx
__asm sub dpitch, eax
__asm neg eax
__asm mov edi, dp
#ifdef MODIFYPLUGIN
__asm inc edi
__asm lea eax, [ebx + eax]
#else
__asm lea eax, [ebx + eax + 1]
#ifdef SHARPEN
__asm mov spitch, eax
__asm mov eax, strength
#endif
__asm align 16
__asm column_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diag18(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7)
diag18(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diag18(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diag18(SSE4, SSE5, SSE6, SSE0, [esi + ebx + 2], [esi + ebx], SSE7)
__asm movd [edi], SSE5
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm SSE_RMOVE SSE7, SSE1
__asm pminub SSE1, SSE2
__asm pmaxub SSE7, SSE2
#ifdef SHLUR
sharpen(SSE0, SSE1, SSE7, rshift[eax], shift_mask[eax], SSE3, SSE4)
__asm SSE_MOVE [edi + 1], SSE0
#else
__asm pmaxub SSE1, SSE0
__asm pminub SSE1, SSE7
__asm SSE_MOVE [edi + 1], SSE1
#endif
// now the pixels in the middle
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT + 1
__asm mov ecx, hblocks
#endif // MODIFYPLUGIN
__asm align 16
__asm middle_loop:
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diag18(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7)
diag18(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diag18(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diag18(SSE4, SSE5, SSE6, SSE0, [esi + ebx + 2], [esi + ebx], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm add esi, SSE_INCREMENT
__asm SSE_RMOVE SSE6, SSE1
#ifdef MODIFYPLUGIN
#if ISSE > 1
__asm SSE3_MOVE SSE7, [edi]
#endif
__asm pminub SSE1, SSE2
__asm pmaxub SSE6, SSE2
#if MODIFYPLUGIN > 0
__asm pminub SSE1, SSE0
__asm pmaxub SSE6, SSE0
#endif
#if ISSE > 1
__asm pmaxub SSE1, SSE7
#else
__asm pmaxub SSE1, [edi]
#endif
__asm pminub SSE1, SSE6
__asm SSE_MOVE [edi], SSE1
#else // MODIFYPLUGIN
__asm pminub SSE1, SSE2
__asm pmaxub SSE6, SSE2
#ifdef SHLUR
sharpen(SSE0, SSE1, SSE6, rshift[eax], shift_mask[eax], SSE3, SSE4)
__asm SSE_MOVE [edi], SSE0
#else
__asm pmaxub SSE1, SSE0
__asm pminub SSE1, SSE6
__asm SSE_MOVE [edi], SSE1
#endif
#endif // MODIFYPLUGIN
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, edx
__asm add edi, edx
__asm SSE3_MOVE SSE0, [esi + ebx + 1]
diag18(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7)
diag18(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diag18(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7)
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
diag18(SSE4, SSE5, SSE6, SSE0, [esi + ebx], [esi + ebx + 2], SSE7)
#ifndef MODIFYPLUGIN
__asm SSE_MOVE [edi + 1], SSE5
#endif
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6)
__asm SSE_RMOVE SSE6, SSE1
#ifdef MODIFYPLUGIN
#if ISSE > 1
__asm SSE3_MOVE SSE7, [edi]
#endif
__asm pminub SSE1, SSE2
__asm pmaxub SSE6, SSE2
#if MODIFYPLUGIN > 0
__asm pminub SSE1, SSE0
__asm pmaxub SSE6, SSE0
#endif
#if ISSE > 1
__asm pmaxub SSE1, SSE7
#else
__asm pmaxub SSE1, [edi]
#endif
__asm pminub SSE1, SSE6
__asm SSE_MOVE [edi], SSE1
#else // MODIFYPLUGIN
__asm pminub SSE1, SSE2
__asm pmaxub SSE6, SSE2
#ifdef SHLUR
sharpen(SSE0, SSE1, SSE6, rshift[eax], shift_mask[eax], SSE3, SSE4)
__asm SSE_MOVE [edi], SSE0
#else
__asm pmaxub SSE1, SSE0
__asm pminub SSE1, SSE6
__asm SSE_MOVE [edi], SSE1
#endif
#endif // MODIFYPLUGIN
#ifdef SHARPEN
__asm add esi, spitch
#else
__asm add esi, eax
#endif
__asm add edi, dpitch
__asm dec height
#ifdef MODIFYPLUGIN
__asm mov ecx, hblocks
__asm jnz middle_loop
#else
__asm jnz column_loop
#endif
}
#ifdef SHARPEN
static void (*cleaning_methods[MAXMODE + 1])(BYTE *dp, int dpitch, const BYTE *sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength)
= { copy_plane, SSE_RemoveGrain1, SSE_RemoveGrain2, SSE_RemoveGrain3, SSE_RemoveGrain4, SSE_Repair15, SSE_Repair16, SSE_Repair17, SSE_Repair18a, diag9, copy_plane
, copy_plane, copy_plane, copy_plane, copy_plane, copy_plane, SmartRG, SSE_Repair18, copy_plane, copy_plane, copy_plane
, SmartAvgRGs, SmartAvgRGf
};
#else
static void (*cleaning_methods[MAXMODE + 1])(BYTE *dp, int dpitch, const BYTE *sp, int spitch, int hblocks, int remainder, int incpitch, int height)
#ifdef MODIFYPLUGIN
= { do_nothing, SSE_RemoveGrain1, SSE_RemoveGrain2, SSE_RemoveGrain3, SSE_RemoveGrain4, diag5, diag6, diag7, diag8, diag9
, SSE_RemoveGrain10, SSE_RemoveGrain1, SSE_Repair12, SSE_Repair13, SSE_Repair14, SSE_Repair15, SSE_Repair16, SmartRG, SSE_Repair18};
#elif defined(BLUR)
= { copy_plane, SSE_RemoveGrain1, SSE_RemoveGrain2, SSE_RemoveGrain3, SSE_RemoveGrain4, copy_plane, copy_plane, copy_plane, copy_plane, diag9, copy_plane
, copy_plane, copy_plane, copy_plane, copy_plane, copy_plane, copy_plane, copy_plane, copy_plane, copy_plane, copy_plane
, SmartAvgRGs, SmartAvgRGf
};
#else
= { copy_plane, SSE_RemoveGrain1, SSE_RemoveGrain2, SSE_RemoveGrain3, SSE_RemoveGrain4, diag5, diag6, diag7, diag8, diag9
, SSE_RemoveGrain10, SSE_RemoveGrain11, SSE_RemoveGrain12, bob_top, bob_bottom, smartbob_top, smartbob_bottom, SmartRG, SSE_Repair18, SSE_RemoveGrain19, SSE_RemoveGrain20
, SmartAvgRGs, SmartAvgRGf, SSE_RemoveGrain23, SSE_RemoveGrain24, nondestructivesharpen, SmartRGC, SmartRGCL, SmartRGCL2};
#endif
#endif // SHARPEN
class RemoveGrain : public GenericVideoFilter, public PlanarAccess
{
#ifdef MODIFYPLUGIN
PClip oclip;
#endif
int height2[3], hblocks[3], remainder[3], incpitch[3];
#ifdef SHARPEN
int strength[3];
void (*cleanf[3])(BYTE *dp, int dpitch, const BYTE *sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength);
#else
void (*cleanf[3])(BYTE *dp, int dpitch, const BYTE *sp, int spitch, int hblocks, int remainder, int incpitch, int height);
#endif
private:
PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env)
{
PVideoFrame sf = child->GetFrame(n, env);
#ifdef MODIFYPLUGIN
PVideoFrame of = oclip->GetFrame(n, env);
#endif
PVideoFrame df = env->NewVideoFrame(vi);
int i = planes;
do
{
BYTE* dp = GetWritePtr(df, i);
int dpitch = GetPitch(df, i);
#ifdef MODIFYPLUGIN
int opitch = GetPitch(of, i);
// copy the plane of the filtered clip (sf) to dp
env->BitBlt(dp, dpitch, GetReadPtr(sf, i), GetPitch(sf, i), width[i], height[i]);
cleanf[i](dp + dpitch, dpitch, GetReadPtr(of, i) + opitch, opitch, hblocks[i], remainder[i], incpitch[i], height2[i]);
#else // MODIFYPLUGIN
const BYTE* sp = GetReadPtr(sf, i);
int spitch = GetPitch(sf, i);
// copy the first line
memcpy(dp, sp, width[i]);
dp += dpitch;
sp += spitch;
#ifdef SHARPEN
cleanf[i](dp, dpitch, sp, spitch, hblocks[i], remainder[i], incpitch[i], height2[i], strength[i]);
#else
cleanf[i](dp, dpitch, sp, spitch, hblocks[i], remainder[i], incpitch[i], height2[i]);
#endif
// copy the last line
memcpy(dp + height2[i] * dpitch, sp + height2[i] * spitch, width[i]);
#endif // MODIFYPLUGIN
} while( --i >= 0 );
SSE_EMMS
return df;
}
public:
#ifdef MODIFYPLUGIN
RemoveGrain(PClip clip, PClip _oclip, int *mode, bool planar) : GenericVideoFilter(clip), PlanarAccess(vi), oclip(_oclip)
#elif defined(SHARPEN)
RemoveGrain(PClip clip, int *mode, int *_strength, bool planar) : GenericVideoFilter(clip), PlanarAccess(vi)
#else
RemoveGrain(PClip clip, int *mode, bool planar) : GenericVideoFilter(clip), PlanarAccess(vi)
#endif
{
if( vi.IsYV12() + planar == 0 )
#ifdef MODIFYPLUGIN
AVSenvironment->ThrowError("Repair: only planar color spaces are supported");
CompareVideoInfo(vi, oclip->GetVideoInfo(), "Repair");
oclip->SetCacheHints(CACHE_NOTHING, 0);
#else
AVSenvironment->ThrowError("RemoveGrain: only planar color spaces are supported");
#endif
child->SetCacheHints(CACHE_NOTHING, 0);
if( mode[2] < 0 )
{
planes--;
if( mode[1] < 0 ) planes--;
}
if( mode[1] < 0 ) mode[1] = 0;
int i = planes;
do
{
#ifdef SHARPEN
strength[i] = (_strength[i] > MAXSTRENGTH ? MAXSTRENGTH : _strength[i]) * SSE_INCREMENT;
#endif
if( mode[i] > MAXMODE ) AVSenvironment->ThrowError("RemoveGrain: invalid mode %u", mode[i]);
if( mode[i] < 0 ) cleanf[i] = do_nothing;
else cleanf[i] = cleaning_methods[mode[i]];
height2[i] = height[i] - 2;
incpitch[i] = (SSE_INCREMENT + 2) - width[i];
#ifdef MODIFYPLUGIN
unsigned w = width[i] - 3;
#else
unsigned w = width[i] - 3 - SSE_INCREMENT;
#endif
hblocks[i] = w / SSE_INCREMENT;
remainder[i] = (w & (SSE_INCREMENT - 1)) - (SSE_INCREMENT - 1);
//debug_printf("hblocks = %u, remainder = %i\n", hblocks[i], remainder[i]);
} while( --i >= 0 );
if( (hblocks[planes] <= 0) || (height2[planes] <= 0) )
AVSenvironment->ThrowError("RemoveGrain: the width or height of the clip is too small");
}
//~RemoveGrain(){}
};
AVSValue __cdecl CreateRemoveGrain(AVSValue args, void* user_data, IScriptEnvironment* env)
{
#ifdef MODIFYPLUGIN
enum ARGS { CLIP, OCLIP, MY, MU, MV, PLANAR};
#elif defined(SHARPEN)
enum ARGS { CLIP, MY, MU, MV, SY, SU, SV, PLANAR};
#else
enum ARGS { CLIP, MY, MU, MV, PLANAR};
#endif
int mode[3];
mode[0] = args[MY].AsInt(DEFAULT_MODE);
mode[1] = args[MU].AsInt(mode[0]);
mode[2] = args[MV].AsInt(mode[1]);
#ifdef SHARPEN
int strength[3];
strength[0] = args[SY].AsInt(DEFAULT_STRENGTH);
strength[1] = args[SU].AsInt(strength[0]);
strength[2] = args[SV].AsInt(strength[1]);
#endif
#ifdef MODIFYPLUGIN
return new RemoveGrain(args[CLIP].AsClip(), args[OCLIP].AsClip(), mode, args[PLANAR].AsBool(false));
#elif defined(SHARPEN)
return new RemoveGrain(args[CLIP].AsClip(), mode, strength, args[PLANAR].AsBool(false));
#else
return new RemoveGrain(args[CLIP].AsClip(), mode, args[PLANAR].AsBool(false));
#endif
}
#ifdef MODIFYPLUGIN
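// RepairPixel: clamp the pixel of the filtered clip (src1) into the min/max
// range spanned by the original pixel (src2) and its temporal neighbours
// (previous, next)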
#if ISSE > 1
#define RepairPixel(dest, src1, src2, previous, next, reg1, reg2, reg3, reg4) \
__asm SSE3_MOVE reg1, next \
__asm SSE3_MOVE reg3, previous \
__asm SSE_RMOVE reg2, reg1 \
__asm SSE3_MOVE reg4, src2 \
__asm pminub reg1, reg3 \
__asm pmaxub reg2, reg3 \
__asm pminub reg1, reg4 \
__asm SSE3_MOVE reg3, src1 \
__asm pmaxub reg2, reg4 \
__asm pmaxub reg1, reg3 \
__asm pminub reg1, reg2 \
__asm SSE_MOVE dest, reg1
#else
#define RepairPixel(dest, src1, src2, previous, next, reg1, reg2, reg3, reg4) \
__asm SSE3_MOVE reg1, next \
__asm SSE3_MOVE reg3, previous \
__asm SSE_RMOVE reg2, reg1 \
__asm SSE3_MOVE reg4, src2 \
__asm pminub reg1, reg3 \
__asm pmaxub reg2, reg3 \
__asm pminub reg1, reg4 \
__asm pmaxub reg2, reg4 \
__asm pmaxub reg1, src1 \
__asm pminub reg1, reg2 \
__asm SSE_MOVE dest, reg1
#endif
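// temporal_repair: each row is processed in SSE_INCREMENT wide blocks; the
// final, possibly overlapping block is handled separately via the remainder
// offset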
static void temporal_repair(BYTE *dp, int dpitch, const BYTE *sp1, int spitch1, const BYTE *sp2, int spitch2, const BYTE *pp, int ppitch, const BYTE *np, int npitch, int width, int height)
{
int blocks = --width / SSE_INCREMENT;
int remainder = (width & (SSE_INCREMENT - 1)) - (SSE_INCREMENT - 1);
width -= SSE_INCREMENT - 1;
dpitch -= width;
spitch1 -= width;
spitch2 -= width;
ppitch -= width;
npitch -= width;
__asm mov ebx, pp
__asm mov edx, sp1
__asm mov esi, sp2
__asm mov edi, dp
__asm mov eax, np
__asm mov ecx, blocks
__asm align 16
__asm _loop:
RepairPixel([edi], [edx], [esi], [ebx], [eax], SSE0, SSE1, SSE2, SSE3)
__asm add eax, SSE_INCREMENT
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT
__asm add edx, SSE_INCREMENT
__asm add ebx, SSE_INCREMENT
__asm loop _loop
// the last pixels
__asm add esi, remainder
__asm add edi, remainder
__asm mov ecx, blocks
__asm add ebx, remainder
__asm add eax, remainder
RepairPixel([edi], [edx], [esi], [ebx], [eax], SSE0, SSE1, SSE2, SSE3)
__asm add esi, spitch2
__asm add edi, dpitch
__asm add edx, spitch1
__asm add ebx, ppitch
__asm add eax, npitch
__asm dec height
__asm jnz _loop
}
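// BRepairPixel (used for mode 4): builds a tighter range around src2 from the
// temporal min/max (roughly min + 2*(src2 - min) .. max - 2*(max - src2),
// clipped to [min, max]) and clamps src1 into it; where that range collapses
// onto min or max, the original pixel src2 is passed through unchanged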
#if ISSE > 1
#define BRepairPixel(dest, src1, src2, previous, next, reg1, reg2, reg3, reg4, reg5, reg6) \
__asm SSE3_MOVE reg1, next \
__asm SSE3_MOVE reg3, previous \
__asm SSE_RMOVE reg2, reg1 \
__asm SSE3_MOVE reg4, src2 \
__asm pmaxub reg2, reg3 \
__asm SSE_RMOVE reg5, reg4 \
__asm pminub reg1, reg3 \
__asm SSE_RMOVE reg6, reg2 \
__asm psubusb reg5, reg1 \
__asm psubusb reg6, reg4 \
__asm SSE_RMOVE reg3, reg2 \
__asm paddusb reg5, reg5 \
__asm paddusb reg6, reg6 \
__asm paddusb reg5, reg1 \
__asm psubusb reg3, reg6 \
__asm pminub reg5, reg2 \
__asm pmaxub reg3, reg1 \
__asm SSE3_MOVE reg6, src1 \
__asm pcmpeqb reg1, reg5 \
__asm pcmpeqb reg2, reg3 \
__asm pminub reg5, reg6 \
__asm pmaxub reg1, reg2 \
__asm pmaxub reg5, reg3 \
__asm pminub reg4, reg1 \
__asm psubusb reg5, reg1 \
__asm pmaxub reg4, reg5 \
__asm SSE_MOVE dest, reg4
#else
#define BRepairPixel(dest, src1, src2, previous, next, reg1, reg2, reg3, reg4, reg5, reg6) \
__asm SSE3_MOVE reg1, next \
__asm SSE3_MOVE reg3, previous \
__asm SSE_RMOVE reg2, reg1 \
__asm SSE3_MOVE reg4, src2 \
__asm pmaxub reg2, reg3 \
__asm SSE_RMOVE reg5, reg4 \
__asm pminub reg1, reg3 \
__asm SSE_RMOVE reg6, reg2 \
__asm psubusb reg5, reg1 \
__asm psubusb reg6, reg4 \
__asm SSE_RMOVE reg3, reg2 \
__asm paddusb reg5, reg5 \
__asm paddusb reg6, reg6 \
__asm paddusb reg5, reg1 \
__asm psubusb reg3, reg6 \
__asm pminub reg5, reg2 \
__asm pmaxub reg3, reg1 \
__asm pcmpeqb reg1, reg5 \
__asm pcmpeqb reg2, reg3 \
__asm pminub reg5, src1 \
__asm pmaxub reg1, reg2 \
__asm pmaxub reg5, reg3 \
__asm pminub reg4, reg1 \
__asm psubusb reg5, reg1 \
__asm pmaxub reg4, reg5 \
__asm SSE_MOVE dest, reg4
#endif // ISSE > 1
static void btemporal_repair(BYTE *dp, int dpitch, const BYTE *sp1, int spitch1, const BYTE *sp2, int spitch2, const BYTE *pp, int ppitch, const BYTE *np, int npitch, int width, int height)
{
int blocks = --width / SSE_INCREMENT;
int remainder = (width & (SSE_INCREMENT - 1)) - (SSE_INCREMENT - 1);
width -= SSE_INCREMENT - 1;
dpitch -= width;
spitch1 -= width;
spitch2 -= width;
ppitch -= width;
npitch -= width;
__asm mov ebx, pp
__asm mov edx, sp1
__asm mov esi, sp2
__asm mov edi, dp
__asm mov eax, np
__asm mov ecx, blocks
__asm align 16
__asm _loop:
BRepairPixel([edi], [edx], [esi], [ebx], [eax], SSE0, SSE1, SSE2, SSE3, SSE4, SSE5)
__asm add eax, SSE_INCREMENT
__asm add esi, SSE_INCREMENT
__asm add edi, SSE_INCREMENT
__asm add edx, SSE_INCREMENT
__asm add ebx, SSE_INCREMENT
__asm loop _loop
// the last pixels
__asm add esi, remainder
__asm add edi, remainder
__asm mov ecx, blocks
__asm add ebx, remainder
__asm add eax, remainder
BRepairPixel([edi], [edx], [esi], [ebx], [eax], SSE0, SSE1, SSE2, SSE3, SSE4, SSE5)
__asm add esi, spitch2
__asm add edi, dpitch
__asm add edx, spitch1
__asm add ebx, ppitch
__asm add eax, npitch
__asm dec height
__asm jnz _loop
}
class TemporalRepair : public GenericVideoFilter, public PlanarAccess
{
void (*trepair)(BYTE *dp, int dpitch, const BYTE *sp1, int spitch1, const BYTE *sp2, int spitch2, const BYTE *pp, int ppitch, const BYTE *np, int npitch, int width, int height);
unsigned last_frame;
PClip orig;
PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env)
{
if( ((unsigned)(n - 1) >= last_frame) ) return child->GetFrame(n, env);
PVideoFrame pf = orig->GetFrame(n - 1, env);
PVideoFrame sf = orig->GetFrame(n, env);
PVideoFrame nf = orig->GetFrame(n + 1, env);
PVideoFrame cf = child->GetFrame(n, env);
PVideoFrame df = env->NewVideoFrame(vi);
int i = planes;
do
{
trepair(GetWritePtr(df, i), GetPitch(df, i), GetReadPtr(cf, i), GetPitch(cf, i), GetReadPtr(sf, i), GetPitch(sf, i), GetReadPtr(pf, i), GetPitch(pf, i), GetReadPtr(nf, i), GetPitch(nf, i), width[i], height[i]);
} while( --i >= 0 );
SSE_EMMS
return df;
}
public:
TemporalRepair(PClip clip, PClip oclip, int mode, bool grey, bool planar) : GenericVideoFilter(clip), PlanarAccess(vi, planar && grey), orig(oclip)
{
CompareVideoInfo(vi, orig->GetVideoInfo(), "TemporalRepair");
child->SetCacheHints(CACHE_RANGE, 0);
orig->SetCacheHints(CACHE_RANGE, 2);
trepair = mode ? btemporal_repair : temporal_repair;
last_frame = vi.num_frames - 2;
if( (int) last_frame < 0 ) last_frame = 0;
if( grey ) planes = 0;
}
//~TemporalRepair(){}
};
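// get_lu: upper = amount by which max(previous, next) exceeds current,
// lower = amount by which current exceeds min(previous, next),
// both as saturated byte differences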
#define get_lu(lower, upper, previous, current, next, reg1, reg2) \
__asm SSE3_MOVE upper, next \
__asm SSE3_MOVE reg1, previous \
__asm SSE_RMOVE reg2, upper \
__asm SSE3_MOVE lower, current \
__asm pmaxub upper, reg1 \
__asm pminub reg2, reg1 \
__asm psubusb upper, lower \
__asm psubusb lower, reg2
#if ISSE > 1
#define SmoothTRepair(dest, lower, upper, previous, current, next, reg1, reg2) \
__asm SSE3_MOVE reg1, current \
__asm SSE3_MOVE reg2, previous \
__asm paddusb upper, reg1 \
__asm psubusb reg1, lower \
__asm pmaxub upper, reg2 \
__asm SSE3_MOVE lower, next \
__asm pminub reg1, reg2 \
__asm pmaxub upper, lower \
__asm SSE3_MOVE reg2, dest \
__asm pminub reg1, lower \
__asm pminub upper, reg2 \
__asm pmaxub upper, reg1 \
__asm SSE_MOVE dest, upper
#else
#define SmoothTRepair(dest, lower, upper, previous, current, next, reg1, reg2) \
__asm SSE3_MOVE reg1, current \
__asm SSE3_MOVE reg2, previous \
__asm paddusb upper, reg1 \
__asm psubusb reg1, lower \
__asm pmaxub upper, reg2 \
__asm SSE3_MOVE lower, next \
__asm pminub reg1, reg2 \
__asm pmaxub upper, lower \
__asm pminub reg1, lower \
__asm pminub upper, dest \
__asm pmaxub upper, reg1 \
__asm SSE_MOVE dest, upper
#endif
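// smooth_temporal_repair1: the get_lu deviations are accumulated over the eight
// spatial neighbours in the previous/current/next frames and their maxima are
// fed to SmoothTRepair, which clamps the pixel of the clip being repaired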
void smooth_temporal_repair1(BYTE *dp, const BYTE *previous, const BYTE *_sp, const BYTE *next, int pitch, int hblocks, int height, int remainder)
{
__asm mov eax, hblocks
__asm mov ecx, eax
__asm mov edx, previous
__asm mov esi, _sp
__asm shl eax, SSE_SHIFT
__asm mov edi, dp
__asm add eax, remainder
__asm mov ebx, pitch
__asm sub pitch, eax
__asm lea edi, [edi + ebx + 1]
__asm mov eax, next
__asm align 16
__asm middle_loop:
get_lu(SSE0, SSE1, [edx], [esi], [eax], SSE6, SSE7)
get_lu(SSE2, SSE3, [edx + 1], [esi + 1], [eax + 1], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + 2], [esi + 2], [eax + 2], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + 2*ebx], [esi + 2*ebx], [eax + 2*ebx], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + 2*ebx + 1], [esi + 2*ebx + 1], [eax + 2*ebx + 1], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + 2*ebx + 2], [esi + 2*ebx + 2], [eax + 2*ebx + 2], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + ebx], [esi + ebx], [eax + ebx], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + ebx + 2], [esi + ebx + 2], [eax + ebx + 2], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
SmoothTRepair([edi], SSE0, SSE1, [edx + ebx + 1], [esi + ebx + 1], [eax + ebx + 1], SSE6, SSE7)
__asm add esi, SSE_INCREMENT
__asm add edx, SSE_INCREMENT
__asm add eax, SSE_INCREMENT
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, remainder
__asm add edx, remainder
__asm add eax, remainder
__asm add edi, remainder
get_lu(SSE0, SSE1, [edx], [esi], [eax], SSE6, SSE7)
get_lu(SSE2, SSE3, [edx + 1], [esi + 1], [eax + 1], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + 2], [esi + 2], [eax + 2], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + 2*ebx], [esi + 2*ebx], [eax + 2*ebx], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + 2*ebx + 1], [esi + 2*ebx + 1], [eax + 2*ebx + 1], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + 2*ebx + 2], [esi + 2*ebx + 2], [eax + 2*ebx + 2], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + ebx], [esi + ebx], [eax + ebx], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + ebx + 2], [esi + ebx + 2], [eax + ebx + 2], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
SmoothTRepair([edi], SSE0, SSE1, [edx + ebx + 1], [esi + ebx + 1], [eax + ebx + 1], SSE6, SSE7)
__asm add esi, pitch
__asm add edx, pitch
__asm add eax, pitch
__asm add edi, pitch
__asm dec height
__asm mov ecx, hblocks
__asm jnz middle_loop
}
#ifdef SMOOTH2
#define get_lu_reg(lower, upper, previous, current, next, reg1, reg2) \
__asm SSE3_MOVE upper, next \
__asm SSE3_MOVE reg1, previous \
__asm SSE_RMOVE reg2, upper \
__asm SSE_RMOVE lower, current \
__asm pmaxub upper, reg1 \
__asm pminub reg2, reg1 \
__asm psubusb upper, lower \
__asm psubusb lower, reg2
#if ISSE > 1
#define SmoothTRepair2(dest, lower, upper, previous, current, next, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg1, current \
get_lu_reg(reg4, reg5, previous, reg1, next, reg2, reg3) \
__asm pmaxub upper, reg5 \
__asm pmaxub lower, reg4 \
__asm SSE_RMOVE reg2, reg1 \
__asm pmaxub upper, lower \
__asm SSE3_MOVE reg3, dest \
__asm paddusb reg1, upper \
__asm psubusb reg2, upper \
__asm pminub reg1, reg3 \
__asm pmaxub reg1, reg2 \
__asm SSE_MOVE dest, reg1
#else
#define SmoothTRepair2(dest, lower, upper, previous, current, next, reg1, reg2, reg3, reg4, reg5) \
__asm SSE3_MOVE reg1, current \
get_lu_reg(reg4, reg5, previous, reg1, next, reg2, reg3) \
__asm pmaxub upper, reg5 \
__asm pmaxub lower, reg4 \
__asm SSE_RMOVE reg2, reg1 \
__asm pmaxub upper, lower \
__asm paddusb reg1, upper \
__asm psubusb reg2, upper \
__asm pminub reg1, dest \
__asm pmaxub reg1, reg2 \
__asm SSE_MOVE dest, reg1
#endif
void smooth_temporal_repair2(BYTE *dp, const BYTE *previous, const BYTE *_sp, const BYTE *next, int pitch, int hblocks, int height, int remainder)
{
__asm mov eax, hblocks
__asm mov ecx, eax
__asm mov edx, previous
__asm mov esi, _sp
__asm shl eax, SSE_SHIFT
__asm mov edi, dp
__asm add eax, remainder
__asm mov ebx, pitch
__asm sub pitch, eax
__asm lea edi, [edi + ebx + 1]
__asm mov eax, next
__asm align 16
__asm middle_loop:
get_lu(SSE0, SSE1, [edx], [esi], [eax], SSE6, SSE7)
get_lu(SSE2, SSE3, [edx + 1], [esi + 1], [eax + 1], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + 2], [esi + 2], [eax + 2], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + 2*ebx], [esi + 2*ebx], [eax + 2*ebx], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + 2*ebx + 1], [esi + 2*ebx + 1], [eax + 2*ebx + 1], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + 2*ebx + 2], [esi + 2*ebx + 2], [eax + 2*ebx + 2], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + ebx], [esi + ebx], [eax + ebx], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + ebx + 2], [esi + ebx + 2], [eax + ebx + 2], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
SmoothTRepair2([edi], SSE0, SSE1, [edx + ebx + 1], [esi + ebx + 1], [eax + ebx + 1], SSE4, SSE5, SSE6, SSE7, SSE3)
__asm add esi, SSE_INCREMENT
__asm add edx, SSE_INCREMENT
__asm add eax, SSE_INCREMENT
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, remainder
__asm add edx, remainder
__asm add eax, remainder
__asm add edi, remainder
get_lu(SSE0, SSE1, [edx], [esi], [eax], SSE6, SSE7)
get_lu(SSE2, SSE3, [edx + 1], [esi + 1], [eax + 1], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + 2], [esi + 2], [eax + 2], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + 2*ebx], [esi + 2*ebx], [eax + 2*ebx], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + 2*ebx + 1], [esi + 2*ebx + 1], [eax + 2*ebx + 1], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + 2*ebx + 2], [esi + 2*ebx + 2], [eax + 2*ebx + 2], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + ebx], [esi + ebx], [eax + ebx], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
get_lu(SSE2, SSE3, [edx + ebx + 2], [esi + ebx + 2], [eax + ebx + 2], SSE6, SSE7)
__asm pmaxub SSE1, SSE3
__asm pmaxub SSE0, SSE2
SmoothTRepair2([edi], SSE0, SSE1, [edx + ebx + 1], [esi + ebx + 1], [eax + ebx + 1], SSE4, SSE5, SSE6, SSE7, SSE3)
__asm add esi, pitch
__asm add edx, pitch
__asm add eax, pitch
__asm add edi, pitch
__asm dec height
__asm mov ecx, hblocks
__asm jnz middle_loop
}
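// get2diff: pdiff = |current - previous|, ndiff = |current - next|, each built
// from a pair of saturated subtractions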
#define get2diff(pdiff, ndiff, previous, current, next, reg1, reg2, reg3) \
__asm SSE3_MOVE reg3, current \
__asm SSE3_MOVE pdiff, previous \
__asm SSE_RMOVE reg1, reg3 \
__asm SSE3_MOVE ndiff, next \
__asm SSE_RMOVE reg2, reg3 \
__asm psubusb reg1, pdiff \
__asm psubusb reg2, ndiff \
__asm psubusb ndiff, reg3 \
__asm psubusb pdiff, reg3 \
__asm pmaxub pdiff, reg1 \
__asm pmaxub ndiff, reg2
#if ISSE > 1
#define SmoothTRepair3(dest, pmax, nmax, previous, current, next, reg1, reg2, reg3, reg4, reg5) \
get2diff(reg4, reg5, previous, current, next, reg2, reg3, reg1) \
__asm pmaxub pmax, reg4 \
__asm pmaxub nmax, reg5 \
__asm SSE_RMOVE reg2, reg1 \
__asm pminub pmax, nmax \
__asm SSE3_MOVE reg3, dest \
__asm paddusb reg1, pmax \
__asm psubusb reg2, pmax \
__asm pminub reg1, reg3 \
__asm pmaxub reg1, reg2 \
__asm SSE_MOVE dest, reg1
#else
#define SmoothTRepair3(dest, pmax, nmax, previous, current, next, reg1, reg2, reg3, reg4, reg5) \
get2diff(reg4, reg5, previous, current, next, reg2, reg3, reg1) \
__asm pmaxub pmax, reg4 \
__asm pmaxub nmax, reg5 \
__asm SSE_RMOVE reg2, reg1 \
__asm pminub pmax, nmax \
__asm paddusb reg1, pmax \
__asm psubusb reg2, pmax \
__asm pminub reg1, dest \
__asm pmaxub reg1, reg2 \
__asm SSE_MOVE dest, reg1
#endif
void smooth_temporal_repair3(BYTE *dp, const BYTE *previous, const BYTE *_sp, const BYTE *next, int pitch, int hblocks, int height, int remainder)
{
__asm mov eax, hblocks
__asm mov ecx, eax
__asm mov edx, previous
__asm mov esi, _sp
__asm shl eax, SSE_SHIFT
__asm mov edi, dp
__asm add eax, remainder
__asm mov ebx, pitch
__asm sub pitch, eax
__asm lea edi, [edi + ebx + 1]
__asm mov eax, next
__asm align 16
__asm middle_loop:
get2diff(SSE0, SSE1, [edx], [esi], [eax], SSE5, SSE6, SSE7)
get2diff(SSE2, SSE3, [edx + 1], [esi + 1], [eax + 1], SSE5, SSE6, SSE7)
__asm pmaxub SSE0, SSE2
__asm pmaxub SSE1, SSE3
get2diff(SSE2, SSE3, [edx + 2], [esi + 2], [eax + 2], SSE5, SSE6, SSE7)
__asm pmaxub SSE0, SSE2
__asm pmaxub SSE1, SSE3
get2diff(SSE2, SSE3, [edx + 2*ebx], [esi + 2*ebx], [eax + 2*ebx], SSE5, SSE6, SSE7)
__asm pmaxub SSE0, SSE2
__asm pmaxub SSE1, SSE3
get2diff(SSE2, SSE3, [edx + 2*ebx + 1], [esi + 2*ebx + 1], [eax + 2*ebx + 1], SSE5, SSE6, SSE7)
__asm pmaxub SSE0, SSE2
__asm pmaxub SSE1, SSE3
get2diff(SSE2, SSE3, [edx + 2*ebx + 2], [esi + 2*ebx + 2], [eax + 2*ebx + 2], SSE5, SSE6, SSE7)
__asm pmaxub SSE0, SSE2
__asm pmaxub SSE1, SSE3
get2diff(SSE2, SSE3, [edx + ebx], [esi + ebx], [eax + ebx], SSE5, SSE6, SSE7)
__asm pmaxub SSE0, SSE2
__asm pmaxub SSE1, SSE3
get2diff(SSE2, SSE3, [edx + ebx + 2], [esi + ebx + 2], [eax + ebx + 2], SSE5, SSE6, SSE7)
__asm pmaxub SSE0, SSE2
__asm pmaxub SSE1, SSE3
SmoothTRepair3([edi], SSE0, SSE1, [edx + ebx + 1], [esi + ebx + 1], [eax + ebx + 1], SSE4, SSE5, SSE6, SSE7, SSE3)
__asm add esi, SSE_INCREMENT
__asm add edx, SSE_INCREMENT
__asm add eax, SSE_INCREMENT
__asm add edi, SSE_INCREMENT
__asm dec ecx
__asm jnz middle_loop
// the last pixels
__asm add esi, remainder
__asm add edx, remainder
__asm add eax, remainder
__asm add edi, remainder
get2diff(SSE0, SSE1, [edx], [esi], [eax], SSE5, SSE6, SSE7)
get2diff(SSE2, SSE3, [edx + 1], [esi + 1], [eax + 1], SSE5, SSE6, SSE7)
__asm pmaxub SSE0, SSE2
__asm pmaxub SSE1, SSE3
get2diff(SSE2, SSE3, [edx + 2], [esi + 2], [eax + 2], SSE5, SSE6, SSE7)
__asm pmaxub SSE0, SSE2
__asm pmaxub SSE1, SSE3
get2diff(SSE2, SSE3, [edx + 2*ebx], [esi + 2*ebx], [eax + 2*ebx], SSE5, SSE6, SSE7)
__asm pmaxub SSE0, SSE2
__asm pmaxub SSE1, SSE3
get2diff(SSE2, SSE3, [edx + 2*ebx + 1], [esi + 2*ebx + 1], [eax + 2*ebx + 1], SSE5, SSE6, SSE7)
__asm pmaxub SSE0, SSE2
__asm pmaxub SSE1, SSE3
get2diff(SSE2, SSE3, [edx + 2*ebx + 2], [esi + 2*ebx + 2], [eax + 2*ebx + 2], SSE5, SSE6, SSE7)
__asm pmaxub SSE0, SSE2
__asm pmaxub SSE1, SSE3
get2diff(SSE2, SSE3, [edx + ebx], [esi + ebx], [eax + ebx], SSE5, SSE6, SSE7)
__asm pmaxub SSE0, SSE2
__asm pmaxub SSE1, SSE3
get2diff(SSE2, SSE3, [edx + ebx + 2], [esi + ebx + 2], [eax + ebx + 2], SSE5, SSE6, SSE7)
__asm pmaxub SSE0, SSE2
__asm pmaxub SSE1, SSE3
SmoothTRepair3([edi], SSE0, SSE1, [edx + ebx + 1], [esi + ebx + 1], [eax + ebx + 1], SSE4, SSE5, SSE6, SSE7, SSE3)
__asm add esi, pitch
__asm add edx, pitch
__asm add eax, pitch
__asm add edi, pitch
__asm dec height
__asm mov ecx, hblocks
__asm jnz middle_loop
}
#endif // SMOOTH2
class SmoothTemporalRepair : public GenericVideoFilter, public PlanarAccess
{
HomogeneousChild oclip;
#ifdef SMOOTH2
void (*st_repair)(BYTE *dp, const BYTE *previous, const BYTE *_sp, const BYTE *next,int pitch, int hblocks, int height, int remainder);
#else
#define st_repair smooth_temporal_repair1
#endif
int height2[3], hblocks[3], remainder[3];
unsigned last_frame;
PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env)
{
if( ((unsigned)(n - 1) >= last_frame) ) return child->GetFrame(n, env);
PVideoFrame sf = child->GetFrame(n, env);
PVideoFrame pf = oclip->GetFrame(n - 1, env);
PVideoFrame of = oclip->GetFrame(n, env);
PVideoFrame nf = oclip->GetFrame(n + 1, env);
PVideoFrame df = env->NewVideoFrame(vi);
int i = planes;
do
{
BYTE* dp = GetWritePtr(df,i);
int pitch = GetPitch(df, i);
// copy the plane of the filtered clip (sf) to dp
env->BitBlt(dp, pitch, GetReadPtr(sf, i), pitch, width[i], height[i]);
st_repair(dp, GetReadPtr(pf, i), GetReadPtr(of, i), GetReadPtr(nf, i), pitch, hblocks[i], height2[i], remainder[i]);
} while( --i >= 0 );
SSE_EMMS
return df;
}
public:
SmoothTemporalRepair(PClip clip, PClip _oclip, int mode, bool grey, bool planar, IScriptEnvironment* env) : GenericVideoFilter(clip), PlanarAccess(vi), oclip(_oclip, grey, env)
{
if( vi.IsYV12() + planar == 0 ) AVSenvironment->ThrowError("TemporalRepair: only planar color spaces are supported");
CompareVideoInfo(vi, _oclip->GetVideoInfo(), "TemporalRepair");
_oclip->SetCacheHints(CACHE_RANGE, 2);
child->SetCacheHints(CACHE_NOTHING, 0);
#ifdef SMOOTH2
switch( mode )
{
case 1 :
st_repair = smooth_temporal_repair1;
break;
case 2 :
st_repair = smooth_temporal_repair2;
break;
default :
st_repair = smooth_temporal_repair3;
}
#endif // SMOOTH2
if( grey ) planes = 0;
last_frame = vi.num_frames - 2;
if( (int) last_frame < 0 ) last_frame = 0;
int i = planes;
do
{
height2[i] = height[i] - 2;
// unsigned w = width[i] - 1 - 2*smooth;
unsigned w = width[i] - 3;
hblocks[i] = w / SSE_INCREMENT;
remainder[i] = (w & (SSE_INCREMENT - 1)) - (SSE_INCREMENT - 1);
} while( --i >= 0 );
if( (hblocks[planes] <= 0) || (height2[planes] <= 0) )
AVSenvironment->ThrowError("TemporalRepair: the width or height of the clip is too small");
}
//~SmoothTemporalRepair(){}
};
#define MAXTMODE 4
bool spatial[MAXTMODE + 1] = {false, true, true, true, false };
AVSValue __cdecl CreateTemporalRepair(AVSValue args, void* user_data, IScriptEnvironment* env)
{
enum ARGS { CLIP, OCLIP, MODE, SMOOTH, GREY, PLANAR };
PClip clip = args[CLIP].AsClip();
PClip oclip = args[OCLIP].AsClip();
bool grey = args[GREY].AsBool(false);
int mode = args[MODE].AsInt(args[SMOOTH].AsInt(0));
if( (unsigned) mode > MAXTMODE ) env->ThrowError("TemporalRepair: illegal mode %i", mode);
bool planar = args[PLANAR].AsBool(false);
return spatial[mode] ? (AVSValue) new SmoothTemporalRepair(clip, oclip, mode, grey, planar, env)
: (AVSValue) new TemporalRepair(clip, oclip, mode, grey, planar);
}
#else // MODIFYPLUGIN
class GenericClense : public GenericVideoFilter, public PlanarAccess
{
protected:
int hblocks[3];
int remainder[3];
int incpitch[3];
public:
GenericClense(PClip clip, bool grey, bool planar);
};
GenericClense::GenericClense(PClip clip, bool grey, bool planar) : GenericVideoFilter(clip), PlanarAccess(vi, planar && grey)
{
if( grey ) planes = 0;
int i = planes;
do
{
int w = width[i];
hblocks[i] = --w / (2*SSE_INCREMENT);
remainder[i] = (w & (2*SSE_INCREMENT - 1)) - (2*SSE_INCREMENT - 1);
incpitch[i] = 2*SSE_INCREMENT - width[i] + remainder[i];
} while( --i >= 0 );
}
#if SHARPEN == 1
// only sharpen
#define simplesharpen(center, min, max, reg1, reg2)\
__asm SSE_RMOVE reg2, center \
__asm psubusb max, center \
__asm psubusb reg2, min \
__asm SSE_RMOVE reg1, max \
__asm SSE_RMOVE min, reg2 \
__asm psubusb max, reg2 \
__asm psubusb min, reg1 \
__asm pminub reg2, max \
__asm pminub reg1, min \
__asm psubusb center, reg2 \
__asm paddusb center, reg1
#elif SHARPEN == 2
#define simplesharpen(center, min, max, reg1, reg2)\
__asm pminub center, max \
__asm pmaxub center, min \
__asm SSE_RMOVE reg2, center \
__asm psubusb max, center \
__asm psubusb reg2, min \
__asm SSE_RMOVE reg1, max \
__asm SSE_RMOVE min, reg2 \
__asm psubusb max, reg2 \
__asm psubusb min, reg1 \
__asm pminub reg2, max \
__asm pminub reg1, min \
__asm psubusb center, reg2 \
__asm paddusb center, reg1
#endif // SHARPEN == 2
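// Clense kernel: every pixel of the current frame is constrained by the minimum
// and maximum of the corresponding pixels in the previous and next frame;
// without SHARPEN this is a plain min/max clip, with SHARPEN the simplesharpen
// macro above is applied using that range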
#if ISSE > 1
static inline void aligned_clense(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, const BYTE *pp, int ppitch, const BYTE *np, int npitch, int hblocks, int remainder, int incpitch, int height)
#else
static void clense(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, const BYTE *pp, int ppitch, const BYTE *np, int npitch, int hblocks, int remainder, int incpitch, int height)
#endif
#ifdef SHARPEN
#define AClensePixel(daddr, saddr, paddr, naddr, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8) \
__asm SSE_RMOVE reg1, [naddr] \
__asm SSE_RMOVE reg2, [naddr + SSE_INCREMENT] \
__asm SSE_RMOVE reg3, reg1 \
__asm SSE_RMOVE reg5, [paddr] \
__asm SSE_RMOVE reg4, reg2 \
__asm SSE_RMOVE reg6, [paddr + SSE_INCREMENT] \
__asm pminub reg1, reg5 \
__asm pminub reg2, reg6 \
__asm SSE_RMOVE reg7, [saddr] \
__asm pmaxub reg3, reg5 \
__asm pmaxub reg4, reg6 \
__asm SSE_RMOVE reg8, [saddr + SSE_INCREMENT] \
simplesharpen(reg7, reg1, reg3, reg5, reg6) \
simplesharpen(reg8, reg2, reg4, reg5, reg6) \
__asm SSE_RMOVE [daddr], reg7 \
__asm SSE_RMOVE [daddr + SSE_INCREMENT], reg8
#define UClensePixel(daddr, saddr, paddr, naddr, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8) \
__asm SSE3_MOVE reg1, [naddr] \
__asm SSE3_MOVE reg2, [naddr + SSE_INCREMENT] \
__asm SSE_RMOVE reg3, reg1 \
__asm SSE3_MOVE reg5, [paddr] \
__asm SSE_RMOVE reg4, reg2 \
__asm SSE3_MOVE reg6, [paddr + SSE_INCREMENT] \
__asm pminub reg1, reg5 \
__asm pminub reg2, reg6 \
__asm SSE3_MOVE reg7, [saddr] \
__asm pmaxub reg3, reg5 \
__asm pmaxub reg4, reg6 \
__asm SSE3_MOVE reg8, [saddr + SSE_INCREMENT] \
simplesharpen(reg7, reg1, reg3, reg5, reg6) \
simplesharpen(reg8, reg2, reg4, reg5, reg6) \
__asm SSE_MOVE [daddr], reg7 \
__asm SSE_MOVE [daddr + SSE_INCREMENT], reg8
#else // SHARPEN
#define AClensePixel(daddr, saddr, paddr, naddr, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8) \
__asm SSE_RMOVE reg1, [naddr] \
__asm SSE_RMOVE reg2, [naddr + SSE_INCREMENT] \
__asm SSE_RMOVE reg3, reg1 \
__asm SSE_RMOVE reg5, [paddr] \
__asm SSE_RMOVE reg4, reg2 \
__asm SSE_RMOVE reg6, [paddr + SSE_INCREMENT] \
__asm pminub reg1, reg5 \
__asm pminub reg2, reg6 \
__asm pmaxub reg1, [saddr] \
__asm pmaxub reg3, reg5 \
__asm pmaxub reg4, reg6 \
__asm pmaxub reg2, [saddr + SSE_INCREMENT] \
__asm pminub reg1, reg3 \
__asm pminub reg2, reg4 \
__asm SSE_RMOVE [daddr], reg1 \
__asm SSE_RMOVE [daddr + SSE_INCREMENT], reg2
#if ISSE > 1
#define UClensePixel(daddr, saddr, paddr, naddr, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8) \
__asm SSE3_MOVE reg1, [naddr] \
__asm SSE3_MOVE reg2, [naddr + SSE_INCREMENT] \
__asm SSE_RMOVE reg3, reg1 \
__asm SSE3_MOVE reg5, [paddr] \
__asm SSE_RMOVE reg4, reg2 \
__asm SSE3_MOVE reg6, [paddr + SSE_INCREMENT] \
__asm pminub reg1, reg5 \
__asm pminub reg2, reg6 \
__asm SSE3_MOVE reg7, [saddr] \
__asm pmaxub reg3, reg5 \
__asm pmaxub reg4, reg6 \
__asm SSE3_MOVE reg8, [saddr + SSE_INCREMENT] \
__asm pmaxub reg1, reg7 \
__asm pmaxub reg2, reg8 \
__asm pminub reg1, reg3 \
__asm pminub reg2, reg4 \
__asm SSE_MOVE [daddr], reg1 \
__asm SSE_MOVE [daddr + SSE_INCREMENT], reg2
#else
#define UClensePixel(daddr, saddr, paddr, naddr, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8) \
AClensePixel(daddr, saddr, paddr, naddr, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8)
#endif
#endif // SHARPEN
{
__asm mov eax, incpitch
__asm mov ebx, pp
__asm add dpitch, eax
__asm add spitch, eax
__asm add ppitch, eax
__asm add npitch, eax
__asm mov esi, _sp
__asm mov edi, dp
__asm mov edx, remainder
__asm mov eax, np
__asm mov ecx, hblocks
__asm align 16
__asm _loop:
AClensePixel(edi, esi, ebx, eax, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6, SSE7)
__asm add eax, 2*SSE_INCREMENT
__asm add esi, 2*SSE_INCREMENT
__asm add edi, 2*SSE_INCREMENT
__asm add ebx, 2*SSE_INCREMENT
#if defined(SHARPEN) && (ISSE > 1)
__asm dec ecx
__asm jnz _loop
#else
__asm loop _loop
#endif
// the last pixels
UClensePixel(edi + edx, esi + edx, ebx + edx, eax + edx, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6, SSE7)
__asm add esi, spitch
__asm add edi, dpitch
__asm add ebx, ppitch
__asm add eax, npitch
__asm dec height
__asm mov ecx, hblocks
__asm jnz _loop
}
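#if 0
// Reference sketch only (never compiled): my scalar reading of the plain
// (non-SHARPEN) AClensePixel / UClensePixel kernels above, assuming 8-bit
// planar samples. Each output pixel is the source pixel clamped to the range
// spanned by the co-sited pixels of the previous and next frames.
static BYTE clense_pixel_c(BYTE src, BYTE prev, BYTE next)
{
    BYTE lo = prev < next ? prev : next;    // pminub
    BYTE hi = prev < next ? next : prev;    // pmaxub
    if( src < lo ) return lo;               // pmaxub against the source
    if( src > hi ) return hi;               // pminub against the upper bound
    return src;
}
#endif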
#if ISSE > 1
static inline void unaligned_clense(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, const BYTE *pp, int ppitch, const BYTE *np, int npitch, int hblocks, int remainder, int incpitch, int height)
{
__asm mov eax, incpitch
__asm mov ebx, pp
__asm add dpitch, eax
__asm add spitch, eax
__asm add ppitch, eax
__asm add npitch, eax
__asm mov esi, _sp
__asm mov edi, dp
__asm mov edx, remainder
__asm mov eax, np
__asm mov ecx, hblocks
__asm align 16
__asm _loop:
UClensePixel(edi, esi, ebx, eax, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6, SSE7)
__asm add eax, 2*SSE_INCREMENT
__asm add esi, 2*SSE_INCREMENT
__asm add edi, 2*SSE_INCREMENT
__asm add ebx, 2*SSE_INCREMENT
#if defined(SHARPEN)
__asm dec ecx
__asm jnz _loop
#else
__asm loop _loop
#endif
// the last pixels
UClensePixel(edi + edx, esi + edx, ebx + edx, eax + edx, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6, SSE7)
__asm add esi, spitch
__asm add edi, dpitch
__asm add ebx, ppitch
__asm add eax, npitch
__asm dec height
__asm mov ecx, hblocks
__asm jnz _loop
}
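// Dispatch: the aligned (SSE_RMOVE) path is taken only when every frame pointer
// (and, with ALIGNPITCH, every pitch) is a multiple of SSE_INCREMENT bytes;
// otherwise the unaligned-load variant is used.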
static void clense(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, const BYTE *pp, int ppitch, const BYTE *np, int npitch, int hblocks, int remainder, int incpitch, int height)
{
if( (((unsigned)dp & (SSE_INCREMENT - 1)) + ((unsigned)_sp & (SSE_INCREMENT - 1)) + ((unsigned)pp & (SSE_INCREMENT - 1)) + ((unsigned)np & (SSE_INCREMENT - 1))
#ifdef ALIGNPITCH
+ (spitch & (SSE_INCREMENT - 1)) + (ppitch & (SSE_INCREMENT - 1)) + (npitch & (SSE_INCREMENT - 1))
#endif
) == 0 ) aligned_clense(dp, dpitch, _sp, spitch, pp, ppitch, np, npitch, hblocks, remainder, incpitch, height);
else unaligned_clense(dp, dpitch, _sp, spitch, pp, ppitch, np, npitch, hblocks, remainder, incpitch, height);
}
#endif // ISSE > 1
class Clense : public GenericClense
{
PVideoFrame lframe;
unsigned lnr;
bool reduceflicker;
PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env)
{
if( !reduceflicker || (lnr != n-1) )
{
if( n == 0 ) return child->GetFrame(n, env);
lframe = child->GetFrame(n - 1, env);
}
PVideoFrame sf = child->GetFrame(n, env);
if( n >= vi.num_frames - 1 ) return sf;	// the last frame has no successor to clamp against
PVideoFrame nf = child->GetFrame(n + 1, env);
PVideoFrame df = env->NewVideoFrame(vi, 2*SSE_INCREMENT);
int i = planes;
do
{
clense(GetWritePtr(df, i), GetPitch(df, i), GetReadPtr(sf, i), GetPitch(sf, i), GetReadPtr(lframe, i), GetPitch(lframe, i), GetReadPtr(nf, i), GetPitch(nf, i), hblocks[i], remainder[i], incpitch[i], height[i]);
} while( --i >= 0 );
SSE_EMMS
lframe = df;
lnr = n;
return df;
}
public:
Clense(PClip clip, bool grey, bool _reduceflicker, bool planar, int cache)
: GenericClense(clip, grey, planar), lframe(0), lnr(-2), reduceflicker(_reduceflicker)
{
if( cache >= 0 ) child->SetCacheHints(CACHE_RANGE, cache);
}
//~Clense(){}
};
AVSValue __cdecl CreateClense(AVSValue args, void* user_data, IScriptEnvironment* env)
{
enum ARGS { CLIP, GREY, FLICKER, PLANAR, CACHE };
return new Clense(args[CLIP].AsClip(), args[GREY].AsBool(false), args[FLICKER].AsBool(true), args[PLANAR].AsBool(false), args[CACHE].AsInt(2));
};
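// Usage sketch (default RemoveGrain build, names as registered in AvisynthPluginInit2):
//   Clense(clip, grey=false, reduceflicker=true, planar=false, cache=2)
// With reduceflicker, the previously filtered frame rather than the raw previous
// frame is used as the past reference.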
class BMCClense : public GenericClense
{
PClip pclip, nclip;
PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env)
{
PVideoFrame pf = pclip->GetFrame(n, env);
PVideoFrame sf = child->GetFrame(n, env);
PVideoFrame nf = nclip->GetFrame(n, env);
PVideoFrame df = env->NewVideoFrame(vi, 2*SSE_INCREMENT);
int i = planes;
do
{
clense(GetWritePtr(df, i), GetPitch(df, i), GetReadPtr(sf, i), GetPitch(sf, i), GetReadPtr(pf, i), GetPitch(pf, i), GetReadPtr(nf, i), GetPitch(nf, i), hblocks[i], remainder[i], incpitch[i], height[i]);
} while( --i >= 0 );
SSE_EMMS
return df;
}
public:
BMCClense(PClip clip, PClip _pclip, PClip _nclip, bool grey, bool planar) : GenericClense(clip, grey, planar), pclip(_pclip), nclip(_nclip)
{
child->SetCacheHints(CACHE_RANGE, 0);
pclip->SetCacheHints(CACHE_RANGE, 0);
nclip->SetCacheHints(CACHE_RANGE, 0);
CompareVideoInfo(vi, pclip->GetVideoInfo(), "MCClense");
CompareVideoInfo(vi, nclip->GetVideoInfo(), "MCClense");
}
//~BMCClense(){}
};
AVSValue __cdecl CreateMCClense(AVSValue args, void* user_data, IScriptEnvironment* env)
{
return new BMCClense(args[0].AsClip(), args[1].AsClip(), args[2].AsClip(), args[3].AsBool(false), args[4].AsBool(false));
};
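// MCClense(clip, pclip, nclip, grey, planar): same clamping kernel, but the
// previous/next references are caller-supplied clips (presumably motion
// compensated), fetched at the same frame number as the source.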
#if ISSE > 1
static void aligned_sclense(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, const BYTE *pp, int ppitch, const BYTE *np, int npitch, int hblocks, int remainder, int incpitch, int height)
#else
static void sclense(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, const BYTE *pp, int ppitch, const BYTE *np, int npitch, int hblocks, int remainder, int incpitch, int height)
#endif
#define ASClensePixel(daddr, saddr, paddr, naddr, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8) \
__asm SSE_RMOVE reg1, [paddr] \
__asm SSE_RMOVE reg2, [paddr + SSE_INCREMENT] \
__asm SSE_RMOVE reg3, reg1 \
__asm SSE_RMOVE reg5, [naddr] \
__asm SSE_RMOVE reg4, reg2 \
__asm SSE_RMOVE reg6, [naddr + SSE_INCREMENT] \
__asm pminub reg1, reg5 \
__asm pminub reg2, reg6 \
__asm pmaxub reg3, reg5 \
__asm pmaxub reg4, reg6 \
__asm SSE_RMOVE reg7, reg3 \
__asm SSE_RMOVE reg8, reg4 \
__asm psubusb reg7, reg5 \
__asm psubusb reg8, reg6 \
__asm psubusb reg5, reg1 \
__asm psubusb reg6, reg2 \
__asm psubusb reg1, reg5 \
__asm psubusb reg2, reg6 \
__asm pmaxub reg1, [saddr] \
__asm paddusb reg3, reg7 \
__asm paddusb reg4, reg8 \
__asm pmaxub reg2, [saddr + SSE_INCREMENT] \
__asm pminub reg1, reg3 \
__asm pminub reg2, reg4 \
__asm SSE_RMOVE [daddr], reg1 \
__asm SSE_RMOVE [daddr + SSE_INCREMENT], reg2
#if ISSE > 1
#define USClensePixel(daddr, saddr, paddr, naddr, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8) \
__asm SSE3_MOVE reg1, [paddr] \
__asm SSE3_MOVE reg2, [paddr + SSE_INCREMENT] \
__asm SSE_RMOVE reg3, reg1 \
__asm SSE3_MOVE reg5, [naddr] \
__asm SSE_RMOVE reg4, reg2 \
__asm SSE3_MOVE reg6, [naddr + SSE_INCREMENT] \
__asm pminub reg1, reg5 \
__asm pminub reg2, reg6 \
__asm pmaxub reg3, reg5 \
__asm pmaxub reg4, reg6 \
__asm SSE_RMOVE reg7, reg3 \
__asm SSE_RMOVE reg8, reg4 \
__asm psubusb reg7, reg5 \
__asm psubusb reg8, reg6 \
__asm psubusb reg5, reg1 \
__asm psubusb reg6, reg2 \
__asm psubusb reg1, reg5 \
__asm psubusb reg2, reg6 \
__asm SSE3_MOVE reg5, [saddr] \
__asm paddusb reg3, reg7 \
__asm paddusb reg4, reg8 \
__asm SSE3_MOVE reg6, [saddr + SSE_INCREMENT] \
__asm pmaxub reg1, reg5 \
__asm pmaxub reg2, reg6 \
__asm pminub reg1, reg3 \
__asm pminub reg2, reg4 \
__asm SSE_MOVE [daddr], reg1 \
__asm SSE_MOVE [daddr + SSE_INCREMENT], reg2
#else
#define USClensePixel(daddr, saddr, paddr, naddr, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8) \
ASClensePixel(daddr, saddr, paddr, naddr, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8)
#endif
{
__asm mov eax, incpitch
__asm mov ebx, pp
__asm add dpitch, eax
__asm add spitch, eax
__asm add ppitch, eax
__asm add npitch, eax
__asm mov esi, _sp
__asm mov edi, dp
__asm mov edx, remainder
__asm mov eax, np
__asm mov ecx, hblocks
__asm align 16
__asm _loop:
ASClensePixel(edi, esi, ebx, eax, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6, SSE7)
__asm add eax, 2*SSE_INCREMENT
__asm add esi, 2*SSE_INCREMENT
__asm add edi, 2*SSE_INCREMENT
__asm add ebx, 2*SSE_INCREMENT
__asm loop _loop
// the last pixels
USClensePixel(edi + edx, esi + edx, ebx + edx, eax + edx, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6, SSE7)
__asm add esi, spitch
__asm add edi, dpitch
__asm add ebx, ppitch
__asm add eax, npitch
__asm dec height
__asm mov ecx, hblocks
__asm jnz _loop
}
#if ISSE > 1
static void unaligned_sclense(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, const BYTE *pp, int ppitch, const BYTE *np, int npitch, int hblocks, int remainder, int incpitch, int height)
{
__asm mov eax, incpitch
__asm mov ebx, pp
__asm add dpitch, eax
__asm add spitch, eax
__asm add ppitch, eax
__asm add npitch, eax
__asm mov esi, _sp
__asm mov edi, dp
__asm mov edx, remainder
__asm mov eax, np
__asm mov ecx, hblocks
__asm align 16
__asm _loop:
USClensePixel(edi, esi, ebx, eax, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6, SSE7)
__asm add eax, 2*SSE_INCREMENT
__asm add esi, 2*SSE_INCREMENT
__asm add edi, 2*SSE_INCREMENT
__asm add ebx, 2*SSE_INCREMENT
#if ISSE > 1
__asm dec ecx
__asm jnz _loop
#else
__asm loop _loop
#endif
// the last pixels
USClensePixel(edi + edx, esi + edx, ebx + edx, eax + edx, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6, SSE7)
__asm add esi, spitch
__asm add edi, dpitch
__asm add ebx, ppitch
__asm add eax, npitch
__asm dec height
__asm mov ecx, hblocks
__asm jnz _loop
}
static void sclense(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, const BYTE *pp, int ppitch, const BYTE *np, int npitch, int hblocks, int remainder, int incpitch, int height)
{
if( (((unsigned)dp & (SSE_INCREMENT - 1)) + ((unsigned)_sp & (SSE_INCREMENT - 1)) + ((unsigned)pp & (SSE_INCREMENT - 1)) + ((unsigned)np & (SSE_INCREMENT - 1))
#ifdef ALIGNPITCH
+ (spitch & (SSE_INCREMENT - 1)) + (ppitch & (SSE_INCREMENT - 1)) + (npitch & (SSE_INCREMENT - 1))
#endif
) == 0 ) aligned_sclense(dp, dpitch, _sp, spitch, pp, ppitch, np, npitch, hblocks, remainder, incpitch, height);
else unaligned_sclense(dp, dpitch, _sp, spitch, pp, ppitch, np, npitch, hblocks, remainder, incpitch, height);
}
#endif
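// BackwardClense predicts the current frame from the two preceding frames
// (n-1, n-2), ForwardClense from the two following ones (n+1, n+2); both feed
// the sclense kernel above.
#if 0
// Reference sketch only (never compiled): my scalar reading of the
// ASClensePixel / USClensePixel kernels. near_ is the reference adjacent to the
// current frame, far_ the one beyond it; the source pixel is clamped to the
// range spanned by far_ and the saturated linear extrapolation 2*near_ - far_.
static BYTE sclense_pixel_c(BYTE src, BYTE near_, BYTE far_)
{
    int predicted = 2*near_ - far_;         // extrapolate towards the current frame
    if( predicted < 0 ) predicted = 0;      // psubusb saturation
    if( predicted > 255 ) predicted = 255;  // paddusb saturation
    int lo = predicted < far_ ? predicted : (int) far_;
    int hi = predicted < far_ ? (int) far_ : predicted;
    if( src < lo ) return (BYTE) lo;
    if( src > hi ) return (BYTE) hi;
    return src;
}
#endif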
class BackwardClense : public GenericClense
{
PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env)
{
PVideoFrame sf = child->GetFrame(n, env);
if( n < 2 ) return sf;
PVideoFrame next1 = child->GetFrame(n - 1, env);
PVideoFrame next2 = child->GetFrame(n - 2, env);
PVideoFrame df = env->NewVideoFrame(vi, 2*SSE_INCREMENT);
int i = planes;
do
{
sclense(GetWritePtr(df, i), GetPitch(df, i), GetReadPtr(sf, i), GetPitch(sf, i), GetReadPtr(next1, i), GetPitch(next1, i), GetReadPtr(next2, i), GetPitch(next2, i), hblocks[i], remainder[i], incpitch[i], height[i]);
} while( --i >= 0 );
SSE_EMMS
return df;
}
public:
BackwardClense(PClip clip, bool grey, bool planar, int cache) : GenericClense(clip, grey, planar)
{
if( cache >= 0 ) child->SetCacheHints(CACHE_RANGE, cache);
}
};
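// ForwardClense: same kernel as BackwardClense, but the prediction comes from the two following frames (n+1 and n+2).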
class ForwardClense : public BackwardClense
{
int lastnr;
PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env)
{
PVideoFrame sf = child->GetFrame(n, env);
if( n >= lastnr ) return sf;
PVideoFrame next1 = child->GetFrame(n + 1, env);
PVideoFrame next2 = child->GetFrame(n + 2, env);
PVideoFrame df = env->NewVideoFrame(vi, 2*SSE_INCREMENT);
int i = planes;
do
{
sclense(GetWritePtr(df, i), GetPitch(df, i), GetReadPtr(sf, i), GetPitch(sf, i), GetReadPtr(next1, i), GetPitch(next1, i), GetReadPtr(next2, i), GetPitch(next2, i), hblocks[i], remainder[i], incpitch[i], height[i]);
} while( --i >= 0 );
SSE_EMMS
return df;
}
public:
ForwardClense(PClip clip, bool grey, bool planar, int cache) : BackwardClense(clip, grey, planar, cache), lastnr(vi.num_frames - 2)
{}
};
char clenseargs[] = "c[grey]b[planar]b[cache]i";
AVSValue __cdecl CreateBackwardClense(AVSValue args, void* user_data, IScriptEnvironment* env)
{
enum ARGS { CLIP, GREY, PLANAR, CACHE };
return new BackwardClense(args[CLIP].AsClip(), args[GREY].AsBool(false), args[PLANAR].AsBool(false), args[CACHE].AsInt(2));
};
AVSValue __cdecl CreateForwardClense(AVSValue args, void* user_data, IScriptEnvironment* env)
{
enum ARGS { CLIP, GREY, PLANAR, CACHE };
return new ForwardClense(args[CLIP].AsClip(), args[GREY].AsBool(false), args[PLANAR].AsBool(false), args[CACHE].AsInt(2));
};
#endif // MODIFYPLUGIN
extern "C" __declspec(dllexport) const char* __stdcall AvisynthPluginInit2(IScriptEnvironment* env)
{
#ifdef MODIFYPLUGIN
#ifdef DEBUG_NAME
env->AddFunction("DRepair", "cc[mode]i[modeU]i[modeV]i[planar]b", CreateRemoveGrain, 0);
env->AddFunction("DTemporalRepair", "cc[mode]i[smooth]i[grey]b[planar]b", CreateTemporalRepair, 0);
#else
env->AddFunction("Repair", "cc[mode]i[modeU]i[modeV]i[planar]b", CreateRemoveGrain, 0);
env->AddFunction("TemporalRepair", "cc[mode]i[smooth]i[grey]b[planar]b", CreateTemporalRepair, 0);
#endif
#elif SHARPEN == 1
#ifdef DEBUG_NAME
env->AddFunction("DRSharpen", "c[mode]i[modeU]i[modeV]i[strength]i[strengthU]i[strengthV]i[planar]b", CreateRemoveGrain, 0);
env->AddFunction("DMotionSharpen", "c[grey]b[reduceflicker]b[planar]b[cache]i", CreateClense, 0);
#else
env->AddFunction("RSharpen", "c[mode]i[modeU]i[modeV]i[strength]i[strengthU]i[strengthV]i[planar]b", CreateRemoveGrain, 0);
env->AddFunction("MotionSharpen", "c[grey]b[reduceflicker]b[planar]b[cache]i", CreateClense, 0);
#endif
#elif SHARPEN > 1
#ifdef DEBUG_NAME
env->AddFunction("DDenoiseSharpen", "c[mode]i[modeU]i[modeV]i[strength]i[strengthU]i[strengthV]i[planar]b", CreateRemoveGrain, 0);
env->AddFunction("DClenseSharpen", "c[grey]b[reduceflicker]b[planar]b[cache]i", CreateClense, 0);
#else
env->AddFunction("DenoiseSharpen", "c[mode]i[modeU]i[modeV]i[strength]i[strengthU]i[strengthV]i[planar]b", CreateRemoveGrain, 0);
env->AddFunction("ClenseSharpen", "c[grey]b[reduceflicker]b[planar]b[cache]i", CreateClense, 0);
#endif
#elif BLUR == 1
#ifdef DEBUG_NAME
env->AddFunction("DRBlur", "c[mode]i[modeU]i[modeV]i[planar]b", CreateRemoveGrain, 0);
#else
env->AddFunction("RBlur", "c[mode]i[modeU]i[modeV]i[planar]b", CreateRemoveGrain, 0);
#endif
#elif BLUR > 1
#ifdef DEBUG_NAME
env->AddFunction("DDenoiseBlur", "c[mode]i[modeU]i[modeV]i[planar]b", CreateRemoveGrain, 0);
#else
env->AddFunction("DenoiseBlur", "c[mode]i[modeU]i[modeV]i[planar]b", CreateRemoveGrain, 0);
#endif
#else // MODIFYPLUGIN
#ifdef DEBUG_NAME
env->AddFunction("DRemoveGrain", "c[mode]i[modeU]i[modeV]i[planar]b", CreateRemoveGrain, 0);
env->AddFunction("DClense", "c[grey]b[reduceflicker]b[planar]b[cache]i", CreateClense, 0);
env->AddFunction("DMCClense", "ccc[grey]b[planar]b", CreateMCClense, 0);
env->AddFunction("DBackwardClense", clenseargs, CreateBackwardClense, 0);
env->AddFunction("DForwardClense", clenseargs, CreateForwardClense, 0);
#else
env->AddFunction("RemoveGrain", "c[mode]i[modeU]i[modeV]i[planar]b", CreateRemoveGrain, 0);
env->AddFunction("Clense", "c[grey]b[reduceflicker]b[planar]b[cache]i", CreateClense, 0);
env->AddFunction("MCClense", "ccc[grey]b[planar]b", CreateMCClense, 0);
env->AddFunction("BackwardClense", clenseargs, CreateBackwardClense, 0);
env->AddFunction("ForwardClense", clenseargs, CreateForwardClense, 0);
#endif
#endif // MODIFYPLUGIN
AVSenvironment = env;
if( (CPUFLAGS & env->GetCPUFlags()) != CPUFLAGS )
#if ISSE > 1
env->ThrowError("RemoveGrain needs an SSE2 capable cpu!\n");
#else
env->ThrowError("RemoveGrain needs an SSE capable cpu!\n");
#endif
#if 0
debug_printf(LOGO);
#endif
return "RemoveGrain: remove grain from film";
}