#define LOGO "RemoveGrain 1.0\n" | |
// An Avisynth plugin for removing grain from progressive video | |
// | |
// By Rainer Wittmann <[email protected]> | |
// | |
// This program is free software; you can redistribute it and/or modify | |
// it under the terms of the GNU General Public License as published by | |
// the Free Software Foundation; either version 2 of the License, or | |
// (at your option) any later version. | |
// | |
// This program is distributed in the hope that it will be useful, | |
// but WITHOUT ANY WARRANTY; without even the implied warranty of | |
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
// GNU General Public License for more details. | |
// | |
// To get a copy of the GNU General Public License write to the Free Software | |
// Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit | |
// http://www.gnu.org/copyleft/gpl.html . | |
//#define MODIFYPLUGIN 1 // create Repair plugin instead of RemoveGrain, 0 = compatible with RemoveGrain
//#define SHARPEN 1 | |
//#define BLUR 1 | |
//#define SSE2_TEST // ISSE2 version that can be used side by side with the SSE version | |
//#define DEBUG_NAME // for debugging | |
//#define ISSE 2 // P4, Athlon 64, Sempron 3100 | |
//#define ISSE 3 // Prescott P4 | |
//#define CVERSION // for debugging only | |
#define ALIGNPITCH | |
#define SMOOTH2 | |
#define DEFAULT_MODE 2 | |
#define DEFAULT_RGLIMIT 0 | |
#define VC_EXTRALEAN | |
#include <Windows.h> | |
#include <stdio.h> | |
#include <stdarg.h> | |
#include "avisynth.h" | |
#include "planar.h" | |
static IScriptEnvironment *AVSenvironment; | |
#ifdef SSE2_TEST | |
#ifndef ISSE | |
#define ISSE 2 | |
#endif | |
#ifndef DEBUG_NAME | |
#define DEBUG_NAME | |
#endif | |
#endif | |
#ifndef ISSE | |
#define ISSE 1 | |
#endif | |
#if ISSE > 1 | |
#define CPUFLAGS CPUF_SSE2 | |
#else | |
#define CPUFLAGS CPUF_INTEGER_SSE | |
#endif | |
#ifdef MODIFYPLUGIN | |
#define MAXMODE 18 | |
#elif defined(SHARPEN) | |
#define MAXMODE 22 | |
#define MAXSTRENGTH 2 | |
#define DEFAULT_STRENGTH 1 | |
#else | |
#define MAXMODE 28 | |
#endif | |
#if defined(SHARPEN) && defined(MODIFYPLUGIN) | |
#error "SHARPEN cannot be combined with MODIFYPLUGIN" | |
#endif | |
#if defined(BLUR) && defined(MODIFYPLUGIN) | |
#error "SHARPEN cannot be combined with MODIFYPLUGIN" | |
#endif | |
#if 1 | |
void debug_printf(const char *format, ...) | |
{ | |
char buffer[200]; | |
va_list args; | |
va_start(args, format); | |
vsnprintf(buffer, sizeof(buffer), format, args); // bounded, buffer is only 200 bytes
va_end(args); | |
OutputDebugString(buffer); | |
} | |
#endif | |
#define COMPARE_MASK (~24) | |
static void CompareVideoInfo(VideoInfo &vi1, const VideoInfo &vi2, const char *progname) | |
{ | |
if( (vi1.width != vi2.width) || (vi1.height != vi2.height) || ( (vi1.pixel_type & COMPARE_MASK) != (vi2.pixel_type & COMPARE_MASK) )) | |
{ | |
#if 1 | |
debug_printf("widths = %u, %u, heights = %u, %u, color spaces = %X, %X\n" | |
, vi1.width, vi2.width, vi1.height, vi2.height, vi1.pixel_type, vi2.pixel_type); | |
#endif | |
AVSenvironment->ThrowError("%s: clips must be of equal type", progname); | |
} | |
if(vi1.num_frames > vi2.num_frames) vi1.num_frames = vi2.num_frames; | |
} | |
#ifdef TESTCOMPARE | |
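// testcompare: returns the sum of absolute differences between two planes
// (debug aid); dp and pp are pre-decremented because j runs from width down to 1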
unsigned testcompare(const BYTE *dp, int dpitch, const BYTE *pp, int ppitch, int width, int height) | |
{ | |
int i = height; | |
--dp; --pp; | |
unsigned diffsum = 0; | |
do | |
{ | |
int j = width; | |
do | |
{ | |
int diff = dp[j] - pp[j]; | |
if( diff < 0 ) diff = -diff; | |
diffsum += diff; | |
} while( --j ); | |
dp += dpitch; | |
pp += ppitch; | |
} while( --i ); | |
return diffsum; | |
} | |
#define xpitch 1 | |
void RemoveGrain(BYTE *dp, int dpitch, const BYTE *sp, int spitch, int width, int height, int threshold) | |
{ | |
int sinc = - (width + 1) * xpitch; | |
dpitch += sinc; | |
sinc += spitch; | |
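// for each centre pixel x the eight neighbours are read in a ring
// (right, down-right, down, down-left, left, up-left, up, up-right),
// leaving sp one pixel up-right of x so the final sp += spitch lands
// on the next centre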
do | |
{ | |
dp[0] = sp[0]; | |
dp += xpitch; sp += xpitch; | |
int i = width; | |
do | |
{ | |
unsigned sort1[8]; | |
int leq = 0; | |
int geq = 0; | |
unsigned x = sp[0]; | |
if( (sort1[0] = (sp += xpitch)[0]) <= x ) | |
{ | |
if( sort1[0] == x ) ++geq; | |
++leq; | |
} | |
if( (sort1[1] = (sp += spitch)[0]) <= x ) | |
{ | |
if( sort1[1] == x ) ++geq; | |
++leq; | |
} | |
if( (sort1[2] = (sp -= xpitch)[0]) <= x ) | |
{ | |
if( sort1[2] == x ) ++geq; | |
++leq; | |
} | |
if( (sort1[3] = (sp -= xpitch)[0]) <= x )
{
if( sort1[3] == x ) ++geq;
++leq;
}
if( (sort1[4] = (sp -= spitch)[0]) <= x )
{
if( sort1[4] == x ) ++geq;
++leq;
}
if( (sort1[5] = (sp -= spitch)[0]) <= x )
{
if( sort1[5] == x ) ++geq;
++leq;
}
if( (sort1[6] = (sp += xpitch)[0]) <= x )
{
if( sort1[6] == x ) ++geq;
++leq;
}
if( (sort1[7] = (sp += xpitch)[0]) <= x )
{
if( sort1[7] == x ) ++geq;
++leq;
}
if( ((geq += 8 - leq) < threshold) || (leq < threshold) ) | |
{ // do a merge sort of sort1[8] as fast as possible | |
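// leq counts neighbours <= x, geq (after the += above) neighbours >= x;
// x is left unchanged unless fewer than 'threshold' neighbours lie on one
// side of it, in which case the merge sort below yields the order
// statistics and x is clamped to the threshold-th value from that side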
unsigned sort2[8]; | |
if( sort1[1] < sort1[0] ) | |
{ | |
sort2[0] = sort1[1]; | |
sort2[1] = sort1[0]; | |
} | |
else | |
{ | |
sort2[0] = sort1[0]; | |
sort2[1] = sort1[1]; | |
} | |
if( sort1[3] < sort1[2] ) | |
{ | |
sort2[2] = sort1[3]; | |
sort2[3] = sort1[2]; | |
} | |
else | |
{ | |
sort2[2] = sort1[2]; | |
sort2[3] = sort1[3]; | |
} | |
if( sort1[5] < sort1[4] ) | |
{ | |
sort2[4] = sort1[5]; | |
sort2[5] = sort1[4]; | |
} | |
else | |
{ | |
sort2[4] = sort1[4]; | |
sort2[5] = sort1[5]; | |
} | |
if( sort1[7] < sort1[6] ) | |
{ | |
sort2[6] = sort1[7]; | |
sort2[7] = sort1[6]; | |
} | |
else | |
{ | |
sort2[6] = sort1[6]; | |
sort2[7] = sort1[7]; | |
} | |
if( sort2[0] > sort2[2] ) | |
{ | |
sort1[0] = sort2[2]; | |
if( sort2[3] <= sort2[0] ) | |
{ | |
sort1[1] = sort2[3]; | |
sort1[2] = sort2[0]; | |
sort1[3] = sort2[1]; | |
} | |
else | |
{ | |
sort1[1] = sort2[0]; | |
if( sort2[1] < sort2[3] ) | |
{ | |
sort1[2] = sort2[1]; | |
sort1[3] = sort2[3]; | |
} | |
else | |
{ | |
sort1[2] = sort2[3]; | |
sort1[3] = sort2[1]; | |
} | |
} | |
} | |
else | |
{ | |
sort1[0] = sort2[0]; | |
if( sort2[1] <= sort2[2] ) | |
{ | |
sort1[1] = sort2[1]; | |
sort1[2] = sort2[2]; | |
sort1[3] = sort2[3]; | |
} | |
else | |
{ | |
sort1[1] = sort2[2]; | |
if( sort2[3] < sort2[1] ) | |
{ | |
sort1[2] = sort2[3]; | |
sort1[3] = sort2[1]; | |
} | |
else | |
{ | |
sort1[2] = sort2[1]; | |
sort1[3] = sort2[3]; | |
} | |
} | |
} | |
#if 0 | |
if( (sort1[0] > sort1[1]) || (sort1[1] > sort1[2]) || (sort1[2] > sort1[3]) ) | |
debug_printf("merge error: sort = %u, %u, %u, %u\n", sort1[0], sort1[1], sort1[2], sort1[3]); | |
#endif | |
if( sort2[4] > sort2[6] ) | |
{ | |
sort1[4] = sort2[6]; | |
if( sort2[7] <= sort2[4] ) | |
{ | |
sort1[5] = sort2[7]; | |
sort1[6] = sort2[4]; | |
sort1[7] = sort2[5]; | |
} | |
else | |
{ | |
sort1[5] = sort2[4]; | |
if( sort2[5] < sort2[7] ) | |
{ | |
sort1[6] = sort2[5]; | |
sort1[7] = sort2[7]; | |
} | |
else | |
{ | |
sort1[6] = sort2[7]; | |
sort1[7] = sort2[5]; | |
} | |
} | |
} | |
else | |
{ | |
sort1[4] = sort2[4]; | |
if( sort2[5] <= sort2[6] ) | |
{ | |
sort1[5] = sort2[5]; | |
sort1[6] = sort2[6]; | |
sort1[7] = sort2[7]; | |
} | |
else | |
{ | |
sort1[5] = sort2[6]; | |
if( sort2[7] < sort2[5] ) | |
{ | |
sort1[6] = sort2[7]; | |
sort1[7] = sort2[5]; | |
} | |
else | |
{ | |
sort1[6] = sort2[5]; | |
sort1[7] = sort2[7]; | |
} | |
} | |
} | |
#if 0 | |
if( (sort1[4] > sort1[5]) || (sort1[5] > sort1[6]) || (sort1[6] > sort1[7]) ) | |
debug_printf("merge error: sort = %u, %u, %u, %u\n", sort1[4], sort1[5], sort1[6], sort1[7]); | |
#endif | |
unsigned *s1 = sort1, *s2 = sort1 + 4, *t = sort2; | |
*t++ = *s1 > *s2 ? *s2++ : *s1++; | |
*t++ = *s1 > *s2 ? *s2++ : *s1++; | |
*t++ = *s1 > *s2 ? *s2++ : *s1++; | |
if( sort1[3] > sort1[7] ) | |
{ | |
do | |
{ | |
*t++ = *s1 > *s2 ? *s2++ : *s1++; | |
} while( s2 != sort1 + 8 ); | |
do | |
{ | |
*t++ = *s1++; | |
} while( s1 != sort1 + 4 ); | |
} | |
else | |
{ | |
do | |
{ | |
*t++ = *s1 > *s2 ? *s2++ : *s1++; | |
} while( s1 != sort1 + 4 ); | |
do | |
{ | |
*t++ = *s2++; | |
} while( s2 != sort1 + 8 ); | |
} | |
#if 0 | |
if( (leq > 0) && (sort2[leq - 1] > x) ) debug_printf("leq = %u, x = %u, sort = %u,%u,%u,%u,%u,%u,%u,%u\n", leq, x, sort2[0], sort2[1], sort2[2], sort2[3], sort2[4], sort2[5], sort2[6], sort2[7]); | |
if( (leq < 8) && (sort2[leq] <= x) ) debug_printf("leq = %u, x = %u, sort = %u,%u,%u,%u,%u,%u,%u,%u\n", leq, x, sort2[0], sort2[1], sort2[2], sort2[3], sort2[4], sort2[5], sort2[6], sort2[7]); | |
if( (geq > 0) && (sort2[8 - geq] < x) ) debug_printf("geq = %u, x = %u, sort = %u,%u,%u,%u,%u,%u,%u,%u\n", geq, x, sort2[0], sort2[1], sort2[2], sort2[3], sort2[4], sort2[5], sort2[6], sort2[7]); | |
if( (geq < 8) && (sort2[7 - geq] >= x) ) debug_printf("geq = %u, x = %u, sort = %u,%u,%u,%u,%u,%u,%u,%u\n", geq, x, sort2[0], sort2[1], sort2[2], sort2[3], sort2[4], sort2[5], sort2[6], sort2[7]); | |
#endif | |
x = leq < threshold ? sort2[threshold - 1] : sort2[8 - threshold]; | |
} | |
dp[0] = x; | |
dp += xpitch; | |
sp += spitch; | |
} while( --i ); | |
dp[0] = sp[0]; | |
dp += dpitch; sp += sinc; | |
} while( --height ); | |
} | |
#undef xpitch | |
#endif // TESTCOMPARE | |
#if ISSE > 1 | |
#define SSE_INCREMENT 16 | |
#define SSE_SHIFT 4 | |
#define SSE_MOVE movdqu | |
#if ISSE > 2 | |
#define SSE3_MOVE lddqu | |
#else | |
#define SSE3_MOVE movdqu | |
#endif | |
#define SSE_RMOVE movdqa | |
#define SSE0 xmm0 | |
#define SSE1 xmm1 | |
#define SSE2 xmm2 | |
#define SSE3 xmm3 | |
#define SSE4 xmm4 | |
#define SSE5 xmm5 | |
#define SSE6 xmm6 | |
#define SSE7 xmm7 | |
#define SSE_EMMS | |
#else | |
#define SSE_INCREMENT 8 | |
#define SSE_SHIFT 3 | |
#define SSE_MOVE movq | |
#define SSE3_MOVE movq | |
#define SSE_RMOVE movq | |
#define SSE0 mm0 | |
#define SSE1 mm1 | |
#define SSE2 mm2 | |
#define SSE3 mm3 | |
#define SSE4 mm4 | |
#define SSE5 mm5 | |
#define SSE6 mm6 | |
#define SSE7 mm7 | |
#define SSE_EMMS __asm emms | |
#endif // ISSE | |
#if defined(SHARPEN) || defined(BLUR) | |
#define SHLUR | |
#endif | |
#if BLUR == 1 | |
#define blur(center, min, max, reg1, reg2)\ | |
__asm SSE_RMOVE reg2, center \ | |
__asm psubusb max, center \ | |
__asm psubusb reg2, min \ | |
__asm SSE_RMOVE reg1, max \ | |
__asm SSE_RMOVE min, reg2 \ | |
__asm psubusb max, reg2 \ | |
__asm psubusb min, reg1 \ | |
__asm psrlw max, 1 \ | |
__asm psrlw min, 1 \ | |
__asm pminub reg2, max \ | |
__asm pminub reg1, min \ | |
__asm paddusb center, reg2 \ | |
__asm psubusb center, reg1 | |
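// scalar reading of the macro above (per pixel, ignoring that psrlw shifts
// 16-bit lanes, so a bit of each high byte leaks into its low neighbour
// before the pminub clamp):
// u = sat(max - center); l = sat(center - min);
// center = sat(center + min(l, sat(u - l) >> 1));
// center = sat(center - min(u, sat(l - u) >> 1));
// i.e. the pixel is moved by half the imbalance toward the midpoint of
// [min, max], never past either bound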
#elif BLUR == 2
// clip and blur
#define blur(center, min, max, reg1, reg2)\
__asm pminub center, max \
__asm pmaxub center, min \ | |
__asm SSE_RMOVE reg2, center \ | |
__asm psubusb max, center \ | |
__asm psubusb reg2, min \ | |
__asm SSE_RMOVE reg1, max \ | |
__asm SSE_RMOVE min, reg2 \ | |
__asm psubusb max, reg2 \ | |
__asm psubusb min, reg1 \ | |
__asm psrlw max, 1 \ | |
__asm psrlw min, 1 \ | |
__asm pminub reg2, max \ | |
__asm pminub reg1, min \ | |
__asm paddusb center, reg2 \ | |
__asm psubusb center, reg1 | |
#endif | |
#ifdef SHARPEN | |
static const __declspec(align(SSE_INCREMENT)) unsigned short rshift[3][SSE_INCREMENT / 2] = | |
{ | |
{ | |
0,0,0,0 | |
#if SSE_INCREMENT == 16 | |
,0,0,0,0 | |
#endif | |
}, | |
{ | |
1,0,0,0 | |
#if SSE_INCREMENT == 16 | |
, 0,0,0,0 | |
#endif | |
}, | |
{ | |
2,0,0,0 | |
#if SSE_INCREMENT == 16 | |
, 0,0,0,0 | |
#endif | |
} | |
}; | |
#define SHIFT_MASK0 255 | |
#define SHIFT_MASK1 127 | |
#define SHIFT_MASK2 63 | |
static const __declspec(align(SSE_INCREMENT)) BYTE shift_mask[3][SSE_INCREMENT] = | |
{ | |
{ | |
SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0 | |
#if SSE_INCREMENT == 16 | |
, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0, SHIFT_MASK0 | |
#endif | |
}, | |
{ | |
SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1 | |
#if SSE_INCREMENT == 16 | |
, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1 | |
#endif | |
}, | |
{ | |
SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2 | |
#if SSE_INCREMENT == 16 | |
, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2, SHIFT_MASK2 | |
#endif | |
} | |
}; | |
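// MMX/SSE have no per-byte shift: psrlw shifts whole 16-bit lanes, so bit 0
// of each high byte leaks into bit 7 of the low byte beside it. shift_mask[s]
// clears the leaked bits after a shift by s; e.g. the word 0xFF12 shifted
// right by 1 gives 0x7F89, and masking with SHIFT_MASK1 = 0x7F restores the
// per-byte result 0x7F09.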
#if SHARPEN == 1 | |
// only sharpen | |
#define sharpen(center, min, max, rshift, SHIFT_MASK1, reg1, reg2)\ | |
__asm SSE_RMOVE reg2, center \ | |
__asm psubusb max, center \ | |
__asm psubusb reg2, min \ | |
__asm SSE_RMOVE reg1, max \ | |
__asm SSE_RMOVE min, reg2 \ | |
__asm psubusb max, reg2 \ | |
__asm psubusb min, reg1 \ | |
__asm psrlw reg2, rshift \ | |
__asm psrlw reg1, rshift \ | |
__asm pand reg2, SHIFT_MASK1 \ | |
__asm pand reg1, SHIFT_MASK1 \ | |
__asm pminub reg2, max \ | |
__asm pminub reg1, min \ | |
__asm psubusb center, reg2 \ | |
__asm paddusb center, reg1 | |
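// scalar reading of the SHARPEN == 1 kernel (per pixel):
// u = sat(max - center); l = sat(center - min);
// center = sat(center - min(l >> strength, sat(u - l)));
// center = sat(center + min(u >> strength, sat(l - u)));
// i.e. the pixel is pushed toward the nearer of the two local extremes by a
// strength-scaled fraction of its distance to it, never past the bounds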
#elif SHARPEN == 2 | |
// clip and sharpen | |
#define sharpen(center, min, max, rshift, SHIFT_MASK1, reg1, reg2)\ | |
__asm pminub center, max \ | |
__asm pmaxub center, min \ | |
__asm SSE_RMOVE reg2, center \ | |
__asm psubusb max, center \ | |
__asm psubusb reg2, min \ | |
__asm SSE_RMOVE reg1, max \ | |
__asm SSE_RMOVE min, reg2 \ | |
__asm psubusb max, reg2 \ | |
__asm psubusb min, reg1 \ | |
__asm psrlw reg2, rshift \ | |
__asm psrlw reg1, rshift \ | |
__asm pand reg2, SHIFT_MASK1 \ | |
__asm pand reg1, SHIFT_MASK1 \ | |
__asm pminub reg2, max \ | |
__asm pminub reg1, min \ | |
__asm psubusb center, reg2 \ | |
__asm paddusb center, reg1 | |
#endif | |
#endif // SHARPEN | |
#ifdef BLUR | |
#define sharpen(center, min, max, rshift, SHIFT_MASK1, reg1, reg2) blur(center, min, max, reg1, reg2) | |
#endif | |
#ifdef SHARPEN | |
void do_nothing(BYTE *dp, int dpitch, const BYTE *sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength) | |
{ | |
} | |
void copy_plane(BYTE *dp, int dpitch, const BYTE *sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength) | |
{ | |
AVSenvironment->BitBlt(dp, dpitch, sp, spitch, hblocks * SSE_INCREMENT + 2 * (SSE_INCREMENT + 1) + remainder, height); | |
} | |
#else // SHARPEN | |
void do_nothing(BYTE *dp, int dpitch, const BYTE *sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
} | |
void copy_plane(BYTE *dp, int dpitch, const BYTE *sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
AVSenvironment->BitBlt(dp, dpitch, sp, spitch, hblocks * SSE_INCREMENT + 2 * (SSE_INCREMENT + 1) + remainder, height); | |
} | |
#endif // SHARPEN | |
#define ins2(first, second, reg) \ | |
__asm pmaxub second, reg \ | |
__asm pminub second, first \ | |
__asm pmaxub first, reg | |
#define ins3(first, second, third, reg) \ | |
__asm pmaxub third, reg \ | |
__asm pminub third, second \ | |
ins2(first, second, reg) | |
#define ins4(first, second, third, fourth, reg) \ | |
__asm pmaxub fourth, reg \ | |
__asm pminub fourth, third \ | |
ins3(first, second, third, reg) | |
#define ins5(first, second, third, fourth, fifth, reg) \ | |
__asm pmaxub fifth, reg \ | |
__asm pminub fifth, fourth \ | |
ins4(first, second, third, fourth, reg) | |
#define ins6(first, second, third, fourth, fifth, sixth, reg) \ | |
__asm pmaxub sixth, reg \ | |
__asm pminub sixth, fifth \ | |
ins5(first, second, third, fourth, fifth, reg) | |
#define add2(first, second, reg) \ | |
__asm SSE_RMOVE second, reg \ | |
__asm pminub second, first \ | |
__asm pmaxub first, reg | |
#define add3(first, second, third, reg) \ | |
__asm SSE_RMOVE third, reg \ | |
__asm pminub third, second \ | |
ins2(first, second, reg) | |
#define add4(first, second, third, fourth, reg) \ | |
__asm SSE_RMOVE fourth, reg \ | |
__asm pminub fourth, third \ | |
ins3(first, second, third, reg) | |
#define add5(first, second, third, fourth, fifth, reg) \ | |
__asm SSE_RMOVE fifth, reg \ | |
__asm pminub fifth, fourth \ | |
ins4(first, second, third, fourth, reg) | |
#define add6(first, second, third, fourth, fifth, sixth, reg) \ | |
__asm SSE_RMOVE sixth, reg \ | |
__asm pminub sixth, fifth \ | |
ins5(first, second, third, fourth, fifth, reg) | |
#define sub2(first, second, val) \ | |
__asm pmaxub second, val \ | |
__asm pminub second, first | |
#define sub3(first, second, third, reg) \ | |
__asm pmaxub third, reg \ | |
__asm pminub third, second \ | |
sub2(first, second, reg) | |
#define sub4(first, second, third, fourth, reg) \ | |
__asm pmaxub fourth, reg \ | |
__asm pminub fourth, third \ | |
sub3(first, second, third, reg) | |
#define sub5(first, second, third, fourth, fifth, reg) \ | |
__asm pmaxub fifth, reg \ | |
__asm pminub fifth, fourth \ | |
sub4(first, second, third, fourth, reg) | |
#define sub6(first, second, third, fourth, fifth, sixth, reg) \ | |
__asm pmaxub sixth, reg \ | |
__asm pminub sixth, fifth \ | |
sub5(first, second, third, fourth, fifth, reg) | |
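// the macro families above maintain a chain of registers sorted in
// descending order (first >= second >= ...) over the values seen so far:
// insN inserts reg into an existing chain of depth N, addN populates
// slot N for the first time (plain copy instead of pmaxub), and subN
// inserts without updating 'first', which is frozen once its final rank
// can no longer change and only acts as an upper clamp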
#define minmax1(min, max, val) \ | |
__asm pminub min, val \ | |
__asm pmaxub max, val | |
#define minmax2(max1, max2, min2, min1, reg) \ | |
__asm pminub min2, reg \ | |
__asm pmaxub max2, reg \ | |
__asm pmaxub min2, min1 \ | |
__asm pminub max2, max1 \ | |
__asm pminub min1, reg \ | |
__asm pmaxub max1, reg | |
#define minmax3(max1, max2, max3, min3, min2, min1, reg)\ | |
__asm pminub min3, reg \ | |
__asm pmaxub max3, reg \ | |
__asm pmaxub min3, min2 \ | |
__asm pminub max3, max2 \ | |
minmax2(max1, max2, min2, min1, reg) | |
#define minmax2sub(max1, max2, min2, min1, val) \ | |
__asm pminub min2, val \ | |
__asm pmaxub max2, val \ | |
__asm pmaxub min2, min1 \ | |
__asm pminub max2, max1 | |
#define minmax3sub(max1, max2, max3, min3, min2, min1, reg)\ | |
__asm pminub min3, reg \ | |
__asm pmaxub max3, reg \ | |
__asm pmaxub min3, min2 \ | |
__asm pminub max3, max2 \ | |
minmax2sub(max1, max2, min2, min1, reg) | |
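// minmax1 keeps a running minimum and maximum of the values fed to it;
// minmax2/minmax3 extend this to the two/three smallest and largest
// (min1 <= min2 <= min3 <= ... <= max3 <= max2 <= max1), and the *sub
// variants insert a last value without updating the outermost pair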
#ifdef SHARPEN | |
void SSE_RemoveGrain4(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength) | |
#else | |
void SSE_RemoveGrain4(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
#endif | |
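// RemoveGrain mode 4: clips each pixel to [4th smallest, 4th largest] of its
// eight neighbours, a median-like clamp; the add*/sub* chain below leaves the
// 4th largest in SSE3 and the 4th smallest in SSE4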
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
#ifdef SHARPEN | |
__asm mov spitch, eax | |
__asm mov eax, strength | |
#endif | |
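// prologue above (shared by all kernels in this file): ebx = source pitch,
// edx = remainder of the row after the whole SSE blocks, esi = _sp minus one
// pitch, edi = destination; eax (stashed in spitch when SHARPEN needs eax for
// the strength index) is the displacement that advances esi to the next row,
// and dpitch has been pre-reduced so 'add edi, dpitch' does the same for edi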
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm SSE3_MOVE SSE7, [esi + 1] | |
add2(SSE0, SSE1, SSE7) | |
__asm SSE3_MOVE SSE6, [esi + 2] | |
__asm SSE3_MOVE SSE5, [esi + ebx] | |
add3(SSE0, SSE1, SSE2, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + ebx + 2] | |
add4(SSE0, SSE1, SSE2, SSE3, SSE5) | |
__asm movd [edi], SSE5 | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx] | |
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7) | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1] | |
sub5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2] | |
sub4(SSE1, SSE2, SSE3, SSE4, SSE5) | |
#if (ISSE > 1) || defined(SHLUR) | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
sub3(SSE2, SSE3, SSE4, SSE7) | |
#ifdef SHLUR | |
sharpen(SSE5, SSE4, SSE3, rshift[eax], shift_mask[eax], SSE0, SSE1) | |
__asm SSE_MOVE [edi + 1], SSE5 | |
#else // SHLUR | |
#if ISSE > 1 | |
__asm pmaxub SSE4, SSE5 | |
#else | |
__asm pmaxub SSE4, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE3, SSE4 | |
__asm SSE_MOVE [edi + 1], SSE3 | |
#endif // SHLUR | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm SSE3_MOVE SSE7, [esi + 1] | |
add2(SSE0, SSE1, SSE7) | |
__asm SSE3_MOVE SSE6, [esi + 2] | |
__asm SSE3_MOVE SSE5, [esi + ebx] | |
add3(SSE0, SSE1, SSE2, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + ebx + 2] | |
add4(SSE0, SSE1, SSE2, SSE3, SSE5) | |
#if MODIFYPLUGIN == 1 | |
__asm SSE3_MOVE SSE6, [esi + ebx + 1] | |
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx] | |
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 1] | |
sub6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2] | |
sub5(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
#if ISSE > 1 | |
__asm SSE3_MOVE SSE0, [edi] | |
#endif | |
sub4(SSE2, SSE3, SSE4, SSE5, SSE7) | |
#if ISSE > 1 | |
__asm pmaxub SSE5, SSE0 | |
#else | |
__asm pmaxub SSE5, [edi] | |
#endif | |
__asm add esi, SSE_INCREMENT | |
__asm pminub SSE3, SSE5 | |
__asm SSE_MOVE [edi], SSE3 | |
#else // MODIFYPLUGIN == 1 | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx] | |
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7) | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1] | |
sub5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2] | |
sub4(SSE1, SSE2, SSE3, SSE4, SSE5) | |
#if (ISSE > 1) || defined(SHLUR) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE5, [edi] | |
#else | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
#endif | |
sub3(SSE2, SSE3, SSE4, SSE7) | |
#ifdef SHLUR | |
sharpen(SSE5, SSE4, SSE3, rshift[eax], shift_mask[eax], SSE0, SSE1) | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE5 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE4, SSE5 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE4, [edi] | |
#else // ISSE > 1 | |
__asm pmaxub SSE4, [esi + ebx + 1] | |
#endif | |
#endif // ISSE > 1 | |
__asm add esi, SSE_INCREMENT | |
__asm pminub SSE3, SSE4 | |
__asm SSE_MOVE [edi], SSE3 | |
#endif // SHLUR | |
#endif //MODIFYPLUGIN == 1 | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm SSE3_MOVE SSE7, [esi + 1] | |
add2(SSE0, SSE1, SSE7) | |
__asm SSE3_MOVE SSE6, [esi + 2] | |
__asm SSE3_MOVE SSE5, [esi + ebx] | |
add3(SSE0, SSE1, SSE2, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + ebx + 2] | |
add4(SSE0, SSE1, SSE2, SSE3, SSE5) | |
#ifndef MODIFYPLUGIN | |
__asm SSE_MOVE [edi + 1], SSE7 | |
#endif | |
#if MODIFYPLUGIN == 1 | |
__asm SSE3_MOVE SSE6, [esi + ebx + 1] | |
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx] | |
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 1] | |
sub6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2] | |
sub5(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
#if ISSE > 1 | |
__asm SSE3_MOVE SSE0, [edi] | |
#endif | |
sub4(SSE2, SSE3, SSE4, SSE5, SSE7) | |
#if ISSE > 1 | |
__asm pmaxub SSE5, SSE0 | |
#else | |
__asm pmaxub SSE5, [edi] | |
#endif | |
__asm pminub SSE3, SSE5 | |
__asm add esi, eax | |
__asm SSE_MOVE [edi], SSE3 | |
#else // MODIFYPLUGIN == 1 | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx] | |
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7) | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1] | |
sub5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2] | |
sub4(SSE1, SSE2, SSE3, SSE4, SSE5) | |
#if (ISSE > 1) || defined(SHLUR) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE5, [edi] | |
#else | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
#endif | |
sub3(SSE2, SSE3, SSE4, SSE7) | |
#ifdef SHLUR | |
sharpen(SSE5, SSE4, SSE3, rshift[eax], shift_mask[eax], SSE0, SSE1) | |
#ifdef SHARPEN | |
__asm add esi, spitch | |
#else | |
__asm add esi, eax | |
#endif | |
__asm SSE_MOVE [edi], SSE5 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE4, SSE5 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE4, [edi] | |
#else // ISSE > 1 | |
__asm pmaxub SSE4, [esi + ebx + 1] | |
#endif | |
#endif // ISSE > 1 | |
__asm pminub SSE3, SSE4 | |
__asm add esi, eax | |
__asm SSE_MOVE [edi], SSE3 | |
#endif // SHLUR | |
#endif //MODIFYPLUGIN == 1 | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
#ifdef SHARPEN | |
void SSE_RemoveGrain1(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength) | |
#else | |
void SSE_RemoveGrain1(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
#endif | |
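// RemoveGrain mode 1: clips each pixel to the plain [min, max] of its eight
// neighbours, computed below with a single minmax1 chain in SSE0/SSE1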
{ | |
#ifdef SHARPEN | |
__asm mov ebx, strength | |
__asm SSE_RMOVE SSE2, rshift[ebx] | |
__asm SSE_RMOVE SSE3, shift_mask[ebx] | |
#endif | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm SSE3_MOVE SSE5, [esi + 1] | |
__asm SSE_RMOVE SSE1, SSE0 | |
__asm SSE3_MOVE SSE4, [esi + 2] | |
minmax1(SSE0, SSE1, SSE5) | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx] | |
minmax1(SSE0, SSE1, SSE4) | |
__asm SSE3_MOVE SSE4, [esi + 2*ebx + 1] | |
minmax1(SSE0, SSE1, SSE5) | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 2] | |
minmax1(SSE0, SSE1, SSE4) | |
__asm SSE3_MOVE SSE4, [esi + ebx] | |
minmax1(SSE0, SSE1, SSE5) | |
__asm SSE3_MOVE SSE5, [esi + ebx + 2] | |
minmax1(SSE0, SSE1, SSE4) | |
#if (ISSE > 1) || defined(SHLUR) | |
__asm SSE3_MOVE SSE7, [esi + ebx + 1] | |
#endif | |
__asm movd [edi], SSE4 // writes 4 bytes, needed only for the first (border) byte
minmax1(SSE0, SSE1, SSE5) | |
#ifdef SHLUR | |
sharpen(SSE7, SSE0, SSE1, SSE2, SSE3, SSE5, SSE4) | |
__asm SSE_MOVE [edi + 1], SSE7 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE0, SSE7 | |
#else | |
__asm pmaxub SSE0, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE0, SSE1 | |
__asm SSE_MOVE [edi + 1], SSE0 | |
#endif // SHLUR | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm SSE3_MOVE SSE5, [esi + 1] | |
__asm SSE_RMOVE SSE1, SSE0 | |
__asm SSE3_MOVE SSE4, [esi + 2] | |
minmax1(SSE0, SSE1, SSE5) | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx] | |
minmax1(SSE0, SSE1, SSE4) | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1] | |
minmax1(SSE0, SSE1, SSE6) | |
__asm SSE3_MOVE SSE4, [esi + 2*ebx + 2] | |
minmax1(SSE0, SSE1, SSE5) | |
__asm SSE3_MOVE SSE6, [esi + ebx] | |
minmax1(SSE0, SSE1, SSE4) | |
#if MODIFYPLUGIN == 1 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
minmax1(SSE0, SSE1, SSE6) | |
__asm SSE3_MOVE SSE4, [esi + ebx + 2] | |
minmax1(SSE0, SSE1, SSE5) | |
#if ISSE > 1 | |
__asm SSE3_MOVE SSE7, [edi] | |
#endif | |
minmax1(SSE0, SSE1, SSE4) | |
#else // MODIFYPLUGIN | |
__asm SSE3_MOVE SSE5, [esi + ebx + 2] | |
minmax1(SSE0, SSE1, SSE6) | |
#if (ISSE > 1) || defined(SHLUR) | |
__asm SSE3_MOVE SSE7, [esi + ebx + 1] | |
#endif | |
minmax1(SSE0, SSE1, SSE5) | |
#endif // MODIFYPLUGIN | |
#ifdef SHLUR | |
sharpen(SSE7, SSE0, SSE1, SSE2, SSE3, SSE5, SSE4) | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE7 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE0, SSE7 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE0, [edi] | |
#else | |
__asm pmaxub SSE0, [esi + ebx + 1] | |
#endif | |
#endif // ISSE > 1 | |
__asm pminub SSE0, SSE1 | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE0 | |
#endif // SHLUR | |
__asm add edi, SSE_INCREMENT | |
#if ((MODIFYPLUGIN == 1) && (ISSE > 1)) || defined(SHLUR) | |
__asm dec ecx | |
__asm jnz middle_loop | |
#else | |
__asm loop middle_loop | |
#endif | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm SSE3_MOVE SSE5, [esi + 1] | |
__asm SSE_RMOVE SSE1, SSE0 | |
__asm SSE3_MOVE SSE4, [esi + 2] | |
minmax1(SSE0, SSE1, SSE5) | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx] | |
minmax1(SSE0, SSE1, SSE4) | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1] | |
minmax1(SSE0, SSE1, SSE6) | |
__asm SSE3_MOVE SSE4, [esi + 2*ebx + 2] | |
minmax1(SSE0, SSE1, SSE5) | |
__asm SSE3_MOVE SSE6, [esi + ebx] | |
minmax1(SSE0, SSE1, SSE4) | |
#if MODIFYPLUGIN == 1 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
minmax1(SSE0, SSE1, SSE6) | |
__asm SSE3_MOVE SSE4, [esi + ebx + 2] | |
minmax1(SSE0, SSE1, SSE5) | |
#if ISSE > 1 | |
__asm SSE3_MOVE SSE7, [edi] | |
#endif | |
minmax1(SSE0, SSE1, SSE4) | |
#else // MODIFYPLUGIN == 1 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 2] | |
minmax1(SSE0, SSE1, SSE6) | |
#if (ISSE > 1) || defined(SHLUR) | |
__asm SSE3_MOVE SSE7, [esi + ebx + 1] | |
#endif | |
#ifndef MODIFYPLUGIN | |
__asm SSE_MOVE [edi + 1], SSE5 // needed only for the last (border) byte
#endif | |
minmax1(SSE0, SSE1, SSE5) | |
#endif // MODIFYPLUGIN == 1 | |
#ifdef SHLUR | |
sharpen(SSE7, SSE0, SSE1, SSE2, SSE3, SSE5, SSE4) | |
__asm add esi, eax | |
__asm SSE_MOVE [edi], SSE7 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE0, SSE7 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE0, [edi] | |
#else | |
__asm pmaxub SSE0, [esi + ebx + 1] | |
#endif | |
#endif // ISSE > 1 | |
__asm pminub SSE0, SSE1 | |
__asm add esi, eax | |
__asm SSE_MOVE [edi], SSE0 | |
#endif // SHLUR | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
#ifdef SHARPEN | |
void SSE_RemoveGrain2(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength) | |
#else | |
void SSE_RemoveGrain2(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
#endif | |
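// RemoveGrain mode 2: clips each pixel to the [2nd smallest, 2nd largest] of
// its eight neighbours via the minmax2 chain (SSE0/SSE3 outer, SSE1/SSE2 inner)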
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
#ifdef SHARPEN | |
__asm mov spitch, eax | |
__asm mov eax, strength | |
#endif | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm SSE3_MOVE SSE7, [esi + 1] | |
add2(SSE0, SSE1, SSE7) | |
__asm SSE3_MOVE SSE6, [esi + 2] | |
__asm SSE3_MOVE SSE7, [esi + ebx] | |
add3(SSE0, SSE1, SSE2, SSE6) | |
__asm movd [edi], SSE7 | |
__asm SSE3_MOVE SSE6, [esi + ebx + 2] | |
add4(SSE0, SSE1, SSE2, SSE3, SSE7) | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx] | |
minmax2(SSE0, SSE1, SSE2, SSE3, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1] | |
minmax2(SSE0, SSE1, SSE2, SSE3, SSE5) | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2] | |
minmax2(SSE0, SSE1, SSE2, SSE3, SSE7) | |
#if (ISSE > 1) || defined(SHLUR) | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
minmax2sub(SSE0, SSE1, SSE2, SSE3, SSE6) | |
#ifdef SHLUR | |
sharpen(SSE5, SSE2, SSE1, rshift[eax], shift_mask[eax], SSE6, SSE7) | |
__asm SSE_MOVE [edi + 1], SSE5 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE2, SSE5 | |
#else | |
__asm pmaxub SSE2, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE1, SSE2 | |
__asm SSE_MOVE [edi + 1], SSE1 | |
#endif // SHLUR | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm SSE3_MOVE SSE7, [esi + 1] | |
add2(SSE0, SSE1, SSE7) | |
__asm SSE3_MOVE SSE6, [esi + 2] | |
__asm SSE3_MOVE SSE7, [esi + ebx] | |
add3(SSE0, SSE1, SSE2, SSE6) | |
#if MODIFYPLUGIN == 1 | |
__asm SSE3_MOVE SSE4, [esi + ebx + 1] | |
#else | |
__asm SSE3_MOVE SSE6, [esi + ebx + 2] | |
#endif | |
add4(SSE0, SSE1, SSE2, SSE3, SSE7) | |
#if MODIFYPLUGIN == 1 | |
__asm SSE3_MOVE SSE6, [esi + ebx + 2] | |
minmax2(SSE0, SSE1, SSE2, SSE3, SSE4) | |
#endif | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx] | |
minmax2(SSE0, SSE1, SSE2, SSE3, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1] | |
minmax2(SSE0, SSE1, SSE2, SSE3, SSE5) | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2] | |
minmax2(SSE0, SSE1, SSE2, SSE3, SSE7) | |
#if (ISSE > 1) || defined(SHLUR) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE5, [edi] | |
#else | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
#endif | |
minmax2sub(SSE0, SSE1, SSE2, SSE3, SSE6) | |
#ifdef SHLUR | |
sharpen(SSE5, SSE2, SSE1, rshift[eax], shift_mask[eax], SSE6, SSE7) | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE5 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE2, SSE5 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE2, [edi] | |
#else | |
__asm pmaxub SSE2, [esi + ebx + 1] | |
#endif | |
#endif // ISSE > 1 | |
__asm pminub SSE1, SSE2 | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE1 | |
#endif // SHLUR | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm SSE3_MOVE SSE7, [esi + 1] | |
add2(SSE0, SSE1, SSE7) | |
__asm SSE3_MOVE SSE6, [esi + 2] | |
__asm SSE3_MOVE SSE7, [esi + ebx] | |
add3(SSE0, SSE1, SSE2, SSE6) | |
#if MODIFYPLUGIN == 1 | |
__asm SSE3_MOVE SSE4, [esi + ebx + 1] | |
#else | |
__asm SSE3_MOVE SSE6, [esi + ebx + 2] | |
#endif | |
add4(SSE0, SSE1, SSE2, SSE3, SSE7) | |
#if MODIFYPLUGIN == 1 | |
__asm SSE3_MOVE SSE6, [esi + ebx + 2] | |
minmax2(SSE0, SSE1, SSE2, SSE3, SSE4) | |
#endif | |
#ifndef MODIFYPLUGIN | |
__asm SSE_MOVE [edi + 1], SSE6 | |
#endif | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx] | |
minmax2(SSE0, SSE1, SSE2, SSE3, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1] | |
minmax2(SSE0, SSE1, SSE2, SSE3, SSE5) | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2] | |
minmax2(SSE0, SSE1, SSE2, SSE3, SSE7) | |
#if (ISSE > 1) || defined(SHLUR) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE5, [edi] | |
#else | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
#endif | |
minmax2sub(SSE0, SSE1, SSE2, SSE3, SSE6) | |
#ifdef SHLUR | |
sharpen(SSE5, SSE2, SSE1, rshift[eax], shift_mask[eax], SSE6, SSE7) | |
#ifdef SHARPEN | |
__asm add esi, spitch | |
#else | |
__asm add esi, eax | |
#endif | |
__asm SSE_MOVE [edi], SSE5 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE2, SSE5 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE2, [edi] | |
#else | |
__asm pmaxub SSE2, [esi + ebx + 1] | |
#endif | |
#endif // ISSE > 1 | |
__asm pminub SSE1, SSE2 | |
__asm add esi, eax | |
__asm SSE_MOVE [edi], SSE1 | |
#endif // SHLUR | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
#ifdef SHARPEN | |
void SSE_RemoveGrain3(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength) | |
#else | |
void SSE_RemoveGrain3(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
#endif | |
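// RemoveGrain mode 3: clips each pixel to the [3rd smallest, 3rd largest] of
// its eight neighbours via the minmax3 chain, ending in SSE3 (min3) and SSE2 (max3)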
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
#ifdef SHARPEN | |
__asm mov spitch, eax | |
__asm mov eax, strength | |
#endif | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm SSE3_MOVE SSE7, [esi + 1] | |
add2(SSE0, SSE1, SSE7) | |
__asm SSE3_MOVE SSE6, [esi + 2] | |
__asm SSE3_MOVE SSE5, [esi + ebx] | |
add3(SSE0, SSE1, SSE2, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + ebx + 2] | |
add4(SSE0, SSE1, SSE2, SSE3, SSE5) | |
__asm movd [edi], SSE5 | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx] | |
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1] | |
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2] | |
minmax3sub(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7) | |
#if (ISSE > 1) || defined(SHLUR) | |
__asm SSE3_MOVE SSE7, [esi + ebx + 1] | |
#endif | |
minmax2sub(SSE1, SSE2, SSE3, SSE4, SSE6) | |
#ifdef SHLUR | |
sharpen(SSE7, SSE3, SSE2, rshift[eax], shift_mask[eax], SSE0, SSE5) | |
__asm SSE_MOVE [edi + 1], SSE7 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE3, SSE7 | |
#else | |
__asm pmaxub SSE3, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE3, SSE2 | |
__asm SSE_MOVE [edi + 1], SSE3 | |
#endif // SHLUR | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm SSE3_MOVE SSE7, [esi + 1] | |
add2(SSE0, SSE1, SSE7) | |
__asm SSE3_MOVE SSE6, [esi + 2] | |
__asm SSE3_MOVE SSE5, [esi + ebx] | |
add3(SSE0, SSE1, SSE2, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + ebx + 2] | |
add4(SSE0, SSE1, SSE2, SSE3, SSE5) | |
#if MODIFYPLUGIN == 1 | |
__asm SSE3_MOVE SSE6, [esi + ebx + 1] | |
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx] | |
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 1] | |
minmax3(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2] | |
minmax3sub(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
#if ISSE > 1 | |
__asm SSE3_MOVE SSE6, [edi] | |
#endif | |
minmax2sub(SSE1, SSE2, SSE3, SSE4, SSE7) | |
#if ISSE > 1 | |
__asm pmaxub SSE3, SSE6 | |
#else | |
__asm pmaxub SSE3, [edi] | |
#endif | |
__asm pminub SSE3, SSE2 | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE3 | |
#else // MODIFYPLUGIN == 1 | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx] | |
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1] | |
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2] | |
minmax3sub(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7) | |
#if (ISSE > 1) || defined(SHLUR) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE7, [edi] | |
#else | |
__asm SSE3_MOVE SSE7, [esi + ebx + 1] | |
#endif | |
#endif | |
minmax2sub(SSE1, SSE2, SSE3, SSE4, SSE6) | |
#ifdef SHLUR | |
sharpen(SSE7, SSE3, SSE2, rshift[eax], shift_mask[eax], SSE0, SSE5) | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE7 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE3, SSE7 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE3, [edi] | |
#else | |
__asm pmaxub SSE3, [esi + ebx + 1] | |
#endif | |
#endif // ISSE > 1 | |
__asm pminub SSE3, SSE2 | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE3 | |
#endif // SHLUR | |
#endif // MODIFYPLUGIN == 1 | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm SSE3_MOVE SSE7, [esi + 1] | |
add2(SSE0, SSE1, SSE7) | |
__asm SSE3_MOVE SSE6, [esi + 2] | |
__asm SSE3_MOVE SSE5, [esi + ebx] | |
add3(SSE0, SSE1, SSE2, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + ebx + 2] | |
add4(SSE0, SSE1, SSE2, SSE3, SSE5) | |
#if MODIFYPLUGIN == 1 | |
__asm SSE3_MOVE SSE6, [esi + ebx + 1] | |
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx] | |
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 1] | |
minmax3(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2] | |
minmax3sub(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
#if ISSE > 1 | |
__asm SSE3_MOVE SSE6, [edi] | |
#endif | |
minmax2sub(SSE1, SSE2, SSE3, SSE4, SSE7) | |
#if ISSE > 1 | |
__asm pmaxub SSE3, SSE6 | |
#else | |
__asm pmaxub SSE3, [edi] | |
#endif | |
__asm pminub SSE3, SSE2 | |
__asm add esi, eax | |
__asm SSE_MOVE [edi], SSE3 | |
#else // MODIFYPLUGIN == 1 | |
#ifndef MODIFYPLUGIN | |
__asm SSE_MOVE [edi + 1], SSE7 | |
#endif | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx] | |
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1] | |
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2] | |
minmax3sub(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7) | |
#if (ISSE > 1) || defined(SHLUR) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE7, [edi] | |
#else | |
__asm SSE3_MOVE SSE7, [esi + ebx + 1] | |
#endif | |
#endif // ISSE > 1 | |
minmax2sub(SSE1, SSE2, SSE3, SSE4, SSE6) | |
#ifdef SHLUR | |
sharpen(SSE7, SSE3, SSE2, rshift[eax], shift_mask[eax], SSE0, SSE5) | |
#ifdef SHARPEN | |
__asm add esi, spitch | |
#else | |
__asm add esi, eax | |
#endif | |
__asm SSE_MOVE [edi], SSE7 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE3, SSE7 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE3, [edi] | |
#else | |
__asm pmaxub SSE3, [esi + ebx + 1] | |
#endif | |
#endif // ISSE > 1 | |
__asm pminub SSE3, SSE2 | |
__asm add esi, eax | |
__asm SSE_MOVE [edi], SSE3 | |
#endif // SHLUR | |
#endif // MODIFYPLUGIN == 1 | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
// if( weight2[i] <= weight1[i] ) { value1[i] = value2[i]; weight1[i] = weight2[i]; }
// value2 remains unchanged
// weight2 must be an SSE register; value1, value2 and weight1 may well be memory
// operands, but value1 and weight1 should be registers because they are used twice
#define mergeweighted(value1, weight1, value2, weight2) \ | |
__asm pminub weight1, weight2 \ | |
__asm pcmpeqb weight2, weight1 \ | |
__asm psubusb value1, weight2 \ | |
__asm pand weight2, value2 \ | |
__asm por value1, weight2 | |
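// scalar reading of the branchless select above (per byte):
// w1 = min(w1, w2);
// mask = (w2 == w1) ? 0xFF : 0x00;   // true exactly where w2 <= old w1
// v1 = (v1 & ~mask) | (v2 & mask);   // psubusb by 0xFF zeroes v1 under the mask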
#define merge2weighted(val1, val2, weight1, val1b, val2b, weight2) \ | |
__asm pminub weight1, weight2 \ | |
__asm pcmpeqb weight2, weight1 \ | |
__asm psubusb val1, weight2 \ | |
__asm psubusb val2, weight2 \ | |
__asm pand val1b, weight2 \ | |
__asm pand val2b, weight2 \ | |
__asm por val1, val1b \ | |
__asm por val2, val2b | |
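// merge2weighted performs the same select for two value pairs with a single
// weight comparison: val1 takes val1b and val2 takes val2b wherever
// weight2 <= weight1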
#ifndef SHLUR | |
#if MODIFYPLUGIN > 0 | |
#define diagweight5(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \ | |
__asm SSE3_MOVE newp, bound1 \ | |
__asm SSE3_MOVE reg1, bound2 \ | |
__asm SSE_RMOVE weight, newp \ | |
__asm SSE_RMOVE reg2, oldp \ | |
__asm pmaxub newp, reg1 \ | |
__asm pminub weight, reg1 \ | |
__asm pmaxub newp, center \ | |
__asm pminub reg1, center \ | |
__asm psubusb reg2, newp \ | |
__asm pminub newp, oldp \ | |
__asm pmaxub newp, weight \ | |
__asm psubusb weight, oldp \ | |
__asm pmaxub weight, reg2 | |
#else | |
// the values bound1 and bound2 are loaded into SSE registers
// then oldp is clipped to the range [min(bound1, bound2), max(bound1, bound2)]
// finally weight = |oldp - newp|
// oldp is left unchanged
#define diagweight5(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \ | |
__asm SSE3_MOVE newp, bound1 \ | |
__asm SSE3_MOVE reg1, bound2 \ | |
__asm SSE_RMOVE weight, newp \ | |
__asm SSE_RMOVE reg2, oldp \ | |
__asm pmaxub newp, reg1 \ | |
__asm pminub weight, reg1 \ | |
__asm psubusb reg2, newp \ | |
__asm pminub newp, oldp \ | |
__asm pmaxub newp, weight \ | |
__asm psubusb weight, oldp \ | |
__asm pmaxub weight, reg2 | |
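// scalar reading: lo = min(bound1, bound2); hi = max(bound1, bound2);
// newp = clip(oldp, lo, hi);
// weight = max(sat(lo - oldp), sat(oldp - hi)); // == |oldp - newp|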
#endif | |
#ifdef MODIFYPLUGIN | |
#define diagweightw5(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) diagweight5(oldp, newp, weight, center, bound1, bound2, reg1, reg2) | |
#else | |
// same as diagweight5, but in addition bound2 is written to wmem | |
#define diagweightw5(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) \ | |
__asm SSE3_MOVE newp, bound1 \ | |
__asm SSE3_MOVE reg1, bound2 \ | |
__asm SSE_RMOVE weight, newp \ | |
__asm SSE_RMOVE reg2, oldp \ | |
__asm pmaxub newp, reg1 \ | |
__asm pminub weight, reg1 \ | |
__asm psubusb reg2, newp \ | |
__asm pminub newp, oldp \ | |
__asm SSE_MOVE wmem, reg1 \ | |
__asm pmaxub newp, weight \ | |
__asm psubusb weight, oldp \ | |
__asm pmaxub weight, reg2 | |
#endif // MODIFYPLUGIN | |
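// diag5 clips the centre pixel against each of the four opposing neighbour
// pairs (both diagonals, the vertical and the horizontal) and keeps the clip
// that changes the pixel least, i.e. with the smallest |oldp - newp| weight
// (RemoveGrain mode 5)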
void diag5(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
diagweight5(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7) | |
diagweight5(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweight5(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweightw5(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], [edi], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
__asm SSE_MOVE [edi + 1], SSE1 | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE0, [edi] | |
#if MODIFYPLUGIN > 0 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
#else | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
#endif | |
diagweight5(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7) | |
diagweight5(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweight5(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweight5(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE1 | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE0, [edi] | |
#if MODIFYPLUGIN > 0 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
#else | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
#endif | |
diagweight5(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7) | |
diagweight5(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweight5(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweightw5(SSE0, SSE3, SSE4, SSE5, [esi + ebx], [esi + ebx + 2], [edi + 1], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
__asm SSE_MOVE [edi], SSE1 | |
__asm add esi, eax | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
#if MODIFYPLUGIN > 0 | |
#define diagweight6(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \ | |
__asm SSE3_MOVE newp, bound1 \ | |
__asm SSE3_MOVE reg1, bound2 \ | |
__asm SSE_RMOVE weight, newp \ | |
__asm SSE_RMOVE reg2, oldp \ | |
__asm pmaxub newp, reg1 \ | |
__asm pminub weight, reg1 \ | |
__asm pmaxub newp, center \ | |
__asm pminub weight, center \ | |
__asm psubusb reg2, newp \ | |
__asm SSE_RMOVE reg1, newp \ | |
__asm pminub newp, oldp \ | |
__asm psubusb reg1, weight \ | |
__asm pmaxub newp, weight \ | |
__asm psubusb weight, oldp \ | |
__asm pmaxub weight, reg2 \ | |
__asm paddusb weight, weight \ | |
__asm paddusb weight, reg1 | |
#else | |
// the values bound1 and bound2 are loaded into SSE registers
// then oldp is clipped to the range [min(bound1, bound2), max(bound1, bound2)]
// finally weight = 2*|oldp - newp| + |bound1 - bound2|
// oldp is left unchanged
#define diagweight6(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \ | |
__asm SSE3_MOVE newp, bound1 \ | |
__asm SSE3_MOVE reg1, bound2 \ | |
__asm SSE_RMOVE weight, newp \ | |
__asm SSE_RMOVE reg2, oldp \ | |
__asm pmaxub newp, reg1 \ | |
__asm pminub weight, reg1 \ | |
__asm psubusb reg2, newp \ | |
__asm SSE_RMOVE reg1, newp \ | |
__asm pminub newp, oldp \ | |
__asm psubusb reg1, weight \ | |
__asm pmaxub newp, weight \ | |
__asm psubusb weight, oldp \ | |
__asm pmaxub weight, reg2 \ | |
__asm paddusb weight, weight \ | |
__asm paddusb weight, reg1 | |
#endif | |
#ifdef MODIFYPLUGIN | |
#define diagweightw6(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) diagweight6(oldp, newp, weight, center, bound1, bound2, reg1, reg2) | |
#else | |
// same as diagweight6, but in addition bound2 is written to wmem | |
#define diagweightw6(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) \ | |
__asm SSE3_MOVE newp, bound1 \ | |
__asm SSE3_MOVE reg1, bound2 \ | |
__asm SSE_RMOVE weight, newp \ | |
__asm SSE_RMOVE reg2, oldp \ | |
__asm SSE_MOVE wmem, reg1 \ | |
__asm pmaxub newp, reg1 \ | |
__asm pminub weight, reg1 \ | |
__asm psubusb reg2, newp \ | |
__asm SSE_RMOVE reg1, newp \ | |
__asm pminub newp, oldp \ | |
__asm psubusb reg1, weight \ | |
__asm pmaxub newp, weight \ | |
__asm psubusb weight, oldp \ | |
__asm pmaxub weight, reg2 \ | |
__asm paddusb weight, weight \ | |
__asm paddusb weight, reg1 | |
#endif // MODIFYPLUGIN | |
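// diag6 is the same pair search but the weight also penalises the spread of
// the pair: it minimises 2*|oldp - newp| + |bound1 - bound2| (RemoveGrain
// mode 6, change weighted twice as strongly as the spread)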
void diag6(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
diagweight6(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7) | |
diagweight6(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweight6(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweightw6(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], [edi], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
__asm SSE_MOVE [edi + 1], SSE1 | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE0, [edi] | |
#if MODIFYPLUGIN > 0 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
#else | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
#endif | |
diagweight6(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7) | |
diagweight6(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweight6(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweight6(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE1 | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE0, [edi] | |
#if MODIFYPLUGIN > 0 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
#else | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
#endif | |
diagweight6(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7) | |
diagweight6(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweight6(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweightw6(SSE0, SSE3, SSE4, SSE5, [esi + ebx], [esi + ebx + 2], [edi + 1], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
__asm SSE_MOVE [edi], SSE1 | |
__asm add esi, eax | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
#if MODIFYPLUGIN > 0 | |
#define diagweight7(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \ | |
__asm SSE3_MOVE newp, bound1 \ | |
__asm SSE3_MOVE reg1, bound2 \ | |
__asm SSE_RMOVE weight, newp \ | |
__asm SSE_RMOVE reg2, oldp \ | |
__asm pmaxub newp, reg1 \ | |
__asm pminub weight, reg1 \ | |
__asm pmaxub newp, center \ | |
__asm pminub weight, center \ | |
__asm psubusb reg2, newp \ | |
__asm SSE_RMOVE reg1, newp \ | |
__asm pminub newp, oldp \ | |
__asm psubusb reg1, weight \ | |
__asm pmaxub newp, weight \ | |
__asm psubusb weight, oldp \ | |
__asm pmaxub weight, reg2 \ | |
__asm paddusb weight, reg1 | |
#else | |
// the values bound1 and bound2 are loaded into SSE registers
// then oldp is clipped to the range [min(bound1, bound2), max(bound1, bound2)]
// finally weight = |oldp - newp| + |bound1 - bound2|
// oldp is left unchanged
#define diagweight7(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \ | |
__asm SSE3_MOVE newp, bound1 \ | |
__asm SSE3_MOVE reg1, bound2 \ | |
__asm SSE_RMOVE weight, newp \ | |
__asm SSE_RMOVE reg2, oldp \ | |
__asm pmaxub newp, reg1 \ | |
__asm pminub weight, reg1 \ | |
__asm psubusb reg2, newp \ | |
__asm SSE_RMOVE reg1, newp \ | |
__asm pminub newp, oldp \ | |
__asm psubusb reg1, weight \ | |
__asm pmaxub newp, weight \ | |
__asm psubusb weight, oldp \ | |
__asm pmaxub weight, reg2 \ | |
__asm paddusb weight, reg1 | |
#endif | |
#ifdef MODIFYPLUGIN | |
#define diagweightw7(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) diagweight7(oldp, newp, weight, center, bound1, bound2, reg1, reg2) | |
#else | |
// same as diagweight7, but in addition bound2 is written to wmem | |
#define diagweightw7(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) \ | |
__asm SSE3_MOVE newp, bound1 \ | |
__asm SSE3_MOVE reg1, bound2 \ | |
__asm SSE_RMOVE weight, newp \ | |
__asm SSE_RMOVE reg2, oldp \ | |
__asm SSE_MOVE wmem, reg1 \ | |
__asm pmaxub newp, reg1 \ | |
__asm pminub weight, reg1 \ | |
__asm psubusb reg2, newp \ | |
__asm SSE_RMOVE reg1, newp \ | |
__asm pminub newp, oldp \ | |
__asm psubusb reg1, weight \ | |
__asm pmaxub newp, weight \ | |
__asm psubusb weight, oldp \ | |
__asm pmaxub weight, reg2 \ | |
__asm paddusb weight, reg1 | |
#endif // MODIFYPLUGIN | |
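// diag7 weighs change and pair spread equally: it minimises
// |oldp - newp| + |bound1 - bound2| (RemoveGrain mode 7)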
void diag7(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
diagweight7(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7) | |
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweightw7(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], [edi], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
__asm SSE_MOVE [edi + 1], SSE1 | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE0, [edi] | |
#if MODIFYPLUGIN > 0 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
#else | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
#endif | |
diagweight7(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7) | |
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE1 | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE0, [edi] | |
#if MODIFYPLUGIN > 0 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
#else | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
#endif | |
diagweight7(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7) | |
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweightw7(SSE0, SSE3, SSE4, SSE5, [esi + ebx], [esi + ebx + 2], [edi + 1], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
__asm SSE_MOVE [edi], SSE1 | |
__asm add esi, eax | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
void diag7b(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
diagweight7(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7) | |
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7) | |
__asm movd SSE6, [esi + ebx] | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
__asm movd [edi], SSE6 | |
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
__asm SSE_MOVE [edi + 1], SSE1 | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE0, [edi] | |
#if MODIFYPLUGIN > 0 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
#else | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
#endif | |
diagweight7(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7) | |
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE1 | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE0, [edi] | |
#if MODIFYPLUGIN > 0 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
#else | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
#endif | |
diagweight7(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7) | |
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7) | |
__asm movd SSE6, [esi + ebx + 6] | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
__asm movd [edi + 5], SSE6 | |
diagweight7(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
__asm SSE_MOVE [edi], SSE1 | |
__asm add esi, eax | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
#if MODIFYPLUGIN > 0 | |
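// Repair variant: as with diagweight7, the center pixel of the reference clip
// widens the clipping interval via the extra pmaxub/pminub pair below.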
#define diagweight8(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \ | |
__asm SSE3_MOVE newp, bound1 \ | |
__asm SSE3_MOVE reg1, bound2 \ | |
__asm SSE_RMOVE weight, newp \ | |
__asm SSE_RMOVE reg2, oldp \ | |
__asm pmaxub newp, reg1 \ | |
__asm pminub weight, reg1 \ | |
__asm pmaxub newp, center \ | |
__asm pminub weight, center \ | |
__asm psubusb reg2, newp \ | |
__asm SSE_RMOVE reg1, newp \ | |
__asm pminub newp, oldp \ | |
__asm psubusb reg1, weight \ | |
__asm pmaxub newp, weight \ | |
__asm psubusb weight, oldp \ | |
__asm paddusb reg1, reg1 \ | |
__asm pmaxub weight, reg2 \ | |
__asm paddusb weight, reg1 | |
#else | |
// the values bound1 and bound2 are loaded into SSE registers,
// then newp = oldp clipped to [min(bound1, bound2), max(bound1, bound2)]
// and weight = |oldp - newp| + 2*|bound1 - bound2| (saturated byte arithmetic);
// oldp itself is left unchanged
#define diagweight8(oldp, newp, weight, center, bound1, bound2, reg1, reg2) \ | |
__asm SSE3_MOVE newp, bound1 \ | |
__asm SSE3_MOVE reg1, bound2 \ | |
__asm SSE_RMOVE weight, newp \ | |
__asm SSE_RMOVE reg2, oldp \ | |
__asm pmaxub newp, reg1 \ | |
__asm pminub weight, reg1 \ | |
__asm psubusb reg2, newp \ | |
__asm SSE_RMOVE reg1, newp \ | |
__asm pminub newp, oldp \ | |
__asm psubusb reg1, weight \ | |
__asm pmaxub newp, weight \ | |
__asm psubusb weight, oldp \ | |
__asm paddusb reg1, reg1 \ | |
__asm pmaxub weight, reg2 \ | |
__asm paddusb weight, reg1 | |
#endif | |
#ifdef MODIFYPLUGIN | |
#define diagweightw8(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) diagweight8(oldp, newp, weight, center, bound1, bound2, reg1, reg2) | |
#else | |
// same as diagweight8, but in addition bound2 is written to wmem | |
#define diagweightw8(oldp, newp, weight, center, bound1, bound2, wmem, reg1, reg2) \ | |
__asm SSE3_MOVE newp, bound1 \ | |
__asm SSE3_MOVE reg1, bound2 \ | |
__asm SSE_RMOVE weight, newp \ | |
__asm SSE_RMOVE reg2, oldp \ | |
__asm SSE_MOVE wmem, reg1 \ | |
__asm pmaxub newp, reg1 \ | |
__asm pminub weight, reg1 \ | |
__asm psubusb reg2, newp \ | |
__asm SSE_RMOVE reg1, newp \ | |
__asm pminub newp, oldp \ | |
__asm psubusb reg1, weight \ | |
__asm pmaxub newp, weight \ | |
__asm psubusb weight, oldp \ | |
__asm paddusb reg1, reg1 \ | |
__asm pmaxub weight, reg2 \ | |
__asm paddusb weight, reg1 | |
#endif // MODIFYPLUGIN | |
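// Reference sketch (illustrative only): diagweight8 differs from diagweight7
// solely in its penalty term, where paddusb reg1, reg1 doubles the spread of
// the bounds before the saturated add; diagweight8_weight_ref is a
// hypothetical helper name.
static inline BYTE diagweight8_weight_ref(BYTE oldp, BYTE bound1, BYTE bound2)
{
	BYTE lo = bound1 < bound2 ? bound1 : bound2;
	BYTE hi = bound1 < bound2 ? bound2 : bound1;
	BYTE clipped = oldp < lo ? lo : (oldp > hi ? hi : oldp);
	// edge difference counted twice, everything saturated at 255
	int w = (clipped > oldp ? clipped - oldp : oldp - clipped) + 2 * (hi - lo);
	return w > 255 ? 255 : (BYTE)w;
}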
void diag8(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
diagweight8(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7) | |
diagweight8(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweight8(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweightw8(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], [edi], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
__asm SSE_MOVE [edi + 1], SSE1 | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE0, [edi] | |
#if MODIFYPLUGIN > 0 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
#else | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
#endif | |
diagweight8(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7) | |
diagweight8(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweight8(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweight8(SSE0, SSE3, SSE4, SSE5, [esi + ebx + 2], [esi + ebx], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE1 | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE0, [edi] | |
#if MODIFYPLUGIN > 0 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
#else | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
#endif | |
diagweight8(SSE0, SSE1, SSE2, SSE5, [esi], [esi + 2*ebx + 2], SSE6, SSE7) | |
diagweight8(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx], [esi + 2], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweight8(SSE0, SSE3, SSE4, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
diagweightw8(SSE0, SSE3, SSE4, SSE5, [esi + ebx], [esi + ebx + 2], [edi + 1], SSE6, SSE7) | |
mergeweighted(SSE1, SSE2, SSE3, SSE4) | |
__asm SSE_MOVE [edi], SSE1 | |
__asm add esi, eax | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
#endif // #ifndef SHLUR | |
#if MODIFYPLUGIN > 0 | |
#define get_min_weight(min, weight, center, mem1, mem2, reg) \
__asm SSE3_MOVE min, mem1 \ | |
__asm SSE3_MOVE reg, mem2 \ | |
__asm SSE_RMOVE weight, min \ | |
__asm pminub min, center \ | |
__asm pmaxub weight, center \ | |
__asm pminub min, reg \ | |
__asm pmaxub weight, reg \ | |
__asm psubusb weight, min | |
#else | |
#define get_min_weight(min, weight, center, mem1, mem2, reg) \
__asm SSE3_MOVE min, mem1 \ | |
__asm SSE3_MOVE reg, mem2 \ | |
__asm SSE_RMOVE weight, min \ | |
__asm pminub min, reg \ | |
__asm pmaxub weight, reg \ | |
__asm psubusb weight, min | |
#endif | |
#ifdef MODIFYPLUGIN | |
#define get_min_weightw(min, weight, center, mem1, mem2, wmem, reg) get_min_weight(min, weight, center, mem1, mem2, reg)
#else | |
#define get_min_weightw(min, weight, center, mem1, mem2, wmem, reg) \ | |
__asm SSE3_MOVE min, mem1 \ | |
__asm SSE3_MOVE reg, mem2 \ | |
__asm SSE_RMOVE weight, min \ | |
__asm pminub min, reg \ | |
__asm pmaxub weight, reg \ | |
__asm SSE_MOVE wmem, reg \ | |
__asm psubusb weight, min
#endif // MODIFYPLUGIN | |
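// Reference sketch (illustrative only): get_min_weight reduces an opposing
// neighbour pair to its minimum plus the size of the interval it spans; the
// Repair build (MODIFYPLUGIN > 0) additionally folds the reference center
// into the interval. get_min_weight_ref is a hypothetical helper name.
static inline void get_min_weight_ref(BYTE n1, BYTE n2, BYTE *mn, BYTE *range)
{
	BYTE mx;
	*mn = n1 < n2 ? n1 : n2;	// pminub
	mx  = n1 < n2 ? n2 : n1;	// pmaxub
	*range = (BYTE)(mx - *mn);	// psubusb: never saturates, since mx >= mn
}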
#ifdef SHARPEN | |
void diag9(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength) | |
#else | |
void diag9(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
#endif | |
{ | |
#ifdef SHARPEN | |
__asm mov ebx, strength | |
__asm SSE_RMOVE SSE4, rshift[ebx] | |
__asm SSE_RMOVE SSE6, shift_mask[ebx] | |
#endif | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
__asm align 16 | |
__asm column_loop: | |
get_min_weight(SSE0, SSE1, SSE5, [esi], [esi + 2*ebx + 2], SSE7) | |
get_min_weight(SSE2, SSE3, SSE5, [esi + 2*ebx], [esi + 2], SSE7) | |
mergeweighted(SSE0, SSE1, SSE2, SSE3) | |
get_min_weight(SSE2, SSE3, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE7) | |
mergeweighted(SSE0, SSE1, SSE2, SSE3) | |
get_min_weightw(SSE2, SSE3, SSE5, [esi + ebx + 2], [esi + ebx], [edi], SSE7) | |
mergeweighted(SSE0, SSE1, SSE2, SSE3) | |
#if (ISSE > 1) || defined(SHLUR) | |
__asm SSE3_MOVE SSE7, [esi + ebx + 1] | |
#endif | |
__asm paddusb SSE1, SSE0 | |
#ifdef SHLUR | |
sharpen(SSE7, SSE0, SSE1, SSE4, SSE6, SSE2, SSE3) | |
__asm SSE_MOVE [edi + 1], SSE7 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE0, SSE7 | |
#else | |
__asm pmaxub SSE0, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE0, SSE1 | |
__asm SSE_MOVE [edi + 1], SSE0 | |
#endif // SHLUR | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
#if MODIFYPLUGIN > 0 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
get_min_weight(SSE0, SSE1, SSE5, [esi], [esi + 2*ebx + 2], SSE7)
get_min_weight(SSE2, SSE3, SSE5, [esi + 2*ebx], [esi + 2], SSE7) | |
mergeweighted(SSE0, SSE1, SSE2, SSE3) | |
get_min_weight(SSE2, SSE3, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE7) | |
mergeweighted(SSE0, SSE1, SSE2, SSE3) | |
get_min_weight(SSE2, SSE3, SSE5, [esi + ebx + 2], [esi + ebx], SSE7) | |
mergeweighted(SSE0, SSE1, SSE2, SSE3) | |
#if (ISSE > 1) || defined(SHLUR) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE7, [edi] | |
#else | |
__asm SSE3_MOVE SSE7, [esi + ebx + 1] | |
#endif | |
#endif // (ISSE > 1) || defined(SHLUR) | |
__asm paddusb SSE1, SSE0 | |
#ifdef SHLUR | |
sharpen(SSE7, SSE0, SSE1, SSE4, SSE6, SSE2, SSE3) | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE7 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE0, SSE7 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE0, [edi] | |
#else | |
__asm pmaxub SSE0, [esi + ebx + 1] | |
#endif | |
#endif // ISSE > 1 | |
__asm pminub SSE0, SSE1 | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE0 | |
#endif // SHLUR | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
#if MODIFYPLUGIN > 0 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
get_min_weight(SSE0, SSE1, SSE5, [esi], [esi + 2*ebx + 2], SSE7) | |
get_min_weight(SSE2, SSE3, SSE5, [esi + 2*ebx], [esi + 2], SSE7) | |
mergeweighted(SSE0, SSE1, SSE2, SSE3) | |
get_min_weight(SSE2, SSE3, SSE5, [esi + 2*ebx + 1], [esi + 1], SSE7) | |
mergeweighted(SSE0, SSE1, SSE2, SSE3) | |
get_min_weightw(SSE2, SSE3, SSE5, [esi + ebx], [esi + ebx + 2], [edi + 1], SSE7) | |
mergeweighted(SSE0, SSE1, SSE2, SSE3) | |
#if (ISSE > 1) || defined(SHLUR) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE7, [edi] | |
#else | |
__asm SSE3_MOVE SSE7, [esi + ebx + 1] | |
#endif | |
#endif // (ISSE > 1) || defined(SHLUR) | |
__asm paddusb SSE1, SSE0 | |
#ifdef SHLUR | |
sharpen(SSE7, SSE0, SSE1, SSE4, SSE6, SSE2, SSE3) | |
__asm add esi, eax | |
__asm SSE_MOVE [edi], SSE7 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE0, SSE7 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE0, [edi] | |
#else | |
__asm pmaxub SSE0, [esi + ebx + 1] | |
#endif | |
#endif // ISSE > 1 | |
__asm pminub SSE0, SSE1 | |
__asm add esi, eax | |
__asm SSE_MOVE [edi], SSE0 | |
#endif // SHLUR | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
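// Reference sketch (illustrative only): per pixel, diag9 in the default
// (non-SHLUR) build keeps the opposing neighbour pair with the smallest
// range and clips the center into it; the paddusb SSE1, SSE0 above rebuilds
// that pair's upper bound as min + range. diag9_clip_ref is a hypothetical
// helper name.
static inline BYTE diag9_clip_ref(BYTE center, BYTE best_min, BYTE best_range)
{
	int hi = best_min + best_range;		// paddusb would saturate at 255
	if (hi > 255) hi = 255;
	if (center < best_min) return best_min;	// pmaxub
	return center > hi ? (BYTE)hi : center;	// pminub
}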
#define get_val_weight(val, weight, mem, center, reg) \ | |
__asm SSE3_MOVE val, mem \ | |
__asm SSE_RMOVE weight, center \ | |
__asm SSE_RMOVE reg, center \ | |
__asm pmaxub weight, val \ | |
__asm pminub reg, val \ | |
__asm psubusb weight, reg | |
#ifdef MODIFYPLUGIN | |
#define get_val_weightw(val, weight, mem, center, wmem, reg) get_val_weight(val, weight, mem, center, reg) | |
#else | |
#define get_val_weightw1(val, weight, mem, center, wmem, reg) \ | |
__asm SSE3_MOVE val, mem \ | |
__asm SSE_RMOVE weight, center \ | |
__asm SSE_RMOVE reg, center \ | |
__asm pmaxub weight, val \ | |
__asm pminub reg, val \ | |
__asm movd wmem, val \ | |
__asm psubusb weight, reg | |
#define get_val_weightw(val, weight, mem, center, wmem, reg) \ | |
__asm SSE3_MOVE val, mem \ | |
__asm SSE_RMOVE weight, center \ | |
__asm SSE_RMOVE reg, center \ | |
__asm pmaxub weight, val \ | |
__asm pminub reg, val \ | |
__asm SSE_MOVE wmem, val \ | |
__asm psubusb weight, reg | |
#endif // MODIFYPLUGIN | |
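// Reference sketch (illustrative only): get_val_weight pairs each neighbour
// value with |center - neighbour|; the mergeweighted cascade in
// SSE_RemoveGrain10 then keeps a neighbour with the smallest difference
// (tie-breaking may differ from this loop), and the final pminub/pmaxub pair
// replaces the center by that value. remove_grain10_ref is a hypothetical
// helper name.
static inline BYTE remove_grain10_ref(BYTE center, const BYTE neighbours[8])
{
	BYTE best = neighbours[0];
	int bestw = center > best ? center - best : best - center;
	for (int i = 1; i < 8; ++i)
	{
		int w = center > neighbours[i] ? center - neighbours[i] : neighbours[i] - center;
		if (w < bestw) { bestw = w; best = neighbours[i]; }	// mergeweighted
	}
	return best;
}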
void SSE_RemoveGrain10(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE1, [esi + ebx + 1] | |
get_val_weightw1(SSE2, SSE3, [esi + ebx], SSE1, [edi], SSE7) | |
get_val_weight(SSE4, SSE5, [esi + ebx + 2], SSE1, SSE7) | |
mergeweighted(SSE2, SSE3, SSE4, SSE5) | |
get_val_weight(SSE4, SSE5, [esi], SSE1, SSE7) | |
mergeweighted(SSE2, SSE3, SSE4, SSE5) | |
get_val_weight(SSE4, SSE5, [esi + 2], SSE1, SSE7) | |
mergeweighted(SSE2, SSE3, SSE4, SSE5) | |
get_val_weight(SSE4, SSE5, [esi + 1], SSE1, SSE7) | |
mergeweighted(SSE2, SSE3, SSE4, SSE5) | |
get_val_weight(SSE4, SSE5, [esi + 2*ebx], SSE1, SSE7) | |
mergeweighted(SSE2, SSE3, SSE4, SSE5) | |
get_val_weight(SSE4, SSE5, [esi + 2*ebx + 2], SSE1, SSE7) | |
mergeweighted(SSE2, SSE3, SSE4, SSE5) | |
get_val_weight(SSE4, SSE5, [esi + 2*ebx + 1], SSE1, SSE7) | |
mergeweighted(SSE2, SSE3, SSE4, SSE5) | |
__asm SSE_MOVE SSE4, SSE2 | |
__asm pminub SSE1, SSE2 | |
__asm pmaxub SSE1, SSE4 | |
__asm SSE_MOVE [edi + 1], SSE1 | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE1, [edi] | |
#else | |
__asm SSE3_MOVE SSE1, [esi + ebx + 1] | |
#endif | |
get_val_weight(SSE2, SSE3, [esi + ebx], SSE1, SSE7) | |
#if MODIFYPLUGIN == 1 | |
get_val_weight(SSE4, SSE5, [esi + ebx + 1], SSE1, SSE7) | |
mergeweighted(SSE2, SSE3, SSE4, SSE5) | |
#endif | |
get_val_weightw(SSE4, SSE5, [esi + ebx + 2], SSE1, [edi + 1], SSE7) | |
mergeweighted(SSE2, SSE3, SSE4, SSE5) | |
get_val_weight(SSE4, SSE5, [esi], SSE1, SSE7) | |
mergeweighted(SSE2, SSE3, SSE4, SSE5) | |
get_val_weight(SSE4, SSE5, [esi + 2], SSE1, SSE7) | |
mergeweighted(SSE2, SSE3, SSE4, SSE5) | |
get_val_weight(SSE4, SSE5, [esi + 1], SSE1, SSE7) | |
mergeweighted(SSE2, SSE3, SSE4, SSE5) | |
get_val_weight(SSE4, SSE5, [esi + 2*ebx], SSE1, SSE7) | |
mergeweighted(SSE2, SSE3, SSE4, SSE5) | |
get_val_weight(SSE4, SSE5, [esi + 2*ebx + 2], SSE1, SSE7) | |
mergeweighted(SSE2, SSE3, SSE4, SSE5) | |
get_val_weight(SSE4, SSE5, [esi + 2*ebx + 1], SSE1, SSE7) | |
mergeweighted(SSE2, SSE3, SSE4, SSE5) | |
__asm SSE_MOVE SSE4, SSE2 | |
__asm pminub SSE1, SSE2 | |
__asm add esi, SSE_INCREMENT | |
__asm pmaxub SSE1, SSE4 | |
__asm SSE_MOVE [edi], SSE1 | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE1, [edi] | |
#else | |
__asm SSE3_MOVE SSE1, [esi + ebx + 1] | |
#endif | |
get_val_weight(SSE2, SSE3, [esi + ebx], SSE1, SSE7) | |
#if MODIFYPLUGIN == 1 | |
get_val_weight(SSE4, SSE5, [esi + ebx + 1], SSE1, SSE7) | |
mergeweighted(SSE2, SSE3, SSE4, SSE5) | |
#endif | |
get_val_weight(SSE4, SSE5, [esi + ebx + 2], SSE1, SSE7) | |
mergeweighted(SSE2, SSE3, SSE4, SSE5) | |
get_val_weight(SSE4, SSE5, [esi], SSE1, SSE7) | |
mergeweighted(SSE2, SSE3, SSE4, SSE5) | |
get_val_weight(SSE4, SSE5, [esi + 2], SSE1, SSE7) | |
mergeweighted(SSE2, SSE3, SSE4, SSE5) | |
get_val_weight(SSE4, SSE5, [esi + 1], SSE1, SSE7) | |
mergeweighted(SSE2, SSE3, SSE4, SSE5) | |
get_val_weight(SSE4, SSE5, [esi + 2*ebx], SSE1, SSE7) | |
mergeweighted(SSE2, SSE3, SSE4, SSE5) | |
get_val_weight(SSE4, SSE5, [esi + 2*ebx + 2], SSE1, SSE7) | |
mergeweighted(SSE2, SSE3, SSE4, SSE5) | |
get_val_weight(SSE4, SSE5, [esi + 2*ebx + 1], SSE1, SSE7) | |
mergeweighted(SSE2, SSE3, SSE4, SSE5) | |
__asm SSE_MOVE SSE4, SSE2 | |
__asm pminub SSE1, SSE2 | |
__asm pmaxub SSE1, SSE4 | |
__asm add esi, eax | |
__asm SSE_MOVE [edi], SSE1 | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
#if !(defined(MODIFYPLUGIN) || defined(SHLUR)) | |
#define neighbourdiff(minus, plus, center1, center2, neighbour, nullreg) \ | |
__asm SSE_RMOVE center1, center2 \ | |
__asm psubusb center2, neighbour \ | |
__asm psubusb neighbour, center1 \ | |
__asm SSE_RMOVE minus, center2 \ | |
__asm SSE_RMOVE plus, neighbour \ | |
__asm pcmpeqb center2, nullreg \ | |
__asm pcmpeqb neighbour, nullreg \ | |
__asm por minus, center2 \ | |
__asm pand center2, neighbour \ | |
__asm por plus, neighbour \ | |
__asm psubusb minus, center2 \ | |
__asm psubusb plus, center2 | |
#define neighbourdiff_w(minus, plus, center1, center2, dest, neighbour, nullreg, mwrite) \ | |
__asm SSE_RMOVE center1, center2 \ | |
__asm mwrite dest, neighbour \ | |
__asm psubusb center2, neighbour \ | |
__asm psubusb neighbour, center1 \ | |
__asm SSE_RMOVE minus, center2 \ | |
__asm SSE_RMOVE plus, neighbour \ | |
__asm pcmpeqb center2, nullreg \ | |
__asm pcmpeqb neighbour, nullreg \ | |
__asm por minus, center2 \ | |
__asm pand center2, neighbour \ | |
__asm por plus, neighbour \ | |
__asm psubusb minus, center2 \ | |
__asm psubusb plus, center2 | |
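// Reference sketch (illustrative only): per byte, neighbourdiff classifies
// the center against one neighbour. The pcmpeqb/por/pand/psubusb sequence
// forces the wrong-side difference to 255 and collapses both differences to
// 0 on equality, so the pminub cascades below find the closest neighbour
// strictly below (minus) and strictly above (plus) the center, and an exact
// match freezes the pixel in sharpen(). neighbourdiff_ref is a hypothetical
// helper name.
static inline void neighbourdiff_ref(BYTE center, BYTE n, BYTE *minus, BYTE *plus)
{
	if (center == n)     { *minus = 0;   *plus = 0; }
	else if (n < center) { *minus = (BYTE)(center - n); *plus = 255; }
	else                 { *minus = 255; *plus = (BYTE)(n - center); }
}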
#define SHIFT_MASK1 127 | |
static const __declspec(align(SSE_INCREMENT)) BYTE shift_mask[SSE_INCREMENT] = | |
{ | |
SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1 | |
#if SSE_INCREMENT == 16 | |
, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1, SHIFT_MASK1 | |
#endif | |
}; | |
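// Reference sketch (illustrative only): MMX/SSE2 have no per-byte shift, so
// sharpen() shifts 16-bit lanes with psrlw and masks with SHIFT_MASK1 to
// clear the bit that leaks in from each byte's upper neighbour, emulating a
// per-byte halving. half_byte_ref is a hypothetical helper name.
static inline BYTE half_byte_ref(BYTE x)
{
	return (BYTE)((x >> 1) & SHIFT_MASK1);	// == x / 2 for any byte value
}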
#define sharpen(center, minus, plus, reg1, reg2)\ | |
__asm SSE_RMOVE reg1, minus \ | |
__asm SSE_RMOVE reg2, plus \ | |
__asm psubusb reg1, plus \ | |
__asm psubusb reg2, minus \ | |
__asm psrlw plus, 1 \ | |
__asm psrlw minus, 1 \ | |
__asm pand plus, shift_mask \ | |
__asm pand minus, shift_mask \ | |
__asm pminub plus, reg1 \ | |
__asm pminub minus, reg2 \ | |
__asm paddusb center, plus \ | |
__asm psubusb center, minus | |
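// Reference sketch (illustrative only): per byte, sharpen() receives the gap
// to the closest neighbour below (minus) and above (plus). At most one of the
// two adjustments below is nonzero, so the center moves toward its nearest
// neighbour by at most half that gap, further capped by the saturated
// difference of the two gaps. sharpen_ref is a hypothetical helper name.
static inline BYTE sharpen_ref(BYTE center, BYTE minus, BYTE plus)
{
	int up_cap   = minus > plus  ? minus - plus  : 0;	// psubusb reg1, plus
	int down_cap = plus  > minus ? plus  - minus : 0;	// psubusb reg2, minus
	int up   = plus  / 2 < up_cap   ? plus  / 2 : up_cap;	// pminub after halving
	int down = minus / 2 < down_cap ? minus / 2 : down_cap;
	int r = center + up - down;				// paddusb, then psubusb
	return r < 0 ? 0 : (r > 255 ? (BYTE)255 : (BYTE)r);
}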
void nondestructivesharpen(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
__asm mov edx, remainder | |
__asm pxor SSE0, SSE0 | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
__asm lea eax, [ebx + eax + 1] | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE1, [esi + ebx + 1] | |
__asm SSE3_MOVE SSE3, [esi + ebx] | |
neighbourdiff_w(SSE4, SSE5, SSE2, SSE1, [edi], SSE3, SSE0, movd) | |
__asm SSE3_MOVE SSE3, [esi + ebx + 2] | |
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0) | |
__asm pminub SSE4, SSE6 | |
__asm pminub SSE5, SSE7 | |
__asm SSE3_MOVE SSE3, [esi] | |
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0) | |
__asm pminub SSE4, SSE6 | |
__asm pminub SSE5, SSE7 | |
__asm SSE3_MOVE SSE3, [esi + 1] | |
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0) | |
__asm pminub SSE4, SSE6 | |
__asm pminub SSE5, SSE7 | |
__asm SSE3_MOVE SSE3, [esi + 2] | |
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0) | |
__asm pminub SSE4, SSE6 | |
__asm pminub SSE5, SSE7 | |
__asm SSE3_MOVE SSE3, [esi + 2*ebx] | |
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0) | |
__asm pminub SSE4, SSE6 | |
__asm pminub SSE5, SSE7 | |
__asm SSE3_MOVE SSE3, [esi + 2*ebx + 1] | |
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0) | |
__asm pminub SSE4, SSE6 | |
__asm pminub SSE5, SSE7 | |
__asm SSE3_MOVE SSE3, [esi + 2*ebx + 2] | |
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0) | |
__asm pminub SSE4, SSE6 | |
__asm pminub SSE5, SSE7 | |
sharpen(SSE1, SSE4, SSE5, SSE6, SSE7) | |
__asm SSE_MOVE [edi + 1], SSE1 | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
__asm align 16 | |
__asm middle_loop: | |
__asm SSE3_MOVE SSE1, [esi + ebx + 1] | |
__asm SSE3_MOVE SSE3, [esi + ebx] | |
neighbourdiff(SSE4, SSE5, SSE2, SSE1, SSE3, SSE0) | |
__asm SSE3_MOVE SSE3, [esi + ebx + 2] | |
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0) | |
__asm pminub SSE4, SSE6 | |
__asm pminub SSE5, SSE7 | |
__asm SSE3_MOVE SSE3, [esi] | |
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0) | |
__asm pminub SSE4, SSE6 | |
__asm pminub SSE5, SSE7 | |
__asm SSE3_MOVE SSE3, [esi + 1] | |
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0) | |
__asm pminub SSE4, SSE6 | |
__asm pminub SSE5, SSE7 | |
__asm SSE3_MOVE SSE3, [esi + 2] | |
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0) | |
__asm pminub SSE4, SSE6 | |
__asm pminub SSE5, SSE7 | |
__asm SSE3_MOVE SSE3, [esi + 2*ebx] | |
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0) | |
__asm pminub SSE4, SSE6 | |
__asm pminub SSE5, SSE7 | |
__asm SSE3_MOVE SSE3, [esi + 2*ebx + 1] | |
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0) | |
__asm pminub SSE4, SSE6 | |
__asm pminub SSE5, SSE7 | |
__asm SSE3_MOVE SSE3, [esi + 2*ebx + 2] | |
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0) | |
__asm pminub SSE4, SSE6 | |
__asm pminub SSE5, SSE7 | |
__asm add esi, SSE_INCREMENT | |
sharpen(SSE1, SSE4, SSE5, SSE6, SSE7) | |
__asm SSE_MOVE [edi], SSE1 | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
__asm SSE3_MOVE SSE1, [esi + ebx + 1] | |
__asm SSE3_MOVE SSE3, [esi + ebx] | |
neighbourdiff(SSE4, SSE5, SSE2, SSE1, SSE3, SSE0) | |
__asm SSE3_MOVE SSE3, [esi + ebx + 2] | |
neighbourdiff_w(SSE6, SSE7, SSE1, SSE2, [edi + 1], SSE3, SSE0, SSE_MOVE) | |
__asm pminub SSE4, SSE6 | |
__asm pminub SSE5, SSE7 | |
__asm SSE3_MOVE SSE3, [esi] | |
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0) | |
__asm pminub SSE4, SSE6 | |
__asm pminub SSE5, SSE7 | |
__asm SSE3_MOVE SSE3, [esi + 1] | |
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0) | |
__asm pminub SSE4, SSE6 | |
__asm pminub SSE5, SSE7 | |
__asm SSE3_MOVE SSE3, [esi + 2] | |
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0) | |
__asm pminub SSE4, SSE6 | |
__asm pminub SSE5, SSE7 | |
__asm SSE3_MOVE SSE3, [esi + 2*ebx] | |
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0) | |
__asm pminub SSE4, SSE6 | |
__asm pminub SSE5, SSE7 | |
__asm SSE3_MOVE SSE3, [esi + 2*ebx + 1] | |
neighbourdiff(SSE6, SSE7, SSE2, SSE1, SSE3, SSE0) | |
__asm pminub SSE4, SSE6 | |
__asm pminub SSE5, SSE7 | |
__asm SSE3_MOVE SSE3, [esi + 2*ebx + 2] | |
neighbourdiff(SSE6, SSE7, SSE1, SSE2, SSE3, SSE0) | |
__asm pminub SSE4, SSE6 | |
__asm pminub SSE5, SSE7 | |
__asm add esi, eax | |
sharpen(SSE1, SSE4, SSE5, SSE6, SSE7) | |
__asm SSE_MOVE [edi], SSE1 | |
__asm add edi, dpitch | |
__asm dec height | |
__asm jnz column_loop | |
} | |
#endif // !(defined(MODIFYPLUGIN) || defined(SHLUR))
#ifndef MODIFYPLUGIN | |
#define convolution(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \ | |
__asm SSE3_MOVE reg0, [saddr + spitch + 1] \ | |
__asm SSE_MOVE reg1, reg0 \ | |
__asm punpcklbw reg0, nullreg \ | |
__asm punpckhbw reg1, nullreg \ | |
__asm SSE3_MOVE reg2, [saddr + spitch] \ | |
__asm paddusw reg0, reg0 \ | |
__asm SSE_MOVE reg3, reg2 \ | |
__asm paddusw reg1, reg1 \ | |
__asm punpcklbw reg2, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm SSE3_MOVE reg4, [saddr + spitch + 2] \ | |
__asm paddusw reg0, reg2 \ | |
__asm SSE_MOVE reg5, reg4 \ | |
__asm paddusw reg1, reg3 \ | |
__asm punpcklbw reg4, nullreg \ | |
__asm punpckhbw reg5, nullreg \ | |
__asm SSE3_MOVE reg2, [saddr + 1] \ | |
__asm paddusw reg0, reg4 \ | |
__asm SSE_MOVE reg3, reg2 \ | |
__asm paddusw reg1, reg5 \ | |
__asm punpcklbw reg2, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 1] \ | |
__asm paddusw reg0, reg2 \ | |
__asm SSE_MOVE reg5, reg4 \ | |
__asm paddusw reg1, reg3 \ | |
__asm punpcklbw reg4, nullreg \ | |
__asm punpckhbw reg5, nullreg \ | |
__asm SSE3_MOVE reg2, [saddr] \ | |
__asm paddusw reg0, reg4 \ | |
__asm SSE_MOVE reg3, reg2 \ | |
__asm paddusw reg1, reg5 \ | |
__asm punpcklbw reg2, nullreg \ | |
__asm paddusw reg0, reg0 \ | |
__asm paddusw reg1, reg1 \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm SSE3_MOVE reg4, [saddr + 2] \ | |
__asm paddusw reg0, reg2 \ | |
__asm SSE_MOVE reg5, reg4 \ | |
__asm paddusw reg1, reg3 \ | |
__asm punpcklbw reg4, nullreg \ | |
__asm punpckhbw reg5, nullreg \ | |
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \ | |
__asm paddusw reg0, reg4 \ | |
__asm SSE_MOVE reg3, reg2 \ | |
__asm paddusw reg1, reg5 \ | |
__asm punpcklbw reg2, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 2] \ | |
__asm paddusw reg0, reg2 \ | |
__asm SSE_MOVE reg5, reg4 \ | |
__asm paddusw reg1, reg3 \ | |
__asm punpcklbw reg4, nullreg \ | |
__asm punpckhbw reg5, nullreg \ | |
__asm paddusw reg0, reg4 \ | |
__asm paddusw reg1, reg5 \ | |
__asm paddusw reg0, bias_correction \ | |
__asm paddusw reg1, bias_correction \ | |
__asm psraw reg0, 4 \ | |
__asm psraw reg1, 4 \ | |
__asm packuswb reg0, reg1 \ | |
__asm SSE_MOVE [daddr], reg0 | |
#define convolution_w1(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \ | |
__asm SSE3_MOVE reg0, [saddr + spitch + 1] \ | |
__asm SSE_MOVE reg1, reg0 \ | |
__asm punpcklbw reg0, nullreg \ | |
__asm punpckhbw reg1, nullreg \ | |
__asm SSE3_MOVE reg2, [saddr + spitch] \ | |
__asm paddusw reg0, reg0 \ | |
__asm SSE_MOVE reg3, reg2 \ | |
__asm movd [daddr], reg2 \ | |
__asm paddusw reg1, reg1 \ | |
__asm punpcklbw reg2, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm SSE3_MOVE reg4, [saddr + spitch + 2] \ | |
__asm paddusw reg0, reg2 \ | |
__asm SSE_MOVE reg5, reg4 \ | |
__asm paddusw reg1, reg3 \ | |
__asm punpcklbw reg4, nullreg \ | |
__asm punpckhbw reg5, nullreg \ | |
__asm SSE3_MOVE reg2, [saddr + 1] \ | |
__asm paddusw reg0, reg4 \ | |
__asm SSE_MOVE reg3, reg2 \ | |
__asm paddusw reg1, reg5 \ | |
__asm punpcklbw reg2, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 1] \ | |
__asm paddusw reg0, reg2 \ | |
__asm SSE_MOVE reg5, reg4 \ | |
__asm paddusw reg1, reg3 \ | |
__asm punpcklbw reg4, nullreg \ | |
__asm punpckhbw reg5, nullreg \ | |
__asm SSE3_MOVE reg2, [saddr] \ | |
__asm paddusw reg0, reg4 \ | |
__asm SSE_MOVE reg3, reg2 \ | |
__asm paddusw reg1, reg5 \ | |
__asm punpcklbw reg2, nullreg \ | |
__asm paddusw reg0, reg0 \ | |
__asm paddusw reg1, reg1 \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm SSE3_MOVE reg4, [saddr + 2] \ | |
__asm paddusw reg0, reg2 \ | |
__asm SSE_MOVE reg5, reg4 \ | |
__asm paddusw reg1, reg3 \ | |
__asm punpcklbw reg4, nullreg \ | |
__asm punpckhbw reg5, nullreg \ | |
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \ | |
__asm paddusw reg0, reg4 \ | |
__asm SSE_MOVE reg3, reg2 \ | |
__asm paddusw reg1, reg5 \ | |
__asm punpcklbw reg2, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 2] \ | |
__asm paddusw reg0, reg2 \ | |
__asm SSE_MOVE reg5, reg4 \ | |
__asm paddusw reg1, reg3 \ | |
__asm punpcklbw reg4, nullreg \ | |
__asm punpckhbw reg5, nullreg \ | |
__asm paddusw reg0, reg4 \ | |
__asm paddusw reg1, reg5 \ | |
__asm paddusw reg0, bias_correction \ | |
__asm paddusw reg1, bias_correction \ | |
__asm psraw reg0, 4 \ | |
__asm psraw reg1, 4 \ | |
__asm packuswb reg0, reg1 \ | |
__asm SSE_MOVE [daddr + 1], reg0 | |
#define convolution_w2(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \ | |
__asm SSE3_MOVE reg0, [saddr + spitch + 1] \ | |
__asm SSE_MOVE reg1, reg0 \ | |
__asm punpcklbw reg0, nullreg \ | |
__asm punpckhbw reg1, nullreg \ | |
__asm SSE3_MOVE reg2, [saddr + spitch] \ | |
__asm paddusw reg0, reg0 \ | |
__asm SSE_MOVE reg3, reg2 \ | |
__asm paddusw reg1, reg1 \ | |
__asm punpcklbw reg2, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm SSE3_MOVE reg4, [saddr + spitch + 2] \ | |
__asm paddusw reg0, reg2 \ | |
__asm SSE_MOVE reg5, reg4 \ | |
__asm SSE_MOVE [daddr + 1], reg4 \ | |
__asm paddusw reg1, reg3 \ | |
__asm punpcklbw reg4, nullreg \ | |
__asm punpckhbw reg5, nullreg \ | |
__asm SSE3_MOVE reg2, [saddr + 1] \ | |
__asm paddusw reg0, reg4 \ | |
__asm SSE_MOVE reg3, reg2 \ | |
__asm paddusw reg1, reg5 \ | |
__asm punpcklbw reg2, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 1] \ | |
__asm paddusw reg0, reg2 \ | |
__asm SSE_MOVE reg5, reg4 \ | |
__asm paddusw reg1, reg3 \ | |
__asm punpcklbw reg4, nullreg \ | |
__asm punpckhbw reg5, nullreg \ | |
__asm SSE3_MOVE reg2, [saddr] \ | |
__asm paddusw reg0, reg4 \ | |
__asm SSE_MOVE reg3, reg2 \ | |
__asm paddusw reg1, reg5 \ | |
__asm punpcklbw reg2, nullreg \ | |
__asm paddusw reg0, reg0 \ | |
__asm paddusw reg1, reg1 \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm SSE3_MOVE reg4, [saddr + 2] \ | |
__asm paddusw reg0, reg2 \ | |
__asm SSE_MOVE reg5, reg4 \ | |
__asm paddusw reg1, reg3 \ | |
__asm punpcklbw reg4, nullreg \ | |
__asm punpckhbw reg5, nullreg \ | |
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \ | |
__asm paddusw reg0, reg4 \ | |
__asm SSE_MOVE reg3, reg2 \ | |
__asm paddusw reg1, reg5 \ | |
__asm punpcklbw reg2, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 2] \ | |
__asm paddusw reg0, reg2 \ | |
__asm SSE_MOVE reg5, reg4 \ | |
__asm paddusw reg1, reg3 \ | |
__asm punpcklbw reg4, nullreg \ | |
__asm punpckhbw reg5, nullreg \ | |
__asm paddusw reg0, reg4 \ | |
__asm paddusw reg1, reg5 \ | |
__asm paddusw reg0, bias_correction \ | |
__asm paddusw reg1, bias_correction \ | |
__asm psraw reg0, 4 \ | |
__asm psraw reg1, 4 \ | |
__asm packuswb reg0, reg1 \ | |
__asm SSE_MOVE [daddr], reg0 | |
static const __declspec(align(SSE_INCREMENT)) unsigned short convolution_bias[SSE_INCREMENT/2] = | |
{ | |
8,8,8,8 | |
#if SSE_INCREMENT == 16 | |
,8,8,8,8 | |
#endif | |
}; | |
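// Illustrative note: the convolution macros above accumulate the 3x3 window
// with weights [1 2 1; 2 4 2; 1 2 1] in 16-bit lanes (at most 16*255 + 8 =
// 4088, so psraw never sees a negative word), add the bias of 8 and shift
// right by 4, i.e. (weighted_sum + 8) / 16, matching the CVERSION loop below.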
void SSE_RemoveGrain11(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
#ifdef CVERSION | |
_sp -= spitch; | |
int width = (hblocks + 2) * SSE_INCREMENT + remainder; | |
int spitch2 = spitch - width; | |
dpitch -= width; | |
do | |
{ | |
int w = width; | |
dp[0] = _sp[spitch]; | |
do | |
{ | |
*++dp = (2*(_sp[spitch] + 2 * _sp[spitch + 1] + _sp[spitch + 2] + _sp[1] + _sp[2 * spitch + 1]) | |
+ _sp[0] + _sp[2] + _sp[2 * spitch] + _sp[2 * spitch + 2] + 8) / 16; | |
++_sp; | |
} while( --w ); | |
dp[1] = _sp[spitch + 1]; | |
dp += dpitch; | |
_sp += spitch2; | |
} while( --height ); | |
#else | |
__asm SSE_RMOVE SSE7, convolution_bias | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
__asm pxor SSE6, SSE6 | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
__asm align 16 | |
__asm column_loop: | |
convolution_w1(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5) | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
convolution(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5) | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
convolution_w2(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5) | |
__asm add esi, eax | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
#endif | |
} | |
#define flatconvolution(daddr, saddr, spitch, nullreg, onenineth, reg0, reg1, reg2, reg3, reg4, reg5) \ | |
__asm SSE3_MOVE reg0, [saddr] \ | |
__asm SSE3_MOVE reg1, [saddr + 1] \ | |
__asm SSE_RMOVE reg2, reg0 \ | |
__asm SSE_RMOVE reg3, reg1 \ | |
__asm punpcklbw reg0, nullreg \ | |
__asm punpcklbw reg1, nullreg \ | |
__asm punpckhbw reg2, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm paddusw reg0, reg1 \ | |
__asm paddusw reg2, reg3 \ | |
__asm SSE3_MOVE reg4, [saddr + 2] \ | |
__asm SSE3_MOVE reg1, [saddr + ebx] \ | |
__asm SSE_RMOVE reg5, reg4 \ | |
__asm SSE_RMOVE reg3, reg1 \ | |
__asm punpcklbw reg4, nullreg \ | |
__asm punpcklbw reg1, nullreg \ | |
__asm punpckhbw reg5, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm paddusw reg0, reg4 \ | |
__asm paddusw reg2, reg5 \ | |
__asm paddusw reg0, reg1 \ | |
__asm paddusw reg2, reg3 \ | |
__asm SSE3_MOVE reg4, [saddr + ebx + 1] \ | |
__asm SSE3_MOVE reg1, [saddr + ebx + 2] \ | |
__asm SSE_RMOVE reg5, reg4 \ | |
__asm SSE_RMOVE reg3, reg1 \ | |
__asm punpcklbw reg4, nullreg \ | |
__asm punpcklbw reg1, nullreg \ | |
__asm punpckhbw reg5, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm paddusw reg0, reg4 \ | |
__asm paddusw reg2, reg5 \ | |
__asm paddusw reg0, reg1 \ | |
__asm paddusw reg2, reg3 \ | |
__asm SSE3_MOVE reg4, [saddr + 2*ebx] \ | |
__asm SSE3_MOVE reg1, [saddr + 2*ebx + 1] \ | |
__asm SSE_RMOVE reg5, reg4 \ | |
__asm SSE_RMOVE reg3, reg1 \ | |
__asm punpcklbw reg4, nullreg \ | |
__asm punpcklbw reg1, nullreg \ | |
__asm punpckhbw reg5, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm paddusw reg0, reg4 \ | |
__asm paddusw reg2, reg5 \ | |
__asm paddusw reg0, reg1 \ | |
__asm paddusw reg2, reg3 \ | |
__asm SSE3_MOVE reg1, [saddr + 2*ebx + 2] \ | |
__asm SSE_RMOVE reg3, reg1 \ | |
__asm punpcklbw reg1, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm paddusw reg0, reg1 \ | |
__asm paddusw reg2, reg3 \ | |
__asm paddusw reg0, flatconvolution_bias \ | |
__asm paddusw reg2, flatconvolution_bias \ | |
__asm pmulhuw reg0, onenineth \ | |
__asm pmulhuw reg2, onenineth \ | |
__asm packuswb reg0, reg2 \ | |
__asm SSE_MOVE [daddr], reg0 | |
#define flatconvolution_w1(daddr, saddr, spitch, nullreg, onenineth, reg0, reg1, reg2, reg3, reg4, reg5) \ | |
__asm SSE3_MOVE reg0, [saddr] \ | |
__asm SSE3_MOVE reg1, [saddr + 1] \ | |
__asm SSE_RMOVE reg2, reg0 \ | |
__asm SSE_RMOVE reg3, reg1 \ | |
__asm punpcklbw reg0, nullreg \ | |
__asm punpcklbw reg1, nullreg \ | |
__asm punpckhbw reg2, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm paddusw reg0, reg1 \ | |
__asm paddusw reg2, reg3 \ | |
__asm SSE3_MOVE reg4, [saddr + 2] \ | |
__asm SSE3_MOVE reg1, [saddr + ebx] \ | |
__asm SSE_RMOVE reg5, reg4 \ | |
__asm movd [daddr], reg1 \ | |
__asm SSE_RMOVE reg3, reg1 \ | |
__asm punpcklbw reg4, nullreg \ | |
__asm punpcklbw reg1, nullreg \ | |
__asm punpckhbw reg5, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm paddusw reg0, reg4 \ | |
__asm paddusw reg2, reg5 \ | |
__asm paddusw reg0, reg1 \ | |
__asm paddusw reg2, reg3 \ | |
__asm SSE3_MOVE reg4, [saddr + ebx + 1] \ | |
__asm SSE3_MOVE reg1, [saddr + ebx + 2] \ | |
__asm SSE_RMOVE reg5, reg4 \ | |
__asm SSE_RMOVE reg3, reg1 \ | |
__asm punpcklbw reg4, nullreg \ | |
__asm punpcklbw reg1, nullreg \ | |
__asm punpckhbw reg5, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm paddusw reg0, reg4 \ | |
__asm paddusw reg2, reg5 \ | |
__asm paddusw reg0, reg1 \ | |
__asm paddusw reg2, reg3 \ | |
__asm SSE3_MOVE reg4, [saddr + 2*ebx] \ | |
__asm SSE3_MOVE reg1, [saddr + 2*ebx + 1] \ | |
__asm SSE_RMOVE reg5, reg4 \ | |
__asm SSE_RMOVE reg3, reg1 \ | |
__asm punpcklbw reg4, nullreg \ | |
__asm punpcklbw reg1, nullreg \ | |
__asm punpckhbw reg5, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm paddusw reg0, reg4 \ | |
__asm paddusw reg2, reg5 \ | |
__asm paddusw reg0, reg1 \ | |
__asm paddusw reg2, reg3 \ | |
__asm SSE3_MOVE reg1, [saddr + 2*ebx + 2] \ | |
__asm SSE_RMOVE reg3, reg1 \ | |
__asm punpcklbw reg1, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm paddusw reg0, reg1 \ | |
__asm paddusw reg2, reg3 \ | |
__asm paddusw reg0, flatconvolution_bias \ | |
__asm paddusw reg2, flatconvolution_bias \ | |
__asm pmulhuw reg0, onenineth \ | |
__asm pmulhuw reg2, onenineth \ | |
__asm packuswb reg0, reg2 \ | |
__asm SSE_MOVE [daddr + 1], reg0 | |
#define flatconvolution_w2(daddr, saddr, spitch, nullreg, onenineth, reg0, reg1, reg2, reg3, reg4, reg5) \ | |
__asm SSE3_MOVE reg0, [saddr] \ | |
__asm SSE3_MOVE reg1, [saddr + 1] \ | |
__asm SSE_RMOVE reg2, reg0 \ | |
__asm SSE_RMOVE reg3, reg1 \ | |
__asm punpcklbw reg0, nullreg \ | |
__asm punpcklbw reg1, nullreg \ | |
__asm punpckhbw reg2, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm paddusw reg0, reg1 \ | |
__asm paddusw reg2, reg3 \ | |
__asm SSE3_MOVE reg4, [saddr + 2] \ | |
__asm SSE3_MOVE reg1, [saddr + ebx] \ | |
__asm SSE_RMOVE reg5, reg4 \ | |
__asm SSE_RMOVE reg3, reg1 \ | |
__asm punpcklbw reg4, nullreg \ | |
__asm punpcklbw reg1, nullreg \ | |
__asm punpckhbw reg5, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm paddusw reg0, reg4 \ | |
__asm paddusw reg2, reg5 \ | |
__asm paddusw reg0, reg1 \ | |
__asm paddusw reg2, reg3 \ | |
__asm SSE3_MOVE reg4, [saddr + ebx + 1] \ | |
__asm SSE3_MOVE reg1, [saddr + ebx + 2] \ | |
__asm SSE_RMOVE reg5, reg4 \ | |
__asm SSE_MOVE [daddr + 1], reg1 \ | |
__asm SSE_RMOVE reg3, reg1 \ | |
__asm punpcklbw reg4, nullreg \ | |
__asm punpcklbw reg1, nullreg \ | |
__asm punpckhbw reg5, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm paddusw reg0, reg4 \ | |
__asm paddusw reg2, reg5 \ | |
__asm paddusw reg0, reg1 \ | |
__asm paddusw reg2, reg3 \ | |
__asm SSE3_MOVE reg4, [saddr + 2*ebx] \ | |
__asm SSE3_MOVE reg1, [saddr + 2*ebx + 1] \ | |
__asm SSE_RMOVE reg5, reg4 \ | |
__asm SSE_RMOVE reg3, reg1 \ | |
__asm punpcklbw reg4, nullreg \ | |
__asm punpcklbw reg1, nullreg \ | |
__asm punpckhbw reg5, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm paddusw reg0, reg4 \ | |
__asm paddusw reg2, reg5 \ | |
__asm paddusw reg0, reg1 \ | |
__asm paddusw reg2, reg3 \ | |
__asm SSE3_MOVE reg1, [saddr + 2*ebx + 2] \ | |
__asm SSE_RMOVE reg3, reg1 \ | |
__asm punpcklbw reg1, nullreg \ | |
__asm punpckhbw reg3, nullreg \ | |
__asm paddusw reg0, reg1 \ | |
__asm paddusw reg2, reg3 \ | |
__asm paddusw reg0, flatconvolution_bias \ | |
__asm paddusw reg2, flatconvolution_bias \ | |
__asm pmulhuw reg0, onenineth \ | |
__asm pmulhuw reg2, onenineth \ | |
__asm packuswb reg0, reg2 \ | |
__asm SSE_MOVE [daddr], reg0 | |
#define FLATBIAS 4 | |
static const __declspec(align(SSE_INCREMENT)) unsigned short flatconvolution_bias[SSE_INCREMENT/2] = | |
{ | |
FLATBIAS, FLATBIAS, FLATBIAS, FLATBIAS | |
#if SSE_INCREMENT == 16 | |
, FLATBIAS, FLATBIAS, FLATBIAS, FLATBIAS | |
#endif | |
}; | |
#define ONENINETH (unsigned short)(((1u << 16) + 4) / 9) | |
static const __declspec(align(SSE_INCREMENT)) unsigned short onenineth[SSE_INCREMENT/2] = | |
{ | |
ONENINETH, ONENINETH, ONENINETH, ONENINETH | |
#if SSE_INCREMENT == 16 | |
, ONENINETH, ONENINETH, ONENINETH, ONENINETH | |
#endif | |
}; | |
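// Reference sketch (illustrative only): pmulhuw computes (a * b) >> 16, so
// multiplying the biased 3x3 sum by ONENINETH = (65536 + 4) / 9 = 7282 is a
// fixed-point division by 9; on the reachable range (sum <= 9*255) it equals
// (sum + FLATBIAS) / 9 exactly and never exceeds 255. div9_ref is a
// hypothetical helper name.
static inline BYTE div9_ref(unsigned int sum)	// sum of the 3x3 window
{
	return (BYTE)(((sum + FLATBIAS) * (unsigned int)ONENINETH) >> 16);	// pmulhuw
}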
void SSE_RemoveGrain20(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
#ifdef CVERSION | |
_sp -= spitch; | |
int width = (hblocks + 2) * SSE_INCREMENT + remainder; | |
int spitch2 = spitch - width; | |
dpitch -= width; | |
do | |
{ | |
int w = width; | |
dp[0] = _sp[spitch]; | |
do | |
{ | |
*++dp = (BYTE)((_sp[0] + _sp[1] + _sp[2] + _sp[spitch] + _sp[spitch + 1] + _sp[spitch + 2] + _sp[2 * spitch] | |
+ _sp[2 * spitch + 1] + _sp[2 * spitch + 2] + 4) / 9); | |
++_sp; | |
} while( --w ); | |
dp[1] = _sp[spitch + 1]; | |
dp += dpitch; | |
_sp += spitch2; | |
} while( --height ); | |
#else | |
__asm SSE_RMOVE SSE7, onenineth | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
__asm pxor SSE6, SSE6 | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
__asm align 16 | |
__asm column_loop: | |
flatconvolution_w1(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5) | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
flatconvolution(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5) | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
flatconvolution_w2(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5) | |
__asm add esi, eax | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
#endif | |
} | |
#if ISSE > 1 | |
#define fconvolution(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5)\ | |
__asm SSE3_MOVE reg0, [saddr] \ | |
__asm SSE3_MOVE reg1, [saddr + 2] \ | |
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \ | |
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 2]\ | |
__asm pavgb reg0, reg1 \ | |
__asm pavgb reg2, reg3 \ | |
__asm SSE3_MOVE reg1, [saddr + 1] \ | |
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 1]\ | |
__asm pavgb reg0, reg1 \ | |
__asm SSE3_MOVE reg4, [saddr + spitch] \ | |
__asm SSE3_MOVE reg5, [saddr + spitch + 2]\ | |
__asm pavgb reg2, reg3 \ | |
__asm SSE3_MOVE reg1, [saddr + spitch + 1]\ | |
__asm pavgb reg0, reg2 \ | |
__asm pavgb reg4, reg5 \ | |
__asm psubusb reg0, bias_correction \ | |
__asm pavgb reg1, reg4 \ | |
__asm pavgb reg0, reg1 \ | |
__asm SSE_MOVE [daddr], reg0 | |
#define fconvolution_w1(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5)\ | |
__asm SSE3_MOVE reg0, [saddr] \ | |
__asm SSE3_MOVE reg1, [saddr + 2] \ | |
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \ | |
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 2]\ | |
__asm pavgb reg0, reg1 \ | |
__asm pavgb reg2, reg3 \ | |
__asm SSE3_MOVE reg1, [saddr + 1] \ | |
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 1]\ | |
__asm pavgb reg0, reg1 \ | |
__asm SSE3_MOVE reg4, [saddr + spitch] \ | |
__asm SSE3_MOVE reg5, [saddr + spitch + 2]\ | |
__asm pavgb reg2, reg3 \ | |
__asm movd [daddr], reg4 \ | |
__asm SSE3_MOVE reg1, [saddr + spitch + 1]\ | |
__asm pavgb reg0, reg2 \ | |
__asm pavgb reg4, reg5 \ | |
__asm psubusb reg0, bias_correction \ | |
__asm pavgb reg1, reg4 \ | |
__asm pavgb reg0, reg1 \ | |
__asm SSE_MOVE [daddr + 1], reg0 | |
#define fconvolution_w2(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5)\ | |
__asm SSE3_MOVE reg0, [saddr] \ | |
__asm SSE3_MOVE reg1, [saddr + 2] \ | |
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \ | |
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 2]\ | |
__asm pavgb reg0, reg1 \ | |
__asm pavgb reg2, reg3 \ | |
__asm SSE3_MOVE reg1, [saddr + 1] \ | |
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 1]\ | |
__asm pavgb reg0, reg1 \ | |
__asm SSE3_MOVE reg4, [saddr + spitch] \ | |
__asm SSE3_MOVE reg5, [saddr + spitch + 2]\ | |
__asm pavgb reg2, reg3 \ | |
__asm SSE3_MOVE reg1, [saddr + spitch + 1]\ | |
__asm pavgb reg0, reg2 \ | |
__asm pavgb reg4, reg5 \ | |
__asm psubusb reg0, bias_correction \ | |
__asm pavgb reg1, reg4 \ | |
__asm SSE_MOVE [daddr + 1], reg5 \ | |
__asm pavgb reg0, reg1 \ | |
__asm SSE_MOVE [daddr], reg0 | |
#else | |
#define fconvolution(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \ | |
__asm SSE3_MOVE reg0, [saddr] \ | |
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \ | |
__asm pavgb reg0, [saddr + 2] \ | |
__asm pavgb reg2, [saddr + 2*spitch + 2]\ | |
__asm pavgb reg0, [saddr + 1] \ | |
__asm pavgb reg2, [saddr + 2*spitch + 1]\ | |
__asm SSE3_MOVE reg4, [saddr + spitch] \ | |
__asm pavgb reg0, reg2 \ | |
__asm pavgb reg4, [saddr + spitch + 2]\ | |
__asm psubusb reg0, bias_correction \ | |
__asm pavgb reg4, [saddr + spitch + 1]\ | |
__asm pavgb reg0, reg4 \ | |
__asm SSE_MOVE [daddr], reg0 | |
#define fconvolution_w1(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \ | |
__asm SSE3_MOVE reg0, [saddr] \ | |
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \ | |
__asm pavgb reg0, [saddr + 2] \ | |
__asm pavgb reg2, [saddr + 2*spitch + 2]\ | |
__asm pavgb reg0, [saddr + 1] \ | |
__asm pavgb reg2, [saddr + 2*spitch + 1]\ | |
__asm SSE3_MOVE reg4, [saddr + spitch] \ | |
__asm pavgb reg0, reg2 \ | |
__asm movd [daddr], reg4 \ | |
__asm pavgb reg4, [saddr + spitch + 2]\ | |
__asm psubusb reg0, bias_correction \ | |
__asm pavgb reg4, [saddr + spitch + 1]\ | |
__asm pavgb reg0, reg4 \ | |
__asm SSE_MOVE [daddr + 1], reg0 | |
#define fconvolution_w2(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \ | |
__asm SSE3_MOVE reg0, [saddr] \ | |
__asm SSE3_MOVE reg2, [saddr + 2*spitch] \ | |
__asm pavgb reg0, [saddr + 2] \ | |
__asm pavgb reg2, [saddr + 2*spitch + 2]\ | |
__asm pavgb reg0, [saddr + 1] \ | |
__asm pavgb reg2, [saddr + 2*spitch + 1]\ | |
__asm SSE3_MOVE reg4, [saddr + spitch + 2]\ | |
__asm pavgb reg0, reg2 \ | |
__asm SSE_MOVE [daddr + 1], reg4 \ | |
__asm pavgb reg4, [saddr + spitch] \ | |
__asm psubusb reg0, bias_correction \ | |
__asm pavgb reg4, [saddr + spitch + 1]\ | |
__asm pavgb reg0, reg4 \ | |
__asm SSE_MOVE [daddr], reg0 | |
#endif // ISSE | |
static const __declspec(align(SSE_INCREMENT)) unsigned char fconvolution_bias[SSE_INCREMENT] = | |
{ | |
1,1,1,1,1,1,1,1 | |
#if SSE_INCREMENT == 16 | |
,1,1,1,1,1,1,1,1 | |
#endif | |
}; | |
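// Reference sketch (illustrative only): pavgb rounds up, and the cascade of
// averages in fconvolution would accumulate that upward bias; the psubusb
// with fconvolution_bias (all ones) before the final average compensates, so
// the SIMD path tracks the rounding of the CVERSION formula below.
// pavgb_ref is a hypothetical helper name.
static inline BYTE pavgb_ref(BYTE a, BYTE b)
{
	return (BYTE)((a + b + 1) >> 1);	// pavgb: average rounded up
}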
void SSE_RemoveGrain12(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
#ifdef CVERSION | |
_sp -= spitch; | |
int width = (hblocks + 2) * SSE_INCREMENT + remainder; | |
int spitch2 = spitch - width; | |
dpitch -= width; | |
do | |
{ | |
int w = width; | |
dp[0] = _sp[spitch]; | |
do | |
{ | |
*++dp = ((((_sp[0] + _sp[2] + 1) / 2 + _sp[1] + 1) / 2 + ((_sp[2*spitch] + _sp[2*spitch + 2] + 1) / 2 + _sp[2*spitch + 1] + 1) / 2 + 1)/2 | |
+ ((_sp[spitch] + _sp[spitch + 2] + 1) / 2 + _sp[spitch + 1] + 1) / 2) / 2; | |
++_sp; | |
} while( --w ); | |
dp[1] = _sp[spitch + 1]; | |
dp += dpitch; | |
_sp += spitch2; | |
} while( --height ); | |
#else | |
__asm SSE_RMOVE SSE7, fconvolution_bias | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
__asm pxor SSE6, SSE6 | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
__asm align 16 | |
__asm column_loop: | |
fconvolution_w1(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5) | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
fconvolution(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5) | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
fconvolution_w2(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5) | |
__asm add esi, eax | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
#endif | |
} | |
#if ISSE > 1 | |
#define rconvolution(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \ | |
__asm SSE3_MOVE reg0, [saddr] \ | |
__asm SSE3_MOVE reg2, [saddr + 2] \ | |
__asm SSE3_MOVE reg1, [saddr + 2*spitch] \ | |
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 2]\ | |
__asm pavgb reg0, reg2 \ | |
__asm pavgb reg1, reg3 \ | |
__asm SSE3_MOVE reg2, [saddr + spitch] \ | |
__asm SSE3_MOVE reg3, [saddr + 1] \ | |
__asm pavgb reg0, reg1 \ | |
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 1]\ | |
__asm SSE3_MOVE reg5, [saddr + spitch + 2]\ | |
__asm pavgb reg2, reg4 \ | |
__asm pavgb reg3, reg5 \ | |
__asm psubusb reg0, bias_correction \ | |
__asm pavgb reg2, reg3 \ | |
__asm pavgb reg0, reg2 \ | |
__asm SSE_MOVE [daddr], reg0 | |
#define rconvolution_w1(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \ | |
__asm SSE3_MOVE reg0, [saddr] \ | |
__asm SSE3_MOVE reg2, [saddr + 2] \ | |
__asm SSE3_MOVE reg1, [saddr + 2*spitch] \ | |
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 2]\ | |
__asm pavgb reg0, reg2 \ | |
__asm pavgb reg1, reg3 \ | |
__asm SSE3_MOVE reg2, [saddr + spitch] \ | |
__asm SSE3_MOVE reg3, [saddr + 1] \ | |
__asm pavgb reg0, reg1 \ | |
__asm movd [daddr], reg2 \ | |
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 1]\ | |
__asm SSE3_MOVE reg5, [saddr + spitch + 2]\ | |
__asm pavgb reg2, reg4 \ | |
__asm pavgb reg3, reg5 \ | |
__asm psubusb reg0, bias_correction \ | |
__asm pavgb reg2, reg3 \ | |
__asm pavgb reg0, reg2 \ | |
__asm SSE_MOVE [daddr + 1], reg0 | |
#define rconvolution_w2(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5) \ | |
__asm SSE3_MOVE reg0, [saddr] \ | |
__asm SSE3_MOVE reg2, [saddr + 2] \ | |
__asm SSE3_MOVE reg1, [saddr + 2*spitch] \ | |
__asm SSE3_MOVE reg3, [saddr + 2*spitch + 2]\ | |
__asm pavgb reg0, reg2 \ | |
__asm pavgb reg1, reg3 \ | |
__asm SSE3_MOVE reg2, [saddr + spitch] \ | |
__asm SSE3_MOVE reg3, [saddr + 1] \ | |
__asm pavgb reg0, reg1 \ | |
__asm SSE3_MOVE reg4, [saddr + 2*spitch + 1]\ | |
__asm SSE3_MOVE reg5, [saddr + spitch + 2]\ | |
__asm pavgb reg2, reg4 \ | |
__asm pavgb reg3, reg5 \ | |
__asm SSE_MOVE [daddr + 1], reg5 \ | |
__asm pavgb reg2, reg3 \ | |
__asm psubusb reg0, bias_correction \ | |
__asm pavgb reg0, reg2 \ | |
__asm SSE_MOVE [daddr], reg0 | |
#else | |
#define rconvolution(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5)\ | |
__asm SSE3_MOVE reg0, [saddr] \ | |
__asm SSE3_MOVE reg1, [saddr + 2*spitch] \ | |
__asm pavgb reg0, [saddr + 2] \ | |
__asm pavgb reg1, [saddr + 2*spitch + 2]\ | |
__asm SSE3_MOVE reg2, [saddr + spitch] \ | |
__asm SSE3_MOVE reg3, [saddr + 1] \ | |
__asm pavgb reg0, reg1 \ | |
__asm pavgb reg2, [saddr + 2*spitch + 1]\ | |
__asm pavgb reg3, [saddr + spitch + 2]\ | |
__asm psubusb reg0, bias_correction \ | |
__asm pavgb reg2, reg3 \ | |
__asm pavgb reg0, reg2 \ | |
__asm SSE_MOVE [daddr], reg0 | |
#define rconvolution_w1(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5)\ | |
__asm SSE3_MOVE reg0, [saddr] \ | |
__asm SSE3_MOVE reg1, [saddr + 2*spitch] \ | |
__asm pavgb reg0, [saddr + 2] \ | |
__asm pavgb reg1, [saddr + 2*spitch + 2]\ | |
__asm SSE3_MOVE reg2, [saddr + spitch] \ | |
__asm SSE3_MOVE reg3, [saddr + 1] \ | |
__asm movd [daddr], reg2 \ | |
__asm pavgb reg0, reg1 \ | |
__asm pavgb reg2, [saddr + 2*spitch + 1]\ | |
__asm pavgb reg3, [saddr + spitch + 2]\ | |
__asm psubusb reg0, bias_correction \ | |
__asm pavgb reg2, reg3 \ | |
__asm pavgb reg0, reg2 \ | |
__asm SSE_MOVE [daddr + 1], reg0 | |
#define rconvolution_w2(daddr, saddr, spitch, nullreg, bias_correction, reg0, reg1, reg2, reg3, reg4, reg5)\ | |
__asm SSE3_MOVE reg0, [saddr] \ | |
__asm SSE3_MOVE reg1, [saddr + 2*spitch] \ | |
__asm pavgb reg0, [saddr + 2] \ | |
__asm pavgb reg1, [saddr + 2*spitch + 2]\ | |
__asm SSE3_MOVE reg2, [saddr + spitch] \ | |
__asm SSE3_MOVE reg3, [saddr + spitch + 2]\ | |
__asm pavgb reg0, reg1 \ | |
__asm SSE_MOVE [daddr + 1], reg3 \ | |
__asm pavgb reg2, [saddr + 2*spitch + 1]\ | |
__asm pavgb reg3, [saddr + 1] \ | |
__asm psubusb reg0, bias_correction \ | |
__asm pavgb reg2, reg3 \ | |
__asm pavgb reg0, reg2 \ | |
__asm SSE_MOVE [daddr], reg0 | |
#endif | |
#define rbias 1 // pavgb rounds up; a per-byte bias of 1 compensates the accumulated rounding
static const __declspec(align(SSE_INCREMENT)) unsigned char rconvolution_bias[SSE_INCREMENT] = | |
{ | |
rbias, rbias, rbias, rbias, rbias, rbias, rbias, rbias | |
#if SSE_INCREMENT == 16 | |
, rbias, rbias, rbias, rbias, rbias, rbias, rbias, rbias | |
#endif | |
}; | |
void SSE_RemoveGrain19(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
#ifdef CVERSION | |
_sp -= spitch; | |
int width = (hblocks + 2) * SSE_INCREMENT + remainder; | |
int spitch2 = spitch - width; | |
dpitch -= width; | |
do | |
{ | |
int w = width; | |
dp[0] = _sp[spitch]; | |
do | |
{ | |
// unweighted average of the eight neighbours
*++dp = (_sp[0] + _sp[1] + _sp[2] + _sp[spitch] + _sp[spitch + 2]
+ _sp[2*spitch] + _sp[2*spitch + 1] + _sp[2*spitch + 2] + 4) / 8;
++_sp; | |
} while( --w ); | |
dp[1] = _sp[spitch + 1]; | |
dp += dpitch; | |
_sp += spitch2; | |
} while( --height ); | |
#else | |
__asm SSE_RMOVE SSE7, rconvolution_bias | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
__asm pxor SSE6, SSE6 | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
__asm lea eax, [ebx + eax + 1] | |
__asm align 16 | |
__asm column_loop: | |
rconvolution_w1(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5) | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
__asm align 16 | |
__asm middle_loop: | |
rconvolution(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5) | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
rconvolution_w2(edi, esi, ebx, SSE6, SSE7, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5) | |
__asm add esi, eax | |
__asm add edi, dpitch | |
__asm dec height | |
__asm jnz column_loop | |
#endif | |
} | |
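// Helper macros for the mode 23/24 kernels. For one pair of opposing neighbours
// (p1, p2) around the centre pixel:
//   DeringA   ldiff = min(max(min(p1,p2) - center, 0), max(p1,p2) - min(p1,p2))
//             udiff = min(max(center - max(p1,p2), 0), max(p1,p2) - min(p1,p2)),
//             i.e. how far the centre undershoots/overshoots the pair, capped by
//             the pair's own range
//   DeringA_w same, but also stores the second operand (border handling)
//   DeringM   applies the corrections accumulated over all pairs:
//             center = center - udiff + ldiff (reg1 is unused, kept for symmetry)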
#define DeringA(ldiff, udiff, center, p1, p2, reg1, reg2) \ | |
__asm SSE3_MOVE reg2, p1 \ | |
__asm SSE3_MOVE reg1, p2 \ | |
__asm SSE_RMOVE ldiff, reg2 \ | |
__asm SSE_RMOVE udiff, center \ | |
__asm pmaxub reg2, reg1 \ | |
__asm pminub ldiff, reg1 \ | |
__asm psubusb udiff, reg2 \ | |
__asm psubusb reg2, ldiff \ | |
__asm psubusb ldiff, center \ | |
__asm pminub udiff, reg2 \ | |
__asm pminub ldiff, reg2 | |
#define DeringA_w(ldiff, udiff, center, p1, p2, mwrite, wmem, reg1, reg2) \ | |
__asm SSE3_MOVE reg2, p1 \ | |
__asm SSE3_MOVE reg1, p2 \ | |
__asm SSE_RMOVE ldiff, reg2 \ | |
__asm SSE_RMOVE udiff, center \ | |
__asm pmaxub reg2, reg1 \ | |
__asm pminub ldiff, reg1 \ | |
__asm psubusb udiff, reg2 \ | |
__asm mwrite wmem, reg1 \ | |
__asm psubusb reg2, ldiff \ | |
__asm psubusb ldiff, center \ | |
__asm pminub udiff, reg2 \ | |
__asm pminub ldiff, reg2 | |
#define DeringM(ldiff, udiff, center, reg1) \ | |
__asm psubusb center, udiff \ | |
__asm paddusb center, ldiff | |
void SSE_RemoveGrain23(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
__asm lea eax, [ebx + eax + 1] | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
DeringA(SSE1, SSE2, SSE0, [esi], [esi + 2*ebx + 2], SSE5, SSE6) | |
DeringA(SSE3, SSE4, SSE0, [esi + 2], [esi + 2*ebx], SSE5, SSE6) | |
__asm pmaxub SSE2, SSE4 | |
__asm pmaxub SSE1, SSE3 | |
DeringA_w(SSE3, SSE4, SSE0, [esi + ebx + 2], [esi + ebx], movd, [edi], SSE5, SSE6) | |
__asm pmaxub SSE2, SSE4 | |
__asm pmaxub SSE1, SSE3 | |
DeringA(SSE3, SSE4, SSE0, [esi + 1], [esi + 2*ebx + 1], SSE5, SSE6) | |
__asm pmaxub SSE2, SSE4 | |
__asm pmaxub SSE1, SSE3 | |
DeringM(SSE1, SSE2, SSE0, SSE5) | |
__asm SSE_MOVE [edi + 1], SSE0 | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
__asm align 16 | |
__asm middle_loop: | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
DeringA(SSE1, SSE2, SSE0, [esi], [esi + 2*ebx + 2], SSE5, SSE6) | |
DeringA(SSE3, SSE4, SSE0, [esi + 2], [esi + 2*ebx], SSE5, SSE6) | |
__asm pmaxub SSE2, SSE4 | |
__asm pmaxub SSE1, SSE3 | |
DeringA(SSE3, SSE4, SSE0, [esi + ebx], [esi + ebx + 2], SSE5, SSE6) | |
__asm pmaxub SSE2, SSE4 | |
__asm pmaxub SSE1, SSE3 | |
DeringA(SSE3, SSE4, SSE0, [esi + 1], [esi + 2*ebx + 1], SSE5, SSE6) | |
__asm pmaxub SSE2, SSE4 | |
__asm pmaxub SSE1, SSE3 | |
DeringM(SSE1, SSE2, SSE0, SSE5) | |
__asm SSE_MOVE [edi], SSE0 | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
DeringA(SSE1, SSE2, SSE0, [esi], [esi + 2*ebx + 2], SSE5, SSE6) | |
DeringA(SSE3, SSE4, SSE0, [esi + 2], [esi + 2*ebx], SSE5, SSE6) | |
__asm pmaxub SSE2, SSE4 | |
__asm pmaxub SSE1, SSE3 | |
DeringA_w(SSE3, SSE4, SSE0, [esi + ebx], [esi + ebx + 2], SSE_MOVE, [edi + 1], SSE5, SSE6) | |
__asm pmaxub SSE2, SSE4 | |
__asm pmaxub SSE1, SSE3 | |
DeringA(SSE3, SSE4, SSE0, [esi + 1], [esi + 2*ebx + 1], SSE5, SSE6) | |
__asm pmaxub SSE2, SSE4 | |
__asm pmaxub SSE1, SSE3 | |
DeringM(SSE1, SSE2, SSE0, SSE5) | |
__asm SSE_MOVE [edi], SSE0 | |
__asm add esi, eax | |
__asm add edi, dpitch | |
__asm dec height | |
__asm jnz column_loop | |
} | |
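// Reference sketch (kept out of the build): a plain C++ rendering of the per-pixel
// kernel of SSE_RemoveGrain23 above, assuming 8-bit planar data with sp pointing at
// the top-left of the 3x3 window. The function name rg23_pixel is illustrative only.
#if 0
static inline BYTE rg23_pixel(const BYTE *sp, int spitch)
{
    const int c = sp[spitch + 1];
    const int pairs[4][2] =
    {
        { sp[0],          sp[2*spitch + 2] },   // NW / SE
        { sp[2],          sp[2*spitch]     },   // NE / SW
        { sp[spitch + 2], sp[spitch]       },   // E  / W
        { sp[1],          sp[2*spitch + 1] }    // N  / S
    };
    int u = 0, d = 0;
    for (int i = 0; i < 4; ++i)
    {
        int mx = pairs[i][0] > pairs[i][1] ? pairs[i][0] : pairs[i][1];
        int mn = pairs[i][0] + pairs[i][1] - mx;
        int range = mx - mn;
        int ui = c - mx; if (ui < 0) ui = 0; if (ui > range) ui = range;
        int di = mn - c; if (di < 0) di = 0; if (di > range) di = range;
        if (ui > u) u = ui;
        if (di > d) d = di;
    }
    return (BYTE)(c - u + d);   // pull overshoots down, undershoots up
}
#endif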
#define DeringC(ldiff, udiff, center, p1, p2, reg1, reg2) \ | |
__asm SSE3_MOVE reg2, p1 \ | |
__asm SSE3_MOVE reg1, p2 \ | |
__asm SSE_RMOVE ldiff, reg2 \ | |
__asm SSE_RMOVE udiff, center \ | |
__asm pmaxub reg2, reg1 \ | |
__asm pminub ldiff, reg1 \ | |
__asm psubusb udiff, reg2 \ | |
__asm psubusb reg2, ldiff \ | |
__asm psubusb ldiff, center \ | |
__asm SSE_RMOVE reg1, reg2 \ | |
__asm psubusb reg2, udiff \ | |
__asm psubusb reg1, ldiff \ | |
__asm pminub udiff, reg2 \ | |
__asm pminub ldiff, reg1 | |
#define DeringC_w(ldiff, udiff, center, p1, p2, mwrite, wmem, reg1, reg2) \ | |
__asm SSE3_MOVE reg2, p1 \ | |
__asm SSE3_MOVE reg1, p2 \ | |
__asm SSE_RMOVE ldiff, reg2 \ | |
__asm SSE_RMOVE udiff, center \ | |
__asm pmaxub reg2, reg1 \ | |
__asm pminub ldiff, reg1 \ | |
__asm psubusb udiff, reg2 \ | |
__asm mwrite wmem, reg1 \ | |
__asm psubusb reg2, ldiff \ | |
__asm psubusb ldiff, center \ | |
__asm SSE_RMOVE reg1, reg2 \ | |
__asm psubusb reg2, udiff \ | |
__asm psubusb reg1, ldiff \ | |
__asm pminub udiff, reg2 \ | |
__asm pminub ldiff, reg1 | |
void SSE_RemoveGrain24(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
__asm lea eax, [ebx + eax + 1] | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
DeringC(SSE1, SSE2, SSE0, [esi], [esi + 2*ebx + 2], SSE5, SSE6) | |
DeringC(SSE3, SSE4, SSE0, [esi + 2], [esi + 2*ebx], SSE5, SSE6) | |
__asm pmaxub SSE2, SSE4 | |
__asm pmaxub SSE1, SSE3 | |
DeringC_w(SSE3, SSE4, SSE0, [esi + ebx + 2], [esi + ebx], movd, [edi], SSE5, SSE6) | |
__asm pmaxub SSE2, SSE4 | |
__asm pmaxub SSE1, SSE3 | |
DeringC(SSE3, SSE4, SSE0, [esi + 1], [esi + 2*ebx + 1], SSE5, SSE6) | |
__asm pmaxub SSE2, SSE4 | |
__asm pmaxub SSE1, SSE3 | |
DeringM(SSE1, SSE2, SSE0, SSE5) | |
__asm SSE_MOVE [edi + 1], SSE0 | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
__asm align 16 | |
__asm middle_loop: | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
DeringC(SSE1, SSE2, SSE0, [esi], [esi + 2*ebx + 2], SSE5, SSE6) | |
DeringC(SSE3, SSE4, SSE0, [esi + 2], [esi + 2*ebx], SSE5, SSE6) | |
__asm pmaxub SSE2, SSE4 | |
__asm pmaxub SSE1, SSE3 | |
DeringC(SSE3, SSE4, SSE0, [esi + ebx], [esi + ebx + 2], SSE5, SSE6) | |
__asm pmaxub SSE2, SSE4 | |
__asm pmaxub SSE1, SSE3 | |
DeringC(SSE3, SSE4, SSE0, [esi + 1], [esi + 2*ebx + 1], SSE5, SSE6) | |
__asm pmaxub SSE2, SSE4 | |
__asm pmaxub SSE1, SSE3 | |
DeringM(SSE1, SSE2, SSE0, SSE5) | |
__asm SSE_MOVE [edi], SSE0 | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
DeringC(SSE1, SSE2, SSE0, [esi], [esi + 2*ebx + 2], SSE5, SSE6) | |
DeringC(SSE3, SSE4, SSE0, [esi + 2], [esi + 2*ebx], SSE5, SSE6) | |
__asm pmaxub SSE2, SSE4 | |
__asm pmaxub SSE1, SSE3 | |
DeringC_w(SSE3, SSE4, SSE0, [esi + ebx], [esi + ebx + 2], SSE_MOVE, [edi + 1], SSE5, SSE6) | |
__asm pmaxub SSE2, SSE4 | |
__asm pmaxub SSE1, SSE3 | |
DeringC(SSE3, SSE4, SSE0, [esi + 1], [esi + 2*ebx + 1], SSE5, SSE6) | |
__asm pmaxub SSE2, SSE4 | |
__asm pmaxub SSE1, SSE3 | |
DeringM(SSE1, SSE2, SSE0, SSE5) | |
__asm SSE_MOVE [edi], SSE0 | |
__asm add esi, eax | |
__asm add edi, dpitch | |
__asm dec height | |
__asm jnz column_loop | |
} | |
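// Reference sketch (kept out of the build): the per-pixel kernel of SSE_RemoveGrain24.
// DeringC differs from DeringA only in that each correction is additionally limited
// to range - correction, which makes mode 24 more conservative than mode 23.
// The function name rg24_pixel is illustrative only.
#if 0
static inline BYTE rg24_pixel(const BYTE *sp, int spitch)
{
    const int c = sp[spitch + 1];
    const int pairs[4][2] =
    {
        { sp[0],          sp[2*spitch + 2] },   // NW / SE
        { sp[2],          sp[2*spitch]     },   // NE / SW
        { sp[spitch + 2], sp[spitch]       },   // E  / W
        { sp[1],          sp[2*spitch + 1] }    // N  / S
    };
    int u = 0, d = 0;
    for (int i = 0; i < 4; ++i)
    {
        int mx = pairs[i][0] > pairs[i][1] ? pairs[i][0] : pairs[i][1];
        int mn = pairs[i][0] + pairs[i][1] - mx;
        int range = mx - mn;
        int ui = c - mx; if (ui < 0) ui = 0;
        int t = range - ui; if (t < 0) t = 0; if (ui > t) ui = t;
        int di = mn - c; if (di < 0) di = 0;
        t = range - di; if (t < 0) t = 0; if (di > t) di = t;
        if (ui > u) u = ui;
        if (di > d) d = di;
    }
    return (BYTE)(c - u + d);
}
#endif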
#define get_min_weightw1(min, weight, mem1, mem2, wmem, reg) \ | |
__asm SSE3_MOVE min, mem1 \ | |
__asm SSE3_MOVE reg, mem2 \ | |
__asm movd wmem, min \ | |
__asm SSE_RMOVE weight, min \ | |
__asm pminub min, reg \ | |
__asm pmaxub weight, reg \ | |
__asm psubusb weight, min | |
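// get_min_weightw1 is a border variant of get_min_weight: it additionally stores
// the first operand with movd before computing the pair's min and absolute difference.
// WeirdBob interpolates the missing field lines of a bob: for each new pixel it
// examines three opposing pairs from the lines above and below (diagonal left,
// diagonal right, vertical), picks the pair with the smallest absolute difference
// and writes that pair's rounded average (see the CVERSION branch below).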
void WeirdBob(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
#ifdef CVERSION | |
int width = (hblocks + 2) * SSE_INCREMENT + remainder; | |
int spitch2 = 2*spitch - width - 1, dpitch2 = 2*dpitch - width - 1; | |
_sp -= spitch; | |
do | |
{ | |
dp[0] = (BYTE)(((unsigned)_sp[0] + (unsigned)(dp[dpitch] = _sp[2*spitch]) + 1) / 2); | |
++dp; | |
int w = width; | |
do | |
{ | |
unsigned weight1, min1, weight2, min2; | |
min1 = _sp[0]; | |
weight1 = _sp[2*spitch + 2]; | |
if( weight1 < min1 ) | |
{ | |
min1 = weight1; | |
weight1 = _sp[0]; | |
} | |
weight1 -= min1; | |
min2 = _sp[2]; | |
weight2 = _sp[2*spitch]; | |
if( weight2 < min2 ) | |
{ | |
min2 = weight2; | |
weight2 = _sp[2]; | |
} | |
weight2 -= min2; | |
if( weight2 <= weight1 ) | |
{ | |
weight1 = weight2; | |
min1 = min2; | |
} | |
++_sp; | |
min2 = _sp[0]; | |
weight2 = dp[dpitch] = _sp[2*spitch]; | |
if( weight2 < min2 ) | |
{ | |
min2 = weight2; | |
weight2 = _sp[0]; | |
} | |
weight2 -= min2; | |
if( weight2 <= weight1 ) | |
{ | |
weight1 = weight2; | |
min1 = min2; | |
} | |
dp[0] = (BYTE) (min1 + (weight1 + 1)/2); | |
++dp; | |
} while( --w ); | |
++_sp; | |
dp[0] = (BYTE)(((unsigned)_sp[0] + (unsigned)(dp[dpitch] = _sp[2*spitch]) + 1)/2); | |
dp += dpitch2; | |
_sp += spitch2; | |
} while( --height ); | |
#else // CVERSION | |
__asm mov ecx, incpitch | |
__asm mov eax, dpitch | |
__asm lea ebx, [2*eax + ecx] | |
__asm mov edx, remainder | |
__asm mov dpitch, ebx | |
__asm mov esi, _sp | |
__asm mov ebx, spitch | |
__asm mov edi, dp | |
__asm sub esi, ebx | |
__asm add ebx, ebx | |
__asm add ecx, ebx | |
__asm mov spitch, ecx | |
__asm mov ecx, hblocks | |
__asm align 16 | |
__asm column_loop: | |
get_min_weight(SSE0, SSE1, SSE7, [esi + ebx + 2], [esi], SSE7) | |
get_min_weightw(SSE2, SSE3, SSE7, [esi + 2], [esi + ebx], [edi + eax], SSE6) | |
__asm pavgb SSE6, SSE7 | |
mergeweighted(SSE0, SSE1, SSE2, SSE3) | |
__asm movd [edi], SSE6 | |
get_min_weight(SSE2, SSE3, SSE7, [esi + ebx + 1], [esi + 1], SSE7) | |
mergeweighted(SSE0, SSE1, SSE2, SSE3) | |
__asm paddusb SSE1, SSE0 | |
__asm pavgb SSE0, SSE1 | |
__asm SSE_MOVE [edi + 1], SSE0 | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm mov ecx, hblocks | |
__asm add edi, SSE_INCREMENT | |
__asm align 16 | |
__asm middle_loop: | |
get_min_weight(SSE0, SSE1, SSE7, [esi], [esi + ebx + 2], SSE7) | |
get_min_weightw(SSE2, SSE3, SSE7, [esi + 2], [esi + ebx], [edi + eax], SSE7) | |
mergeweighted(SSE0, SSE1, SSE2, SSE3) | |
get_min_weight(SSE2, SSE3, SSE7, [esi + 1], [esi + ebx + 1], SSE7) | |
mergeweighted(SSE0, SSE1, SSE2, SSE3) | |
__asm paddusb SSE1, SSE0 | |
__asm pavgb SSE0, SSE1 | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi + 1], SSE0 | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
get_min_weightw(SSE0, SSE1, SSE7, [esi], [esi + ebx + 2], [edi + eax + 2], SSE7) | |
get_min_weightw1(SSE2, SSE3, [esi + ebx], [esi + 2], [edi + eax], SSE6) | |
__asm pavgb SSE6, SSE7 | |
mergeweighted(SSE0, SSE1, SSE2, SSE3) | |
__asm SSE_MOVE [edi + 2], SSE6 | |
get_min_weight(SSE2, SSE3, SSE7, [esi + ebx + 1], [esi + 1], SSE7) | |
mergeweighted(SSE0, SSE1, SSE2, SSE3) | |
__asm paddusb SSE1, SSE0 | |
__asm pavgb SSE0, SSE1 | |
__asm add esi, spitch | |
__asm SSE_MOVE [edi + 1], SSE0 | |
__asm add edi, dpitch | |
__asm dec height | |
__asm jnz column_loop | |
#endif // CVERSION | |
} | |
void bob_top(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
// the first line has no field line above it: copy it unchanged, then bob the rest
memcpy(dp, _sp, (hblocks + 2) * SSE_INCREMENT + remainder + 2);
WeirdBob(dp + dpitch, dpitch, _sp + spitch, spitch, hblocks, remainder, incpitch, height / 2); | |
} | |
void bob_bottom(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
WeirdBob(dp, dpitch, _sp, spitch, hblocks, remainder, incpitch, (height + 1)/2); | |
} | |
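// get_min_weight_average computes, for its two source operands, their minimum (min),
// their absolute difference (weight) and their rounded-up average (average);
// reg is left holding the second operand.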
#define get_min_weight_average(min, weight, average, mem1, mem2, reg) \ | |
__asm SSE3_MOVE min, mem1 \ | |
__asm SSE3_MOVE reg, mem2 \ | |
__asm SSE_RMOVE weight, min \ | |
__asm SSE_RMOVE average, min \ | |
__asm pmaxub weight, reg \ | |
__asm pminub min, reg \ | |
__asm pavgb average, reg \ | |
__asm psubusb weight, min | |
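// SmartBob: like WeirdBob it selects the opposing pair with the smallest absolute
// difference, but the written pixel is the weighted neighbourhood average
// (diagonal pairs once, the vertical pair twice: (a+b+c+d + 2*(e+f) + 4)/8)
// clamped to the [min, max] of the selected pair (see the CVERSION branch below).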
void SmartBob(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
#ifdef CVERSION | |
int width = (hblocks + 2) * SSE_INCREMENT + remainder; | |
int spitch2 = 2*spitch - width - 1, dpitch2 = 2*dpitch - width - 1; | |
_sp -= spitch; | |
do | |
{ | |
dp[0] = (BYTE)(((unsigned)_sp[0] + (unsigned)(dp[dpitch] = _sp[2*spitch]) + 1) / 2); | |
++dp; | |
int w = width; | |
do | |
{ | |
#if 1 | |
unsigned weight1, min1, weight2, min2, average; | |
min1 = _sp[0]; | |
weight1 = _sp[2*spitch + 2]; | |
if( weight1 < min1 ) | |
{ | |
min1 = weight1; | |
weight1 = _sp[0]; | |
} | |
average = weight1 + min1; | |
weight1 -= min1; | |
min2 = _sp[2]; | |
weight2 = _sp[2*spitch]; | |
if( weight2 < min2 ) | |
{ | |
min2 = weight2; | |
weight2 = _sp[2]; | |
} | |
average += min2 + weight2; | |
weight2 -= min2; | |
if( weight2 <= weight1 ) | |
{ | |
weight1 = weight2; | |
min1 = min2; | |
} | |
++_sp; | |
min2 = _sp[0]; | |
weight2 = dp[dpitch] = _sp[2*spitch]; | |
if( weight2 < min2 ) | |
{ | |
min2 = weight2; | |
weight2 = _sp[0]; | |
} | |
average += 2*(weight2 + min2); | |
weight2 -= min2; | |
if( weight2 <= weight1 ) | |
{ | |
weight1 = weight2; | |
min1 = min2; | |
} | |
average = (average + 4)/8; | |
weight1 += min1; | |
if( weight1 < average ) average = weight1; | |
else if( min1 > average ) average = min1; | |
dp[0] = (BYTE) average; | |
++dp; | |
#else | |
unsigned weight1, min1, weight2, min2, average; | |
min1 = _sp[0]; | |
weight1 = _sp[2*spitch + 2]; | |
if( weight1 < min1 ) | |
{ | |
min1 = weight1; | |
weight1 = _sp[0]; | |
} | |
average = (weight1 + min1 + 1)/2; | |
weight1 -= min1; | |
min2 = _sp[2]; | |
weight2 = _sp[2*spitch]; | |
if( weight2 < min2 ) | |
{ | |
min2 = weight2; | |
weight2 = _sp[2]; | |
} | |
average = (average + (min2 + weight2 + 1)/2 + 1)/2 - 1; | |
if( (int) average < 0 ) average = 0; | |
weight2 -= min2; | |
if( weight2 <= weight1 ) | |
{ | |
weight1 = weight2; | |
min1 = min2; | |
} | |
++_sp; | |
min2 = _sp[0]; | |
weight2 = dp[dpitch] = _sp[2*spitch]; | |
if( weight2 < min2 ) | |
{ | |
min2 = weight2; | |
weight2 = _sp[0]; | |
} | |
average = (average + (weight2 + min2 + 1)/2 + 1)/2; | |
weight2 -= min2; | |
if( weight2 <= weight1 ) | |
{ | |
weight1 = weight2; | |
min1 = min2; | |
} | |
weight1 += min1; | |
if( weight1 < average ) average = weight1; | |
else if( min1 > average ) average = min1; | |
dp[0] = (BYTE) average; | |
++dp; | |
#endif | |
} while( --w ); | |
++_sp; | |
dp[0] = (BYTE)(((unsigned)_sp[0] + (unsigned)(dp[dpitch] = _sp[2*spitch]) + 1)/2); | |
dp += dpitch2; | |
_sp += spitch2; | |
} while( --height ); | |
#else // CVERSION | |
__asm mov ecx, incpitch | |
__asm mov eax, dpitch | |
__asm lea ebx, [2*eax + ecx] | |
__asm mov edx, remainder | |
__asm mov dpitch, ebx | |
__asm mov esi, _sp | |
__asm mov ebx, spitch | |
__asm mov edi, dp | |
__asm sub esi, ebx | |
__asm add ebx, ebx | |
__asm SSE_RMOVE SSE6, fconvolution_bias // rounding-bias constant used for the bias correction below
__asm add ecx, ebx | |
__asm mov spitch, ecx | |
__asm mov ecx, hblocks | |
__asm align 16 | |
__asm column_loop: | |
get_min_weight_average(SSE0, SSE1, SSE2, [esi + ebx + 2], [esi], SSE7) | |
__asm SSE3_MOVE SSE4, [esi + ebx] | |
__asm SSE3_MOVE SSE5, [esi + 2] | |
__asm pavgb SSE7, SSE4 | |
__asm SSE_RMOVE SSE3, SSE4 | |
__asm movd [edi], SSE7 | |
__asm SSE_RMOVE SSE7, SSE4 | |
__asm pminub SSE3, SSE5 | |
__asm SSE_MOVE [edi + eax], SSE7 | |
__asm pmaxub SSE4, SSE5 | |
__asm pavgb SSE5, SSE7 | |
__asm psubusb SSE4, SSE3 | |
__asm pavgb SSE2, SSE5 | |
mergeweighted(SSE0, SSE1, SSE3, SSE4) | |
__asm psubusb SSE2, SSE6 // bias correction | |
get_min_weight_average(SSE3, SSE4, SSE5, [esi + ebx + 1], [esi + 1], SSE7) | |
__asm pavgb SSE2, SSE5 | |
mergeweighted(SSE0, SSE1, SSE3, SSE4) | |
__asm pmaxub SSE2, SSE0 | |
__asm paddusb SSE1, SSE0 | |
__asm pminub SSE2, SSE1 | |
__asm SSE_MOVE [edi + 1], SSE2 | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm mov ecx, hblocks | |
__asm add edi, SSE_INCREMENT | |
__asm align 16 | |
__asm middle_loop: | |
get_min_weight_average(SSE0, SSE1, SSE2, [esi + ebx + 2], [esi], SSE7) | |
get_min_weight_average(SSE3, SSE4, SSE5, [esi + 2], [esi + ebx], SSE7) | |
__asm SSE_MOVE [edi + eax], SSE7 | |
__asm pavgb SSE2, SSE5 | |
mergeweighted(SSE0, SSE1, SSE3, SSE4) | |
__asm psubusb SSE2, SSE6 // bias correction | |
get_min_weight_average(SSE3, SSE4, SSE5, [esi + 1], [esi + ebx + 1], SSE7) | |
__asm pavgb SSE2, SSE5 | |
mergeweighted(SSE0, SSE1, SSE3, SSE4) | |
__asm pmaxub SSE2, SSE0 | |
__asm paddusb SSE1, SSE0 | |
__asm pminub SSE2, SSE1 | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi + 1], SSE2 | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
get_min_weight_average(SSE0, SSE1, SSE2, [esi], [esi + ebx + 2], SSE7) | |
__asm SSE_MOVE [edi + eax + 2], SSE7 | |
__asm SSE3_MOVE SSE4, [esi + 2] | |
__asm SSE3_MOVE SSE5, [esi + ebx] | |
__asm pavgb SSE7, SSE4 | |
__asm SSE_RMOVE SSE3, SSE4 | |
__asm SSE_MOVE [edi + 2], SSE7 | |
__asm SSE_RMOVE SSE7, SSE4 | |
__asm pminub SSE3, SSE5 | |
__asm movd [edi + eax], SSE5 | |
__asm pmaxub SSE4, SSE5 | |
__asm pavgb SSE5, SSE7 | |
__asm psubusb SSE4, SSE3 | |
__asm pavgb SSE2, SSE5 | |
mergeweighted(SSE0, SSE1, SSE3, SSE4) | |
__asm psubusb SSE2, SSE6 // bias correction | |
get_min_weight_average(SSE3, SSE4, SSE5, [esi + ebx + 1], [esi + 1], SSE7) | |
__asm pavgb SSE2, SSE5 | |
mergeweighted(SSE0, SSE1, SSE3, SSE4) | |
__asm pmaxub SSE2, SSE0 | |
__asm paddusb SSE1, SSE0 | |
__asm pminub SSE2, SSE1 | |
__asm add esi, spitch | |
__asm SSE_MOVE [edi + 1], SSE2 | |
__asm add edi, dpitch | |
__asm dec height | |
__asm jnz column_loop | |
#endif // CVERSION | |
} | |
void smartbob_top(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
// the first line has no field line above it: copy it unchanged, then bob the rest
memcpy(dp, _sp, (hblocks + 2) * SSE_INCREMENT + remainder + 2);
SmartBob(dp + dpitch, dpitch, _sp + spitch, spitch, hblocks, remainder, incpitch, height / 2); | |
} | |
void smartbob_bottom(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
SmartBob(dp, dpitch, _sp, spitch, hblocks, remainder, incpitch, (height + 1)/2); | |
} | |
#endif // #ifndef MODIFYPLUGIN | |
#ifdef SHARPEN | |
void SmartRG(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength) | |
#else | |
void SmartRG(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
#endif | |
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
#ifdef SHARPEN | |
__asm mov spitch, eax | |
__asm mov eax, strength | |
#endif | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2] | |
__asm SSE_RMOVE SSE1, SSE0 | |
__asm pminub SSE0, SSE7 | |
__asm pmaxub SSE1, SSE7 | |
__asm SSE3_MOVE SSE4, [esi + 2] | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx] | |
__asm SSE_RMOVE SSE5, SSE4 | |
__asm SSE3_MOVE SSE2, [esi + 1] | |
__asm pminub SSE4, SSE7 | |
__asm pmaxub SSE5, SSE7 | |
__asm pmaxub SSE0, SSE4 | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1] | |
__asm SSE_RMOVE SSE3, SSE2 | |
__asm pminub SSE1, SSE5 | |
__asm pminub SSE2, SSE7 | |
__asm SSE3_MOVE SSE4, [esi + ebx] | |
__asm pmaxub SSE3, SSE7 | |
__asm movd [edi], SSE4 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE3_MOVE SSE7, [esi + ebx + 2] | |
__asm SSE_RMOVE SSE5, SSE4 | |
__asm pminub SSE1, SSE3 | |
__asm pminub SSE4, SSE7 | |
__asm pmaxub SSE5, SSE7 | |
__asm pmaxub SSE0, SSE4 | |
__asm pminub SSE1, SSE5 | |
__asm SSE_RMOVE SSE2, SSE0 | |
#if (ISSE > 1) || defined(SHLUR) | |
__asm SSE3_MOVE SSE4, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE0, SSE1 | |
__asm pmaxub SSE2, SSE1 | |
#ifdef SHLUR | |
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
__asm SSE_MOVE [edi + 1], SSE4 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE0, SSE4 | |
#else | |
__asm pmaxub SSE0, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE0, SSE2 | |
__asm SSE_MOVE [edi + 1], SSE0 | |
#endif // SHLUR | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2] | |
__asm SSE_RMOVE SSE1, SSE0 | |
__asm SSE3_MOVE SSE4, [esi + 2] | |
__asm pminub SSE0, SSE7 | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx] | |
__asm SSE_RMOVE SSE5, SSE4 | |
__asm pmaxub SSE1, SSE7 | |
__asm pminub SSE4, SSE6 | |
__asm SSE3_MOVE SSE2, [esi + 1] | |
__asm pmaxub SSE5, SSE6 | |
__asm pmaxub SSE0, SSE4 | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1] | |
__asm SSE_RMOVE SSE3, SSE2 | |
__asm pminub SSE1, SSE5 | |
__asm pminub SSE2, SSE7 | |
__asm SSE3_MOVE SSE4, [esi + ebx] | |
__asm pmaxub SSE3, SSE7 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE3_MOVE SSE7, [esi + ebx + 2] | |
__asm SSE_RMOVE SSE5, SSE4 | |
__asm pminub SSE1, SSE3 | |
__asm pminub SSE4, SSE7 | |
__asm pmaxub SSE5, SSE7 | |
__asm pmaxub SSE0, SSE4 | |
__asm pminub SSE1, SSE5 | |
__asm SSE_RMOVE SSE2, SSE0 | |
#if (ISSE > 1) || defined(SHLUR) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE4, [edi] | |
#else | |
__asm SSE3_MOVE SSE4, [esi + ebx + 1] | |
#endif | |
#endif | |
#if MODIFYPLUGIN > 0 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE0, SSE1 | |
__asm pmaxub SSE2, SSE1 | |
#if MODIFYPLUGIN > 0 | |
__asm pminub SSE0, SSE5 | |
__asm pmaxub SSE2, SSE5 | |
#endif | |
#ifdef SHLUR | |
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE4 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE0, SSE4 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE0, [edi] | |
#else | |
__asm pmaxub SSE0, [esi + ebx + 1] | |
#endif | |
#endif | |
__asm pminub SSE0, SSE2 | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE0 | |
#endif // SHLUR | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm add edi, edx | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2] | |
__asm SSE_RMOVE SSE1, SSE0 | |
__asm SSE3_MOVE SSE4, [esi + 2] | |
__asm pminub SSE0, SSE7 | |
__asm pmaxub SSE1, SSE7 | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx] | |
__asm SSE_RMOVE SSE5, SSE4 | |
__asm SSE3_MOVE SSE2, [esi + 1] | |
__asm pminub SSE4, SSE6 | |
__asm pmaxub SSE5, SSE6 | |
__asm pmaxub SSE0, SSE4 | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1] | |
__asm SSE_RMOVE SSE3, SSE2 | |
__asm pminub SSE1, SSE5 | |
__asm pminub SSE2, SSE7 | |
__asm SSE3_MOVE SSE4, [esi + ebx] | |
__asm pmaxub SSE3, SSE7 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE3_MOVE SSE7, [esi + ebx + 2] | |
__asm SSE_RMOVE SSE5, SSE4 | |
#ifndef MODIFYPLUGIN | |
__asm SSE_MOVE [edi + 1], SSE7 | |
#endif | |
__asm pminub SSE1, SSE3 | |
__asm pminub SSE4, SSE7 | |
__asm pmaxub SSE5, SSE7 | |
__asm pmaxub SSE0, SSE4 | |
__asm pminub SSE1, SSE5 | |
__asm SSE_RMOVE SSE2, SSE0 | |
#if (ISSE > 1) || defined(SHLUR) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE4, [edi] | |
#else | |
__asm SSE3_MOVE SSE4, [esi + ebx + 1] | |
#endif | |
#endif | |
#if MODIFYPLUGIN > 0 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE0, SSE1 | |
__asm pmaxub SSE2, SSE1 | |
#if MODIFYPLUGIN > 0 | |
__asm pminub SSE0, SSE5 | |
__asm pmaxub SSE2, SSE5 | |
#endif | |
#ifdef SHLUR | |
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
#ifdef SHARPEN | |
__asm add esi, spitch | |
#else | |
__asm add esi, eax | |
#endif | |
__asm SSE_MOVE [edi], SSE4 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE0, SSE4 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE0, [edi] | |
#else | |
__asm pmaxub SSE0, [esi + ebx + 1] | |
#endif | |
#endif | |
__asm pminub SSE0, SSE2 | |
__asm add esi, eax | |
__asm SSE_MOVE [edi], SSE0 | |
#endif // SHLUR | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
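// Reference sketch (kept out of the build): the clipping kernel of SmartRG in plain
// C++ for the default build (no MODIFYPLUGIN/SHLUR). The centre pixel is clipped to
// the interval spanned by max(pair minima) and min(pair maxima) over the four
// opposing neighbour pairs. The function name smartrg_pixel is illustrative only.
#if 0
static inline BYTE smartrg_pixel(const BYTE *sp, int spitch)
{
    const int c = sp[spitch + 1];
    const int pairs[4][2] =
    {
        { sp[0],      sp[2*spitch + 2] },   // NW / SE
        { sp[2],      sp[2*spitch]     },   // NE / SW
        { sp[1],      sp[2*spitch + 1] },   // N  / S
        { sp[spitch], sp[spitch + 2]   }    // W  / E
    };
    int lower = 0, upper = 255;
    for (int i = 0; i < 4; ++i)
    {
        int mx = pairs[i][0] > pairs[i][1] ? pairs[i][0] : pairs[i][1];
        int mn = pairs[i][0] + pairs[i][1] - mx;
        if (mn > lower) lower = mn;
        if (mx < upper) upper = mx;
    }
    // the bounds may cross; swap so the clip interval is well formed
    if (lower > upper) { int t = lower; lower = upper; upper = t; }
    int r = c < lower ? lower : c;
    return (BYTE)(r > upper ? upper : r);
}
#endif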
#ifdef SHARPEN | |
void SmartRGC(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength) | |
#else | |
void SmartRGC(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
#endif | |
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
#ifdef SHARPEN | |
__asm mov spitch, eax | |
__asm mov eax, strength | |
#endif | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE7, [esi] | |
__asm SSE3_MOVE SSE6, [esi + 1] | |
__asm SSE_RMOVE SSE0, SSE7 | |
__asm SSE_RMOVE SSE1, SSE6 | |
__asm SSE3_MOVE SSE5, [esi + 2] | |
__asm pminub SSE0, SSE6 | |
__asm SSE_RMOVE SSE2, SSE5 | |
__asm pmaxub SSE1, SSE7 | |
__asm pminub SSE2, SSE6 | |
__asm SSE3_MOVE SSE4, [esi + ebx + 2] | |
__asm pmaxub SSE6, SSE5 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE_RMOVE SSE3, SSE4 | |
__asm pminub SSE1, SSE6 | |
__asm pminub SSE3, SSE5 | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2] | |
__asm pmaxub SSE5, SSE4 | |
__asm pmaxub SSE0, SSE3 | |
__asm SSE_RMOVE SSE2, SSE6 | |
__asm pminub SSE1, SSE5 | |
__asm pminub SSE2, SSE4 | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1] | |
__asm pmaxub SSE4, SSE6 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE_RMOVE SSE3, SSE5 | |
__asm pminub SSE1, SSE4 | |
__asm pminub SSE3, SSE6 | |
__asm SSE3_MOVE SSE4, [esi + 2*ebx] | |
__asm pmaxub SSE6, SSE5 | |
__asm pmaxub SSE0, SSE3 | |
__asm SSE_RMOVE SSE2, SSE4 | |
__asm pminub SSE1, SSE6 | |
__asm pminub SSE2, SSE5 | |
__asm SSE3_MOVE SSE6, [esi + ebx] | |
__asm pmaxub SSE5, SSE4 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE_RMOVE SSE3, SSE6 | |
__asm pminub SSE1, SSE5 | |
__asm movd [edi], SSE6 | |
__asm pminub SSE3, SSE4 | |
__asm SSE_RMOVE SSE2, SSE7 | |
__asm pmaxub SSE4, SSE6 | |
__asm pmaxub SSE0, SSE3 | |
__asm pminub SSE1, SSE4 | |
__asm pminub SSE2, SSE6 | |
__asm pmaxub SSE7, SSE6 | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE1, SSE7 | |
__asm SSE_RMOVE SSE2, SSE0 | |
#if (ISSE > 1) || defined(SHLUR) | |
__asm SSE3_MOVE SSE4, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE0, SSE1 | |
__asm pmaxub SSE2, SSE1 | |
#ifdef SHLUR | |
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
__asm SSE_MOVE [edi + 1], SSE4 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE0, SSE4 | |
#else | |
__asm pmaxub SSE0, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE0, SSE2 | |
__asm SSE_MOVE [edi + 1], SSE0 | |
#endif // SHLUR | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
__asm SSE3_MOVE SSE7, [esi] | |
__asm SSE3_MOVE SSE6, [esi + 1] | |
__asm SSE_RMOVE SSE0, SSE7 | |
__asm SSE_RMOVE SSE1, SSE6 | |
__asm SSE3_MOVE SSE5, [esi + 2] | |
__asm pminub SSE0, SSE6 | |
__asm SSE_RMOVE SSE2, SSE5 | |
__asm pmaxub SSE1, SSE7 | |
__asm pminub SSE2, SSE6 | |
__asm SSE3_MOVE SSE4, [esi + ebx + 2] | |
__asm pmaxub SSE6, SSE5 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE_RMOVE SSE3, SSE4 | |
__asm pminub SSE1, SSE6 | |
__asm pminub SSE3, SSE5 | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2] | |
__asm pmaxub SSE5, SSE4 | |
__asm pmaxub SSE0, SSE3 | |
__asm SSE_RMOVE SSE2, SSE6 | |
__asm pminub SSE1, SSE5 | |
__asm pminub SSE2, SSE4 | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1] | |
__asm pmaxub SSE4, SSE6 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE_RMOVE SSE3, SSE5 | |
__asm pminub SSE1, SSE4 | |
__asm pminub SSE3, SSE6 | |
__asm SSE3_MOVE SSE4, [esi + 2*ebx] | |
__asm pmaxub SSE6, SSE5 | |
__asm pmaxub SSE0, SSE3 | |
__asm SSE_RMOVE SSE2, SSE4 | |
__asm pminub SSE1, SSE6 | |
__asm pminub SSE2, SSE5 | |
__asm SSE3_MOVE SSE6, [esi + ebx] | |
__asm pmaxub SSE5, SSE4 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE_RMOVE SSE3, SSE6 | |
__asm pminub SSE1, SSE5 | |
__asm pminub SSE3, SSE4 | |
__asm SSE_RMOVE SSE2, SSE7 | |
__asm pmaxub SSE4, SSE6 | |
__asm pmaxub SSE0, SSE3 | |
__asm pminub SSE1, SSE4 | |
__asm pminub SSE2, SSE6 | |
__asm pmaxub SSE7, SSE6 | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE1, SSE7 | |
__asm SSE_RMOVE SSE2, SSE0 | |
#if (ISSE > 1) || defined(SHLUR) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE4, [edi] | |
#else | |
__asm SSE3_MOVE SSE4, [esi + ebx + 1] | |
#endif | |
#endif | |
#if MODIFYPLUGIN > 0 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE0, SSE1 | |
__asm pmaxub SSE2, SSE1 | |
#if MODIFYPLUGIN > 0 | |
__asm pminub SSE0, SSE5 | |
__asm pmaxub SSE2, SSE5 | |
#endif | |
#ifdef SHLUR | |
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE4 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE0, SSE4 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE0, [edi] | |
#else | |
__asm pmaxub SSE0, [esi + ebx + 1] | |
#endif | |
#endif | |
__asm pminub SSE0, SSE2 | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE0 | |
#endif // SHLUR | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm SSE3_MOVE SSE7, [esi] | |
__asm add edi, edx | |
__asm SSE3_MOVE SSE6, [esi + 1] | |
__asm SSE_RMOVE SSE0, SSE7 | |
__asm SSE_RMOVE SSE1, SSE6 | |
__asm SSE3_MOVE SSE5, [esi + 2] | |
__asm pminub SSE0, SSE6 | |
__asm SSE_RMOVE SSE2, SSE5 | |
__asm pmaxub SSE1, SSE7 | |
__asm pminub SSE2, SSE6 | |
__asm SSE3_MOVE SSE4, [esi + ebx + 2] | |
__asm pmaxub SSE6, SSE5 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE_RMOVE SSE3, SSE4 | |
__asm pminub SSE1, SSE6 | |
__asm SSE_MOVE [edi + 1], SSE4 | |
__asm pminub SSE3, SSE5 | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2] | |
__asm pmaxub SSE5, SSE4 | |
__asm pmaxub SSE0, SSE3 | |
__asm SSE_RMOVE SSE2, SSE6 | |
__asm pminub SSE1, SSE5 | |
__asm pminub SSE2, SSE4 | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1] | |
__asm pmaxub SSE4, SSE6 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE_RMOVE SSE3, SSE5 | |
__asm pminub SSE1, SSE4 | |
__asm pminub SSE3, SSE6 | |
__asm SSE3_MOVE SSE4, [esi + 2*ebx] | |
__asm pmaxub SSE6, SSE5 | |
__asm pmaxub SSE0, SSE3 | |
__asm SSE_RMOVE SSE2, SSE4 | |
__asm pminub SSE1, SSE6 | |
__asm pminub SSE2, SSE5 | |
__asm SSE3_MOVE SSE6, [esi + ebx] | |
__asm pmaxub SSE5, SSE4 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE_RMOVE SSE3, SSE6 | |
__asm pminub SSE1, SSE5 | |
__asm pminub SSE3, SSE4 | |
__asm SSE_RMOVE SSE2, SSE7 | |
__asm pmaxub SSE4, SSE6 | |
__asm pmaxub SSE0, SSE3 | |
__asm pminub SSE1, SSE4 | |
__asm pminub SSE2, SSE6 | |
__asm pmaxub SSE7, SSE6 | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE1, SSE7 | |
__asm SSE_RMOVE SSE2, SSE0 | |
#if (ISSE > 1) || defined(SHLUR) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE4, [edi] | |
#else | |
__asm SSE3_MOVE SSE4, [esi + ebx + 1] | |
#endif | |
#endif | |
#if MODIFYPLUGIN > 0 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE0, SSE1 | |
__asm pmaxub SSE2, SSE1 | |
#if MODIFYPLUGIN > 0 | |
__asm pminub SSE0, SSE5 | |
__asm pmaxub SSE2, SSE5 | |
#endif | |
#ifdef SHLUR | |
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
#ifdef SHARPEN | |
__asm add esi, spitch | |
#else | |
__asm add esi, eax | |
#endif | |
__asm SSE_MOVE [edi], SSE4 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE0, SSE4 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE0, [edi] | |
#else | |
__asm pmaxub SSE0, [esi + ebx + 1] | |
#endif | |
#endif | |
__asm pminub SSE0, SSE2 | |
__asm add esi, eax | |
__asm SSE_MOVE [edi], SSE0 | |
#endif // SHLUR | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
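// SmartRGC uses the same clip-to-pair-extremes skeleton as SmartRG, but the pairs
// are the eight pairs of adjacent neighbours around the 3x3 ring instead of the
// four opposing pairs.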
#ifdef SHARPEN | |
void SmartRGCL(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength) | |
#else | |
void SmartRGCL(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
#endif | |
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
#ifdef SHARPEN | |
__asm mov spitch, eax | |
__asm mov eax, strength | |
#endif | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE4, [esi] | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 2] | |
__asm SSE_RMOVE SSE0, SSE4 | |
__asm SSE_RMOVE SSE1, SSE5 | |
__asm SSE3_MOVE SSE6, [esi + 1] | |
__asm pminub SSE0, SSE5 | |
__asm SSE_RMOVE SSE2, SSE6 | |
__asm pmaxub SSE1, SSE4 | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1] | |
__asm pminub SSE2, SSE4 | |
__asm SSE_RMOVE SSE3, SSE7 | |
__asm pmaxub SSE4, SSE6 | |
__asm pminub SSE3, SSE5 | |
__asm pmaxub SSE0, SSE2 | |
__asm pmaxub SSE5, SSE7 | |
__asm pminub SSE1, SSE4 | |
__asm pmaxub SSE0, SSE3 | |
__asm pminub SSE1, SSE5 | |
__asm SSE3_MOVE SSE4, [esi + 2] | |
__asm SSE_RMOVE SSE2, SSE6 | |
__asm SSE_RMOVE SSE3, SSE7 | |
__asm SSE_RMOVE SSE5, SSE4 | |
__asm pminub SSE2, SSE7 | |
__asm pmaxub SSE3, SSE6 | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE5, SSE6 | |
__asm pminub SSE1, SSE3 | |
__asm pmaxub SSE6, SSE4 | |
__asm pmaxub SSE0, SSE5 | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx] | |
__asm pminub SSE1, SSE6 | |
__asm SSE_RMOVE SSE2, SSE5 | |
__asm pminub SSE5, SSE7 | |
__asm pmaxub SSE7, SSE2 | |
__asm pmaxub SSE0, SSE5 | |
__asm pminub SSE1, SSE7 | |
__asm SSE_RMOVE SSE5, SSE2 | |
__asm SSE_RMOVE SSE3, SSE4 | |
__asm SSE3_MOVE SSE6, [esi + ebx + 2] | |
__asm pminub SSE2, SSE4 | |
__asm pmaxub SSE3, SSE5 | |
__asm SSE_RMOVE SSE7, SSE6 | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE7, SSE4 | |
__asm SSE3_MOVE SSE2, [esi + ebx] | |
__asm pminub SSE1, SSE3 | |
__asm pmaxub SSE4, SSE6 | |
__asm SSE_RMOVE SSE3, SSE2 | |
__asm pmaxub SSE0, SSE7 | |
__asm movd [edi], SSE2 | |
__asm pminub SSE3, SSE5 | |
__asm pminub SSE1, SSE4 | |
__asm pmaxub SSE5, SSE2 | |
__asm pmaxub SSE0, SSE3 | |
__asm pminub SSE1, SSE5 | |
__asm SSE_RMOVE SSE7, SSE2 | |
__asm SSE_RMOVE SSE3, SSE6 | |
__asm SSE3_MOVE SSE4, [esi + 2*ebx + 2] | |
__asm pminub SSE2, SSE6 | |
__asm pmaxub SSE3, SSE7 | |
__asm SSE_RMOVE SSE5, SSE4 | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE5, SSE6 | |
__asm SSE3_MOVE SSE2, [esi] | |
__asm pminub SSE1, SSE3 | |
__asm pmaxub SSE6, SSE4 | |
__asm pmaxub SSE0, SSE5 | |
__asm SSE_RMOVE SSE3, SSE2 | |
__asm pminub SSE1, SSE6 | |
__asm pminub SSE2, SSE7 | |
__asm pmaxub SSE3, SSE7 | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE1, SSE3 | |
__asm SSE_RMOVE SSE2, SSE0 | |
#if (ISSE > 1) || defined(SHLUR) | |
__asm SSE3_MOVE SSE4, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE0, SSE1 | |
__asm pmaxub SSE2, SSE1 | |
#ifdef SHLUR | |
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
__asm SSE_MOVE [edi + 1], SSE4 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE0, SSE4 | |
#else | |
__asm pmaxub SSE0, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE0, SSE2 | |
__asm SSE_MOVE [edi + 1], SSE0 | |
#endif // SHLUR | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
__asm SSE3_MOVE SSE4, [esi] | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 2] | |
__asm SSE_RMOVE SSE0, SSE4 | |
__asm SSE_RMOVE SSE1, SSE5 | |
__asm SSE3_MOVE SSE6, [esi + 1] | |
__asm pminub SSE0, SSE5 | |
__asm SSE_RMOVE SSE2, SSE6 | |
__asm pmaxub SSE1, SSE4 | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1] | |
__asm pminub SSE2, SSE4 | |
__asm SSE_RMOVE SSE3, SSE7 | |
__asm pmaxub SSE4, SSE6 | |
__asm pminub SSE3, SSE5 | |
__asm pmaxub SSE0, SSE2 | |
__asm pmaxub SSE5, SSE7 | |
__asm pminub SSE1, SSE4 | |
__asm pmaxub SSE0, SSE3 | |
__asm pminub SSE1, SSE5 | |
__asm SSE3_MOVE SSE4, [esi + 2] | |
__asm SSE_RMOVE SSE2, SSE6 | |
__asm SSE_RMOVE SSE3, SSE7 | |
__asm SSE_RMOVE SSE5, SSE4 | |
__asm pminub SSE2, SSE7 | |
__asm pmaxub SSE3, SSE6 | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE5, SSE6 | |
__asm pminub SSE1, SSE3 | |
__asm pmaxub SSE6, SSE4 | |
__asm pmaxub SSE0, SSE5 | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx] | |
__asm pminub SSE1, SSE6 | |
__asm SSE_RMOVE SSE2, SSE5 | |
__asm pminub SSE5, SSE7 | |
__asm pmaxub SSE7, SSE2 | |
__asm pmaxub SSE0, SSE5 | |
__asm pminub SSE1, SSE7 | |
__asm SSE_RMOVE SSE5, SSE2 | |
__asm SSE_RMOVE SSE3, SSE4 | |
__asm SSE3_MOVE SSE6, [esi + ebx + 2] | |
__asm pminub SSE2, SSE4 | |
__asm pmaxub SSE3, SSE5 | |
__asm SSE_RMOVE SSE7, SSE6 | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE7, SSE4 | |
__asm SSE3_MOVE SSE2, [esi + ebx] | |
__asm pminub SSE1, SSE3 | |
__asm pmaxub SSE4, SSE6 | |
__asm SSE_RMOVE SSE3, SSE2 | |
__asm pmaxub SSE0, SSE7 | |
__asm pminub SSE3, SSE5 | |
__asm pminub SSE1, SSE4 | |
__asm pmaxub SSE5, SSE2 | |
__asm pmaxub SSE0, SSE3 | |
__asm pminub SSE1, SSE5 | |
__asm SSE_RMOVE SSE7, SSE2 | |
__asm SSE_RMOVE SSE3, SSE6 | |
__asm SSE3_MOVE SSE4, [esi + 2*ebx + 2] | |
__asm pminub SSE2, SSE6 | |
__asm pmaxub SSE3, SSE7 | |
__asm SSE_RMOVE SSE5, SSE4 | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE5, SSE6 | |
__asm SSE3_MOVE SSE2, [esi] | |
__asm pminub SSE1, SSE3 | |
__asm pmaxub SSE6, SSE4 | |
__asm pmaxub SSE0, SSE5 | |
__asm SSE_RMOVE SSE3, SSE2 | |
__asm pminub SSE1, SSE6 | |
__asm pminub SSE2, SSE7 | |
__asm pmaxub SSE3, SSE7 | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE1, SSE3 | |
__asm SSE_RMOVE SSE2, SSE0 | |
#if (ISSE > 1) || defined(SHLUR) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE4, [edi] | |
#else | |
__asm SSE3_MOVE SSE4, [esi + ebx + 1] | |
#endif | |
#endif | |
#if MODIFYPLUGIN > 0 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE0, SSE1 | |
__asm pmaxub SSE2, SSE1 | |
#if MODIFYPLUGIN > 0 | |
__asm pminub SSE0, SSE5 | |
__asm pmaxub SSE2, SSE5 | |
#endif | |
#ifdef SHLUR | |
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE4 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE0, SSE4 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE0, [edi] | |
#else | |
__asm pmaxub SSE0, [esi + ebx + 1] | |
#endif | |
#endif | |
__asm pminub SSE0, SSE2 | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE0 | |
#endif // SHLUR | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm SSE3_MOVE SSE4, [esi] | |
__asm add edi, edx | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 2] | |
__asm SSE_RMOVE SSE0, SSE4 | |
__asm SSE_RMOVE SSE1, SSE5 | |
__asm SSE3_MOVE SSE6, [esi + 1] | |
__asm pminub SSE0, SSE5 | |
__asm SSE_RMOVE SSE2, SSE6 | |
__asm pmaxub SSE1, SSE4 | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1] | |
__asm pminub SSE2, SSE4 | |
__asm SSE_RMOVE SSE3, SSE7 | |
__asm pmaxub SSE4, SSE6 | |
__asm pminub SSE3, SSE5 | |
__asm pmaxub SSE0, SSE2 | |
__asm pmaxub SSE5, SSE7 | |
__asm pminub SSE1, SSE4 | |
__asm pmaxub SSE0, SSE3 | |
__asm pminub SSE1, SSE5 | |
__asm SSE3_MOVE SSE4, [esi + 2] | |
__asm SSE_RMOVE SSE2, SSE6 | |
__asm SSE_RMOVE SSE3, SSE7 | |
__asm SSE_RMOVE SSE5, SSE4 | |
__asm pminub SSE2, SSE7 | |
__asm pmaxub SSE3, SSE6 | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE5, SSE6 | |
__asm pminub SSE1, SSE3 | |
__asm pmaxub SSE6, SSE4 | |
__asm pmaxub SSE0, SSE5 | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx] | |
__asm pminub SSE1, SSE6 | |
__asm SSE_RMOVE SSE2, SSE5 | |
__asm pminub SSE5, SSE7 | |
__asm pmaxub SSE7, SSE2 | |
__asm pmaxub SSE0, SSE5 | |
__asm pminub SSE1, SSE7 | |
__asm SSE_RMOVE SSE5, SSE2 | |
__asm SSE_RMOVE SSE3, SSE4 | |
__asm SSE3_MOVE SSE6, [esi + ebx + 2] | |
__asm pminub SSE2, SSE4 | |
__asm pmaxub SSE3, SSE5 | |
__asm SSE_RMOVE SSE7, SSE6 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE_MOVE [edi + 1], SSE6 | |
__asm pminub SSE7, SSE4 | |
__asm SSE3_MOVE SSE2, [esi + ebx] | |
__asm pminub SSE1, SSE3 | |
__asm pmaxub SSE4, SSE6 | |
__asm SSE_RMOVE SSE3, SSE2 | |
__asm pmaxub SSE0, SSE7 | |
__asm pminub SSE3, SSE5 | |
__asm pminub SSE1, SSE4 | |
__asm pmaxub SSE5, SSE2 | |
__asm pmaxub SSE0, SSE3 | |
__asm pminub SSE1, SSE5 | |
__asm SSE_RMOVE SSE7, SSE2 | |
__asm SSE_RMOVE SSE3, SSE6 | |
__asm SSE3_MOVE SSE4, [esi + 2*ebx + 2] | |
__asm pminub SSE2, SSE6 | |
__asm pmaxub SSE3, SSE7 | |
__asm SSE_RMOVE SSE5, SSE4 | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE5, SSE6 | |
__asm SSE3_MOVE SSE2, [esi] | |
__asm pminub SSE1, SSE3 | |
__asm pmaxub SSE6, SSE4 | |
__asm pmaxub SSE0, SSE5 | |
__asm SSE_RMOVE SSE3, SSE2 | |
__asm pminub SSE1, SSE6 | |
__asm pminub SSE2, SSE7 | |
__asm pmaxub SSE3, SSE7 | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE1, SSE3 | |
__asm SSE_RMOVE SSE2, SSE0 | |
#if (ISSE > 1) || defined(SHLUR) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE4, [edi] | |
#else | |
__asm SSE3_MOVE SSE4, [esi + ebx + 1] | |
#endif | |
#endif | |
#if MODIFYPLUGIN > 0 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE0, SSE1 | |
__asm pmaxub SSE2, SSE1 | |
#if MODIFYPLUGIN > 0 | |
__asm pminub SSE0, SSE5 | |
__asm pmaxub SSE2, SSE5 | |
#endif | |
#ifdef SHLUR | |
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
#ifdef SHARPEN | |
__asm add esi, spitch | |
#else | |
__asm add esi, eax | |
#endif | |
__asm SSE_MOVE [edi], SSE4 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE0, SSE4 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE0, [edi] | |
#else | |
__asm pmaxub SSE0, [esi + ebx + 1] | |
#endif | |
#endif | |
__asm pminub SSE0, SSE2 | |
__asm add esi, eax | |
__asm SSE_MOVE [edi], SSE0 | |
#endif // SHLUR | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
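// SmartRGCL: the same clipping skeleton once more with a different, larger pair set
// that mixes opposing, vertical/horizontal and adjacent neighbour pairs; only the
// pairing differs from SmartRG/SmartRGC.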
#ifdef SHARPEN | |
void SmartRGCL2(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength) | |
#else | |
void SmartRGCL2(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
#endif | |
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
#ifdef SHARPEN | |
__asm mov spitch, eax | |
__asm mov eax, strength | |
#endif | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE7, [esi] | |
__asm SSE3_MOVE SSE6, [esi + 1] | |
__asm SSE_RMOVE SSE0, SSE7 | |
__asm SSE_RMOVE SSE1, SSE6 | |
__asm SSE3_MOVE SSE5, [esi + 2] | |
__asm pminub SSE0, SSE6 | |
__asm SSE_RMOVE SSE2, SSE5 | |
__asm pmaxub SSE1, SSE7 | |
__asm pminub SSE2, SSE6 | |
__asm SSE3_MOVE SSE4, [esi + ebx + 2] | |
__asm pmaxub SSE6, SSE5 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE_RMOVE SSE3, SSE4 | |
__asm pminub SSE1, SSE6 | |
__asm pminub SSE3, SSE5 | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2] | |
__asm pmaxub SSE5, SSE4 | |
__asm pmaxub SSE0, SSE3 | |
__asm SSE_RMOVE SSE2, SSE6 | |
__asm pminub SSE1, SSE5 | |
__asm pminub SSE2, SSE4 | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1] | |
__asm pmaxub SSE4, SSE6 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE_RMOVE SSE3, SSE5 | |
__asm pminub SSE1, SSE4 | |
__asm pminub SSE3, SSE6 | |
__asm SSE3_MOVE SSE4, [esi + 2*ebx] | |
__asm pmaxub SSE6, SSE5 | |
__asm pmaxub SSE0, SSE3 | |
__asm SSE_RMOVE SSE2, SSE4 | |
__asm pminub SSE1, SSE6 | |
__asm pminub SSE2, SSE5 | |
__asm SSE3_MOVE SSE6, [esi + ebx] | |
__asm pmaxub SSE5, SSE4 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE_RMOVE SSE3, SSE6 | |
__asm pminub SSE1, SSE5 | |
__asm movd [edi], SSE6 | |
__asm pminub SSE3, SSE4 | |
__asm SSE_RMOVE SSE2, SSE7 | |
__asm pmaxub SSE4, SSE6 | |
__asm pmaxub SSE0, SSE3 | |
__asm pminub SSE1, SSE4 | |
__asm pminub SSE2, SSE6 | |
__asm pmaxub SSE7, SSE6 | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE1, SSE7 | |
__asm SSE3_MOVE SSE2, [esi] | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2] | |
__asm SSE_RMOVE SSE3, SSE2 | |
__asm SSE3_MOVE SSE4, [esi + 2] | |
__asm pminub SSE2, SSE7 | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx] | |
__asm SSE_RMOVE SSE5, SSE4 | |
__asm pmaxub SSE3, SSE7 | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE1, SSE3 | |
__asm pminub SSE4, SSE6 | |
__asm SSE3_MOVE SSE2, [esi + 1] | |
__asm pmaxub SSE5, SSE6 | |
__asm pmaxub SSE0, SSE4 | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1] | |
__asm SSE_RMOVE SSE3, SSE2 | |
__asm pminub SSE1, SSE5 | |
__asm pminub SSE2, SSE7 | |
__asm SSE3_MOVE SSE4, [esi + ebx] | |
__asm pmaxub SSE3, SSE7 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE3_MOVE SSE7, [esi + ebx + 2] | |
__asm SSE_RMOVE SSE5, SSE4 | |
__asm pminub SSE1, SSE3 | |
__asm pminub SSE4, SSE7 | |
__asm pmaxub SSE5, SSE7 | |
__asm pmaxub SSE0, SSE4 | |
__asm pminub SSE1, SSE5 | |
__asm SSE_RMOVE SSE2, SSE0 | |
#if (ISSE > 1) || defined(SHLUR) | |
__asm SSE3_MOVE SSE4, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE0, SSE1 | |
__asm pmaxub SSE2, SSE1 | |
#ifdef SHLUR | |
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
__asm SSE_MOVE [edi + 1], SSE4 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE0, SSE4 | |
#else | |
__asm pmaxub SSE0, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE0, SSE2 | |
__asm SSE_MOVE [edi + 1], SSE0 | |
#endif // SHLUR | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
__asm SSE3_MOVE SSE7, [esi] | |
__asm SSE3_MOVE SSE6, [esi + 1] | |
__asm SSE_RMOVE SSE0, SSE7 | |
__asm SSE_RMOVE SSE1, SSE6 | |
__asm SSE3_MOVE SSE5, [esi + 2] | |
__asm pminub SSE0, SSE6 | |
__asm SSE_RMOVE SSE2, SSE5 | |
__asm pmaxub SSE1, SSE7 | |
__asm pminub SSE2, SSE6 | |
__asm SSE3_MOVE SSE4, [esi + ebx + 2] | |
__asm pmaxub SSE6, SSE5 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE_RMOVE SSE3, SSE4 | |
__asm pminub SSE1, SSE6 | |
__asm pminub SSE3, SSE5 | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2] | |
__asm pmaxub SSE5, SSE4 | |
__asm pmaxub SSE0, SSE3 | |
__asm SSE_RMOVE SSE2, SSE6 | |
__asm pminub SSE1, SSE5 | |
__asm pminub SSE2, SSE4 | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1] | |
__asm pmaxub SSE4, SSE6 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE_RMOVE SSE3, SSE5 | |
__asm pminub SSE1, SSE4 | |
__asm pminub SSE3, SSE6 | |
__asm SSE3_MOVE SSE4, [esi + 2*ebx] | |
__asm pmaxub SSE6, SSE5 | |
__asm pmaxub SSE0, SSE3 | |
__asm SSE_RMOVE SSE2, SSE4 | |
__asm pminub SSE1, SSE6 | |
__asm pminub SSE2, SSE5 | |
__asm SSE3_MOVE SSE6, [esi + ebx] | |
__asm pmaxub SSE5, SSE4 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE_RMOVE SSE3, SSE6 | |
__asm pminub SSE1, SSE5 | |
__asm pminub SSE3, SSE4 | |
__asm SSE_RMOVE SSE2, SSE7 | |
__asm pmaxub SSE4, SSE6 | |
__asm pmaxub SSE0, SSE3 | |
__asm pminub SSE1, SSE4 | |
__asm pminub SSE2, SSE6 | |
__asm pmaxub SSE7, SSE6 | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE1, SSE7 | |
__asm SSE3_MOVE SSE2, [esi] | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2] | |
__asm SSE_RMOVE SSE3, SSE2 | |
__asm SSE3_MOVE SSE4, [esi + 2] | |
__asm pminub SSE2, SSE7 | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx] | |
__asm SSE_RMOVE SSE5, SSE4 | |
__asm pmaxub SSE3, SSE7 | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE1, SSE3 | |
__asm pminub SSE4, SSE6 | |
__asm SSE3_MOVE SSE2, [esi + 1] | |
__asm pmaxub SSE5, SSE6 | |
__asm pmaxub SSE0, SSE4 | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1] | |
__asm SSE_RMOVE SSE3, SSE2 | |
__asm pminub SSE1, SSE5 | |
__asm pminub SSE2, SSE7 | |
__asm SSE3_MOVE SSE4, [esi + ebx] | |
__asm pmaxub SSE3, SSE7 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE3_MOVE SSE7, [esi + ebx + 2] | |
__asm SSE_RMOVE SSE5, SSE4 | |
__asm pminub SSE1, SSE3 | |
__asm pminub SSE4, SSE7 | |
__asm pmaxub SSE5, SSE7 | |
__asm pmaxub SSE0, SSE4 | |
__asm pminub SSE1, SSE5 | |
__asm SSE_RMOVE SSE2, SSE0 | |
#if (ISSE > 1) || defined(SHLUR) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE4, [edi] | |
#else | |
__asm SSE3_MOVE SSE4, [esi + ebx + 1] | |
#endif | |
#endif | |
#if MODIFYPLUGIN > 0 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE0, SSE1 | |
__asm pmaxub SSE2, SSE1 | |
#if MODIFYPLUGIN > 0 | |
__asm pminub SSE0, SSE5 | |
__asm pmaxub SSE2, SSE5 | |
#endif | |
#ifdef SHLUR | |
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE4 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE0, SSE4 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE0, [edi] | |
#else | |
__asm pmaxub SSE0, [esi + ebx + 1] | |
#endif | |
#endif | |
__asm pminub SSE0, SSE2 | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE0 | |
#endif // SHLUR | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
__asm SSE3_MOVE SSE7, [esi] | |
__asm SSE3_MOVE SSE6, [esi + 1] | |
__asm SSE_RMOVE SSE0, SSE7 | |
__asm SSE_RMOVE SSE1, SSE6 | |
__asm SSE3_MOVE SSE5, [esi + 2] | |
__asm pminub SSE0, SSE6 | |
__asm SSE_RMOVE SSE2, SSE5 | |
__asm pmaxub SSE1, SSE7 | |
__asm pminub SSE2, SSE6 | |
__asm SSE3_MOVE SSE4, [esi + ebx + 2] | |
__asm pmaxub SSE6, SSE5 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE_RMOVE SSE3, SSE4 | |
__asm pminub SSE1, SSE6 | |
__asm pminub SSE3, SSE5 | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2] | |
__asm pmaxub SSE5, SSE4 | |
__asm pmaxub SSE0, SSE3 | |
__asm SSE_RMOVE SSE2, SSE6 | |
__asm pminub SSE1, SSE5 | |
__asm pminub SSE2, SSE4 | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1] | |
__asm pmaxub SSE4, SSE6 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE_RMOVE SSE3, SSE5 | |
__asm pminub SSE1, SSE4 | |
__asm pminub SSE3, SSE6 | |
__asm SSE3_MOVE SSE4, [esi + 2*ebx] | |
__asm pmaxub SSE6, SSE5 | |
__asm pmaxub SSE0, SSE3 | |
__asm SSE_RMOVE SSE2, SSE4 | |
__asm pminub SSE1, SSE6 | |
__asm pminub SSE2, SSE5 | |
__asm SSE3_MOVE SSE6, [esi + ebx] | |
__asm pmaxub SSE5, SSE4 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE_RMOVE SSE3, SSE6 | |
__asm pminub SSE1, SSE5 | |
__asm pminub SSE3, SSE4 | |
__asm SSE_RMOVE SSE2, SSE7 | |
__asm pmaxub SSE4, SSE6 | |
__asm pmaxub SSE0, SSE3 | |
__asm pminub SSE1, SSE4 | |
__asm pminub SSE2, SSE6 | |
__asm pmaxub SSE7, SSE6 | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE1, SSE7 | |
__asm SSE3_MOVE SSE2, [esi] | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2] | |
__asm SSE_RMOVE SSE3, SSE2 | |
__asm SSE3_MOVE SSE4, [esi + 2] | |
__asm pminub SSE2, SSE7 | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx] | |
__asm SSE_RMOVE SSE5, SSE4 | |
__asm pmaxub SSE3, SSE7 | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE1, SSE3 | |
__asm pminub SSE4, SSE6 | |
__asm SSE3_MOVE SSE2, [esi + 1] | |
__asm pmaxub SSE5, SSE6 | |
__asm pmaxub SSE0, SSE4 | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1] | |
__asm SSE_RMOVE SSE3, SSE2 | |
__asm pminub SSE1, SSE5 | |
__asm pminub SSE2, SSE7 | |
__asm SSE3_MOVE SSE4, [esi + ebx] | |
__asm pmaxub SSE3, SSE7 | |
__asm pmaxub SSE0, SSE2 | |
__asm SSE3_MOVE SSE7, [esi + ebx + 2] | |
__asm SSE_RMOVE SSE5, SSE4 | |
__asm pminub SSE1, SSE3 | |
__asm pminub SSE4, SSE7 | |
__asm pmaxub SSE5, SSE7 | |
__asm pmaxub SSE0, SSE4 | |
#ifndef MODIFYPLUGIN | |
__asm SSE_MOVE [edi + 1], SSE7 | |
#endif | |
__asm pminub SSE1, SSE5 | |
__asm SSE_RMOVE SSE2, SSE0 | |
#if (ISSE > 1) || defined(SHLUR) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE4, [edi] | |
#else | |
__asm SSE3_MOVE SSE4, [esi + ebx + 1] | |
#endif | |
#endif | |
#if MODIFYPLUGIN > 0 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE0, SSE1 | |
__asm pmaxub SSE2, SSE1 | |
#if MODIFYPLUGIN > 0 | |
__asm pminub SSE0, SSE5 | |
__asm pmaxub SSE2, SSE5 | |
#endif | |
#ifdef SHLUR | |
sharpen(SSE4, SSE0, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
#ifdef SHARPEN | |
__asm add esi, spitch | |
#else | |
__asm add esi, eax | |
#endif | |
__asm SSE_MOVE [edi], SSE4 | |
#else | |
#if ISSE > 1 | |
__asm pmaxub SSE0, SSE4 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE0, [edi] | |
#else | |
__asm pmaxub SSE0, [esi + ebx + 1] | |
#endif | |
#endif | |
__asm pminub SSE0, SSE2 | |
__asm add esi, eax | |
__asm SSE_MOVE [edi], SSE0 | |
#endif // SHLUR | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
#ifndef MODIFYPLUGIN | |
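// SmartAvgs computes two rounded averages of the pixels at mem1 and mem2: | |
// upavg = (mem1 + mem2 + 1)/2 (pavgb rounds halves up) and downavg, which, | |
// by first subtracting the bias, rounds down when bias is a vector of 1s. | |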
#define SmartAvgs(upavg, downavg, mem1, mem2, bias, reg2)\ | |
__asm SSE3_MOVE downavg, mem1 \ | |
__asm SSE3_MOVE reg2, mem2 \ | |
__asm SSE_RMOVE upavg, downavg \ | |
__asm psubusb downavg, bias \ | |
__asm pavgb upavg, reg2 \ | |
__asm pavgb downavg, reg2 | |
#ifdef MODIFYPLUGIN | |
#define SmartAvgsW(upavg, downavg, wmem, mem1, mem2, bias, reg2)\ | |
SmartAvgs(upavg, downavg, mem1, mem2, bias, reg2) | |
#else | |
#define SmartAvgsW(upavg, downavg, wmem, mem1, mem2, bias, reg2, mwrite)\ | |
__asm SSE3_MOVE downavg, mem1 \ | |
__asm SSE3_MOVE reg2, mem2 \ | |
__asm SSE_RMOVE upavg, downavg \ | |
__asm psubusb downavg, bias \ | |
__asm pavgb upavg, reg2 \ | |
__asm pavgb downavg, reg2 \ | |
__asm mwrite wmem, reg2 | |
#endif | |
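// SmartAvgRGs (mode 21) clamps each pixel between the smallest down-rounded | |
// and the largest up-rounded average of the four opposing neighbour pairs. | |
// A scalar sketch of the per-pixel logic, not compiled; it assumes | |
// fconvolution_bias is a vector of 1s, and the names are illustrative only: | |
#if 0 | |
static inline BYTE SmartAvgRGs_pixel(const BYTE n[3][3]) | |
{ | |
	// opposing pairs: diagonals, verticals, horizontals of the 3x3 window | |
	static const int p[4][4] = { {0,0, 2,2}, {0,2, 2,0}, {0,1, 2,1}, {1,0, 1,2} }; | |
	int lo = 255, hi = 0; | |
	for (int i = 0; i < 4; ++i) | |
	{ | |
		int a = n[p[i][0]][p[i][1]], b = n[p[i][2]][p[i][3]]; | |
		int up   = (a + b + 1) >> 1;                 // pavgb: rounds up | |
		int down = ((a ? a - 1 : 0) + b + 1) >> 1;   // psubusb 1, then pavgb: rounds down | |
		if (up > hi) hi = up; | |
		if (down < lo) lo = down; | |
	} | |
	int c = n[1][1]; | |
	return (BYTE)(c < lo ? lo : c > hi ? hi : c);    // clip the centre into [lo, hi] | |
} | |
#endif | |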
#ifdef SHARPEN | |
void SmartAvgRGs(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength) | |
#else | |
void SmartAvgRGs(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
#endif | |
{ | |
#ifdef SHARPEN | |
// SSE5 is also used for a different purpose when MODIFYPLUGIN is defined | |
__asm mov ebx, strength | |
__asm SSE_RMOVE SSE4, rshift[ebx] | |
__asm SSE_RMOVE SSE5, shift_mask[ebx] | |
#endif | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
__asm SSE_RMOVE SSE7, fconvolution_bias | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
__asm align 16 | |
__asm column_loop: | |
SmartAvgs(SSE0, SSE1, [esi], [esi + 2*ebx + 2], SSE7, SSE6) | |
SmartAvgs(SSE2, SSE3, [esi + 2], [esi + 2*ebx], SSE7, SSE6) | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE1, SSE3 | |
SmartAvgs(SSE2, SSE3, [esi + 1], [esi + 2*ebx + 1], SSE7, SSE6) | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE1, SSE3 | |
SmartAvgsW(SSE2, SSE3, [edi], [esi + ebx + 2], [esi + ebx], SSE7, SSE6, movd) | |
__asm pmaxub SSE0, SSE2 | |
#if (ISSE > 1) || defined(SHLUR) | |
__asm SSE3_MOVE SSE6, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE1, SSE3 | |
#ifdef SHLUR | |
sharpen(SSE6, SSE1, SSE0, SSE4, SSE5, SSE3, SSE2) | |
__asm SSE_MOVE [edi + 1], SSE6 | |
#else | |
#if ISSE > 1 | |
__asm pminub SSE0, SSE6 | |
#else | |
__asm pminub SSE0, [esi + ebx + 1] | |
#endif | |
__asm pmaxub SSE0, SSE1 | |
__asm SSE_MOVE [edi + 1], SSE0 | |
#endif // SHLUR | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
SmartAvgs(SSE0, SSE1, [esi], [esi + 2*ebx + 2], SSE7, SSE6) | |
SmartAvgs(SSE2, SSE3, [esi + 2], [esi + 2*ebx], SSE7, SSE6) | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE1, SSE3 | |
SmartAvgs(SSE2, SSE3, [esi + 1], [esi + 2*ebx + 1], SSE7, SSE6) | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE1, SSE3 | |
SmartAvgs(SSE2, SSE3, [esi + ebx], [esi + ebx + 2], SSE7, SSE6) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
__asm pmaxub SSE0, SSE2 | |
#if (ISSE > 1) || defined(SHLUR) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE6, [edi] | |
#else | |
__asm SSE3_MOVE SSE6, [esi + ebx + 1] | |
#endif | |
#endif | |
__asm pminub SSE1, SSE3 | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE0, SSE5 | |
__asm pminub SSE1, SSE5 | |
#endif | |
#ifdef SHLUR | |
sharpen(SSE6, SSE1, SSE0, SSE4, SSE5, SSE3, SSE2) | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE6 | |
#else | |
#if ISSE > 1 | |
__asm pminub SSE0, SSE6 | |
#else | |
__asm pminub SSE0, [esi + ebx + 1] | |
#endif | |
__asm pmaxub SSE0, SSE1 | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE0 | |
#endif // SHLUR | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
SmartAvgs(SSE0, SSE1, [esi], [esi + 2*ebx + 2], SSE7, SSE6) | |
SmartAvgs(SSE2, SSE3, [esi + 2], [esi + 2*ebx], SSE7, SSE6) | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE1, SSE3 | |
SmartAvgs(SSE2, SSE3, [esi + 1], [esi + 2*ebx + 1], SSE7, SSE6) | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE1, SSE3 | |
SmartAvgsW(SSE2, SSE3, [edi + 1], [esi + ebx], [esi + ebx + 2], SSE7, SSE6, SSE_MOVE) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
__asm pmaxub SSE0, SSE2 | |
#if (ISSE > 1) || defined(SHLUR) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE6, [edi] | |
#else | |
__asm SSE3_MOVE SSE6, [esi + ebx + 1] | |
#endif | |
#endif | |
__asm pminub SSE1, SSE3 | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE0, SSE5 | |
__asm pminub SSE1, SSE5 | |
#endif | |
#ifdef SHLUR | |
sharpen(SSE6, SSE1, SSE0, SSE4, SSE5, SSE3, SSE2) | |
__asm add esi, eax | |
__asm SSE_MOVE [edi], SSE6 | |
#else | |
#if ISSE > 1 | |
__asm pminub SSE0, SSE6 | |
#else | |
__asm pminub SSE0, [esi + ebx + 1] | |
#endif | |
__asm pmaxub SSE0, SSE1 | |
__asm add esi, eax | |
__asm SSE_MOVE [edi], SSE0 | |
#endif // SHLUR | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
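// SmartAvgf is the fast variant used by SmartAvgRGf (mode 22): it computes | |
// the pavgb (round-up) average of two neighbour pairs at once, without the | |
// second, down-rounded average that SmartAvgs maintains. | |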
#if ISSE > 1 | |
#define SmartAvgf(avg1, avg2, mem1, mem2, mem3, mem4, reg1, reg2)\ | |
__asm SSE3_MOVE avg1, mem1 \ | |
__asm SSE3_MOVE avg2, mem3 \ | |
__asm SSE3_MOVE reg1, mem2 \ | |
__asm SSE3_MOVE reg2, mem4 \ | |
__asm pavgb avg1, reg1 \ | |
__asm pavgb avg2, reg2 | |
#else | |
#define SmartAvgf(avg1, avg2, mem1, mem2, mem3, mem4, reg1, reg2)\ | |
__asm SSE3_MOVE avg1, mem1 \ | |
__asm SSE3_MOVE avg2, mem3 \ | |
__asm pavgb avg1, mem2 \ | |
__asm pavgb avg2, mem4 | |
#endif | |
#ifdef MODIFYPLUGIN | |
#define SmartAvgfW(avg1, avg2, wmem, mem1, mem2, mem3, mem4, reg1, reg2)\ | |
SmartAvgf(avg1, avg2, mem1, mem2, mem3, mem4, reg1, reg2) | |
#else | |
#if ISSE > 1 | |
#define SmartAvgfW(avg1, avg2, wmem, mem1, mem2, mem3, mem4, reg1, reg2, mwrite)\ | |
__asm SSE3_MOVE avg1, mem1 \ | |
__asm SSE3_MOVE avg2, mem3 \ | |
__asm SSE3_MOVE reg1, mem2 \ | |
__asm SSE3_MOVE reg2, mem4 \ | |
__asm pavgb avg1, reg1 \ | |
__asm pavgb avg2, reg2 \ | |
__asm mwrite wmem, reg2 | |
#else | |
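// note: the memory parameters are apparently named in reverse order here on | |
// purpose, so the pixel written to wmem is the same one the ISSE > 1 variant | |
// writes; the two averages merely swap registers, and the callers feed both | |
// into the min and max chains symmetrically, so the result is unchanged. | |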
#define SmartAvgfW(avg1, avg2, wmem, mem4, mem3, mem2, mem1, reg1, reg2, mwrite)\ | |
__asm SSE3_MOVE avg1, mem1 \ | |
__asm SSE3_MOVE avg2, mem3 \ | |
__asm mwrite wmem, avg1 \ | |
__asm pavgb avg2, mem4 \ | |
__asm pavgb avg1, mem2 | |
#endif | |
#endif | |
#ifdef SHARPEN | |
void SmartAvgRGf(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength) | |
#else | |
void SmartAvgRGf(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
#endif | |
{ | |
#ifdef SHARPEN | |
__asm mov ebx, strength | |
__asm SSE_RMOVE SSE4, rshift[ebx] | |
__asm SSE_RMOVE SSE7, shift_mask[ebx] | |
#endif | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
__asm align 16 | |
__asm column_loop: | |
SmartAvgf(SSE0, SSE3, [esi], [esi + 2*ebx + 2], [esi + 2], [esi + 2*ebx], SSE5, SSE6) | |
__asm SSE_RMOVE SSE1, SSE0 | |
__asm pmaxub SSE0, SSE3 | |
__asm pminub SSE1, SSE3 | |
SmartAvgfW(SSE2, SSE3, [edi], [esi + 1], [esi + 2*ebx + 1], [esi + ebx + 2], [esi + ebx], SSE5, SSE6, movd) | |
__asm pmaxub SSE0, SSE2 | |
__asm pminub SSE1, SSE3 | |
#if (ISSE > 1) || defined(SHLUR) | |
__asm SSE3_MOVE SSE6, [esi + ebx + 1] | |
#endif | |
__asm pmaxub SSE0, SSE3 | |
__asm pminub SSE1, SSE2 | |
#ifdef SHLUR | |
sharpen(SSE6, SSE1, SSE0, SSE4, SSE7, SSE2, SSE3) | |
__asm SSE_MOVE [edi + 1], SSE6 | |
#else | |
#if ISSE > 1 | |
__asm pminub SSE0, SSE6 | |
#else | |
__asm pminub SSE0, [esi + ebx + 1] | |
#endif | |
__asm pmaxub SSE0, SSE1 | |
__asm SSE_MOVE [edi + 1], SSE0 | |
#endif // SHLUR | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
SmartAvgf(SSE0, SSE3, [esi], [esi + 2*ebx + 2], [esi + 2], [esi + 2*ebx], SSE5, SSE6) | |
__asm SSE_RMOVE SSE1, SSE0 | |
__asm pmaxub SSE0, SSE3 | |
__asm pminub SSE1, SSE3 | |
SmartAvgf(SSE2, SSE3, [esi + 1], [esi + 2*ebx + 1], [esi + ebx], [esi + ebx + 2], SSE5, SSE6) | |
__asm pmaxub SSE0, SSE2 | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE1, SSE3 | |
#if (ISSE > 1) || defined(SHLUR) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE6, [edi] | |
#else | |
__asm SSE3_MOVE SSE6, [esi + ebx + 1] | |
#endif | |
#endif | |
__asm pmaxub SSE0, SSE3 | |
__asm pminub SSE1, SSE2 | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE0, SSE5 | |
__asm pminub SSE1, SSE5 | |
#endif | |
#ifdef SHLUR | |
sharpen(SSE6, SSE1, SSE0, SSE4, SSE7, SSE2, SSE3) | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE6 | |
#else | |
#if ISSE > 1 | |
__asm pminub SSE0, SSE6 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pminub SSE0, [edi] | |
#else | |
__asm pminub SSE0, [esi + ebx + 1] | |
#endif | |
#endif // ISSE > 1 | |
__asm pmaxub SSE0, SSE1 | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE0 | |
#endif // SHLUR | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
SmartAvgf(SSE0, SSE3, [esi], [esi + 2*ebx + 2], [esi + 2], [esi + 2*ebx], SSE5, SSE6) | |
__asm SSE_RMOVE SSE1, SSE0 | |
__asm pmaxub SSE0, SSE3 | |
__asm pminub SSE1, SSE3 | |
SmartAvgfW(SSE2, SSE3, [edi + 1], [esi + 1], [esi + 2*ebx + 1], [esi + ebx], [esi + ebx + 2], SSE5, SSE6, SSE_MOVE) | |
__asm pmaxub SSE0, SSE2 | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE1, SSE3 | |
#if (ISSE > 1) || defined(SHLUR) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE6, [edi] | |
#else | |
__asm SSE3_MOVE SSE6, [esi + ebx + 1] | |
#endif | |
#endif | |
__asm pmaxub SSE0, SSE3 | |
__asm pminub SSE1, SSE2 | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE0, SSE5 | |
__asm pminub SSE1, SSE5 | |
#endif | |
#ifdef SHLUR | |
sharpen(SSE6, SSE1, SSE0, SSE4, SSE7, SSE2, SSE3) | |
__asm add esi, eax | |
__asm SSE_MOVE [edi], SSE6 | |
#else | |
#if ISSE > 1 | |
__asm pminub SSE0, SSE6 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pminub SSE0, [edi] | |
#else | |
__asm pminub SSE0, [esi + ebx + 1] | |
#endif | |
#endif | |
__asm pmaxub SSE0, SSE1 | |
__asm add esi, eax | |
__asm SSE_MOVE [edi], SSE0 | |
#endif // SHLUR | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
#endif // #ifndef MODIFYPLUGIN | |
#ifdef MODIFYPLUGIN | |
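// Repair modes 12-14 apparently clip the current pixel to the interval between | |
// the Nth-lowest and Nth-highest of the eight surrounding reference pixels | |
// (N = 2 for Repair12, 3 for Repair13, 4 for Repair14); in the Repair build | |
// the interval is first widened to include the reference centre pixel. | |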
void SSE_Repair12(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm SSE3_MOVE SSE7, [esi + 1] | |
add2(SSE0, SSE1, SSE7) | |
__asm SSE3_MOVE SSE6, [esi + 2] | |
__asm SSE3_MOVE SSE7, [esi + ebx] | |
add3(SSE0, SSE1, SSE2, SSE6) | |
__asm movd [edi], SSE7 | |
__asm SSE3_MOVE SSE6, [esi + ebx + 2] | |
add4(SSE0, SSE1, SSE2, SSE3, SSE7) | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx] | |
minmax2(SSE0, SSE1, SSE2, SSE3, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1] | |
minmax2(SSE0, SSE1, SSE2, SSE3, SSE5) | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2] | |
minmax2(SSE0, SSE1, SSE2, SSE3, SSE7) | |
#if ISSE > 1 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
minmax2sub(SSE0, SSE1, SSE2, SSE3, SSE6) | |
#if ISSE > 1 | |
__asm pmaxub SSE2, SSE5 | |
#else | |
__asm pmaxub SSE2, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE1, SSE2 | |
__asm SSE_MOVE [edi + 1], SSE1 | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm SSE3_MOVE SSE7, [esi + 1] | |
add2(SSE0, SSE1, SSE7) | |
__asm SSE3_MOVE SSE6, [esi + 2] | |
__asm SSE3_MOVE SSE7, [esi + ebx] | |
add3(SSE0, SSE1, SSE2, SSE6) | |
__asm SSE3_MOVE SSE6, [esi + ebx + 2] | |
add4(SSE0, SSE1, SSE2, SSE3, SSE7) | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx] | |
minmax2(SSE0, SSE1, SSE2, SSE3, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1] | |
minmax2(SSE0, SSE1, SSE2, SSE3, SSE5) | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2] | |
minmax2(SSE0, SSE1, SSE2, SSE3, SSE7) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE4, [esi + ebx + 1] | |
#endif | |
#if ISSE > 1 | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE5, [edi] | |
#else | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
#endif | |
minmax2sub(SSE0, SSE1, SSE2, SSE3, SSE6) | |
#ifdef MODIFYPLUGIN | |
__asm pminub SSE2, SSE4 | |
__asm pmaxub SSE1, SSE4 | |
#endif | |
#if ISSE > 1 | |
__asm pmaxub SSE2, SSE5 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE2, [edi] | |
#else | |
__asm pmaxub SSE2, [esi + ebx + 1] | |
#endif | |
#endif // ISSE > 1 | |
__asm pminub SSE1, SSE2 | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE1 | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm SSE3_MOVE SSE7, [esi + 1] | |
add2(SSE0, SSE1, SSE7) | |
__asm SSE3_MOVE SSE6, [esi + 2] | |
__asm SSE3_MOVE SSE7, [esi + ebx] | |
add3(SSE0, SSE1, SSE2, SSE6) | |
__asm SSE3_MOVE SSE6, [esi + ebx + 2] | |
add4(SSE0, SSE1, SSE2, SSE3, SSE7) | |
#ifndef MODIFYPLUGIN | |
__asm SSE_MOVE [edi + 1], SSE6 | |
#endif | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx] | |
minmax2(SSE0, SSE1, SSE2, SSE3, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1] | |
minmax2(SSE0, SSE1, SSE2, SSE3, SSE5) | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2] | |
minmax2(SSE0, SSE1, SSE2, SSE3, SSE7) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE4, [esi + ebx + 1] | |
#endif | |
#if ISSE > 1 | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE5, [edi] | |
#else | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
#endif | |
minmax2sub(SSE0, SSE1, SSE2, SSE3, SSE6) | |
#ifdef MODIFYPLUGIN | |
__asm pminub SSE2, SSE4 | |
__asm pmaxub SSE1, SSE4 | |
#endif | |
#if ISSE > 1 | |
__asm pmaxub SSE2, SSE5 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE2, [edi] | |
#else | |
__asm pmaxub SSE2, [esi + ebx + 1] | |
#endif | |
#endif // ISSE > 1 | |
__asm pminub SSE1, SSE2 | |
__asm add esi, eax | |
__asm SSE_MOVE [edi], SSE1 | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
void SSE_Repair13(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm SSE3_MOVE SSE7, [esi + 1] | |
add2(SSE0, SSE1, SSE7) | |
__asm SSE3_MOVE SSE6, [esi + 2] | |
__asm SSE3_MOVE SSE5, [esi + ebx] | |
add3(SSE0, SSE1, SSE2, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + ebx + 2] | |
add4(SSE0, SSE1, SSE2, SSE3, SSE5) | |
__asm movd [edi], SSE5 | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx] | |
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1] | |
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2] | |
minmax3sub(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7) | |
#if ISSE > 1 | |
__asm SSE3_MOVE SSE7, [esi + ebx + 1] | |
#endif | |
minmax2sub(SSE1, SSE2, SSE3, SSE4, SSE6) | |
#if ISSE > 1 | |
__asm pmaxub SSE3, SSE7 | |
#else | |
__asm pmaxub SSE3, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE3, SSE2 | |
__asm SSE_MOVE [edi + 1], SSE3 | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm SSE3_MOVE SSE7, [esi + 1] | |
add2(SSE0, SSE1, SSE7) | |
__asm SSE3_MOVE SSE6, [esi + 2] | |
__asm SSE3_MOVE SSE5, [esi + ebx] | |
add3(SSE0, SSE1, SSE2, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + ebx + 2] | |
add4(SSE0, SSE1, SSE2, SSE3, SSE5) | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx] | |
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1] | |
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2] | |
minmax3sub(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
#if ISSE > 1 | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE7, [edi] | |
#else | |
__asm SSE3_MOVE SSE7, [esi + ebx + 1] | |
#endif | |
#endif | |
minmax2sub(SSE1, SSE2, SSE3, SSE4, SSE6) | |
#ifdef MODIFYPLUGIN | |
__asm pminub SSE3, SSE5 | |
__asm pmaxub SSE2, SSE5 | |
#endif | |
#if ISSE > 1 | |
__asm pmaxub SSE3, SSE7 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE3, [edi] | |
#else | |
__asm pmaxub SSE3, [esi + ebx + 1] | |
#endif | |
#endif // ISSE > 1 | |
__asm pminub SSE3, SSE2 | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_MOVE [edi], SSE3 | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm SSE3_MOVE SSE7, [esi + 1] | |
add2(SSE0, SSE1, SSE7) | |
__asm SSE3_MOVE SSE6, [esi + 2] | |
__asm SSE3_MOVE SSE5, [esi + ebx] | |
add3(SSE0, SSE1, SSE2, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + ebx + 2] | |
add4(SSE0, SSE1, SSE2, SSE3, SSE5) | |
#ifndef MODIFYPLUGIN | |
__asm SSE_MOVE [edi + 1], SSE7 | |
#endif | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx] | |
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 1] | |
add6(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx + 2] | |
minmax3sub(SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE7) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
#if ISSE > 1 | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE7, [edi] | |
#else | |
__asm SSE3_MOVE SSE7, [esi + ebx + 1] | |
#endif | |
#endif // ISSE > 1 | |
minmax2sub(SSE1, SSE2, SSE3, SSE4, SSE6) | |
#ifdef MODIFYPLUGIN | |
__asm pminub SSE3, SSE5 | |
__asm pmaxub SSE2, SSE5 | |
#endif | |
#if ISSE > 1 | |
__asm pmaxub SSE3, SSE7 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE3, [edi] | |
#else | |
__asm pmaxub SSE3, [esi + ebx + 1] | |
#endif | |
#endif // ISSE > 1 | |
__asm pminub SSE3, SSE2 | |
__asm add esi, eax | |
__asm SSE_MOVE [edi], SSE3 | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
void SSE_Repair14(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm SSE3_MOVE SSE7, [esi + 1] | |
add2(SSE0, SSE1, SSE7) | |
__asm SSE3_MOVE SSE6, [esi + 2] | |
__asm SSE3_MOVE SSE5, [esi + ebx] | |
add3(SSE0, SSE1, SSE2, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + ebx + 2] | |
add4(SSE0, SSE1, SSE2, SSE3, SSE5) | |
__asm movd [edi], SSE5 | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx] | |
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7) | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1] | |
sub5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2] | |
sub4(SSE1, SSE2, SSE3, SSE4, SSE5) | |
#if ISSE > 1 | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
sub3(SSE2, SSE3, SSE4, SSE7) | |
#if ISSE > 1 | |
__asm pmaxub SSE4, SSE5 | |
#else | |
__asm pmaxub SSE4, [esi + ebx + 1] | |
#endif | |
__asm pminub SSE3, SSE4 | |
__asm SSE_MOVE [edi + 1], SSE3 | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm SSE3_MOVE SSE7, [esi + 1] | |
add2(SSE0, SSE1, SSE7) | |
__asm SSE3_MOVE SSE6, [esi + 2] | |
__asm SSE3_MOVE SSE5, [esi + ebx] | |
add3(SSE0, SSE1, SSE2, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + ebx + 2] | |
add4(SSE0, SSE1, SSE2, SSE3, SSE5) | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx] | |
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7) | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1] | |
sub5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2] | |
sub4(SSE1, SSE2, SSE3, SSE4, SSE5) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
#endif // MODIFYPLUGIN | |
#if ISSE > 1 | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE5, [edi] | |
#else | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
#endif | |
sub3(SSE2, SSE3, SSE4, SSE7) | |
#ifdef MODIFYPLUGIN | |
__asm pminub SSE4, SSE0 | |
__asm pmaxub SSE3, SSE0 | |
#endif // MODIFYPLUGIN | |
#if ISSE > 1 | |
__asm pmaxub SSE4, SSE5 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE4, [edi] | |
#else // MODIFYPLUGIN | |
__asm pmaxub SSE4, [esi + ebx + 1] | |
#endif | |
#endif // ISSE > 1 | |
__asm add esi, SSE_INCREMENT | |
__asm pminub SSE3, SSE4 | |
__asm SSE_MOVE [edi], SSE3 | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
__asm SSE3_MOVE SSE0, [esi] | |
__asm SSE3_MOVE SSE7, [esi + 1] | |
add2(SSE0, SSE1, SSE7) | |
__asm SSE3_MOVE SSE6, [esi + 2] | |
__asm SSE3_MOVE SSE5, [esi + ebx] | |
add3(SSE0, SSE1, SSE2, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + ebx + 2] | |
add4(SSE0, SSE1, SSE2, SSE3, SSE5) | |
#ifndef MODIFYPLUGIN | |
__asm SSE_MOVE [edi + 1], SSE7 | |
#endif | |
__asm SSE3_MOVE SSE6, [esi + 2*ebx] | |
add5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE7) | |
__asm SSE3_MOVE SSE5, [esi + 2*ebx + 1] | |
sub5(SSE0, SSE1, SSE2, SSE3, SSE4, SSE6) | |
__asm SSE3_MOVE SSE7, [esi + 2*ebx + 2] | |
sub4(SSE1, SSE2, SSE3, SSE4, SSE5) | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
#endif | |
#if ISSE > 1 | |
#ifdef MODIFYPLUGIN | |
__asm SSE3_MOVE SSE5, [edi] | |
#else | |
__asm SSE3_MOVE SSE5, [esi + ebx + 1] | |
#endif | |
#endif | |
sub3(SSE2, SSE3, SSE4, SSE7) | |
#ifdef MODIFYPLUGIN | |
__asm pminub SSE4, SSE0 | |
__asm pmaxub SSE3, SSE0 | |
#endif | |
#if ISSE > 1 | |
__asm pmaxub SSE4, SSE5 | |
#else | |
#ifdef MODIFYPLUGIN | |
__asm pmaxub SSE4, [edi] | |
#else // MODIFYPLUGIN | |
__asm pmaxub SSE4, [esi + ebx + 1] | |
#endif | |
#endif // ISSE > 1 | |
__asm pminub SSE3, SSE4 | |
__asm add esi, eax | |
__asm SSE_MOVE [edi], SSE3 | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
#endif // MODIFYPLUGIN | |
#if defined(MODIFYPLUGIN) || defined(SHLUR) | |
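// diagweightr5 loads an opposing neighbour pair, forms its min/max, and sets | |
// weight = max(center - max, min - center) with saturated arithmetic, i.e. | |
// how far the centre lies outside the pair's range (0 if it is inside). | |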
#define diagweightr5(min, max, weight, center, bound1, bound2, reg) \ | |
__asm SSE3_MOVE min, bound1 \ | |
__asm SSE3_MOVE reg, bound2 \ | |
__asm SSE_RMOVE max, min \ | |
__asm SSE_RMOVE weight, center \ | |
__asm pminub min, reg \ | |
__asm pmaxub max, reg \ | |
__asm SSE_RMOVE reg, min \ | |
__asm psubusb weight, max \ | |
__asm psubusb reg, center \ | |
__asm pmaxub weight, reg | |
#if defined(MODIFYPLUGIN) | |
#define diagweightwr5(min, max, weight, center, bound1, bound2, wmem, reg) diagweightr5(min, max, weight, center, bound1, bound2, reg) | |
#else | |
// same as diagweightr5, but in addition bound2 is written to wmem | |
#define diagweightwr5(min, max, weight, center, bound1, bound2, wmem, reg) \ | |
__asm SSE3_MOVE min, bound1 \ | |
__asm SSE3_MOVE reg, bound2 \ | |
__asm SSE_RMOVE max, min \ | |
__asm SSE_MOVE wmem, reg \ | |
__asm SSE_RMOVE weight, center \ | |
__asm pminub min, reg \ | |
__asm pmaxub max, reg \ | |
__asm SSE_RMOVE reg, min \ | |
__asm psubusb weight, max \ | |
__asm psubusb reg, center \ | |
__asm pmaxub weight, reg | |
#endif // MODIFYPLUGIN | |
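// SSE_Repair15 evaluates all four opposing pairs and, assuming merge2weighted | |
// keeps the (min, max) of the pair with the smaller weight, clips the centre | |
// to the range of the pair it violates least. A scalar sketch, not compiled; | |
// the names and the tie-breaking are illustrative only: | |
#if 0 | |
static inline BYTE Repair15_pixel(const BYTE n[3][3]) | |
{ | |
	static const int p[4][4] = { {0,0, 2,2}, {2,0, 0,2}, {2,1, 0,1}, {1,2, 1,0} }; | |
	int c = n[1][1], best_w = 256, best_lo = 0, best_hi = 255; | |
	for (int i = 0; i < 4; ++i) | |
	{ | |
		int a = n[p[i][0]][p[i][1]], b = n[p[i][2]][p[i][3]]; | |
		int lo = a < b ? a : b, hi = a < b ? b : a; | |
		int w  = c > hi ? c - hi : lo > c ? lo - c : 0;   // distance outside [lo, hi] | |
		if (w < best_w) { best_w = w; best_lo = lo; best_hi = hi; } | |
	} | |
	return (BYTE)(c < best_lo ? best_lo : c > best_hi ? best_hi : c); | |
} | |
#endif | |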
#ifdef SHARPEN | |
void SSE_Repair15(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength) | |
#else | |
void SSE_Repair15(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
#endif | |
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
#ifdef SHARPEN | |
__asm mov spitch, eax | |
__asm mov eax, strength | |
#endif | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
diagweightr5(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7) | |
diagweightr5(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightr5(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightwr5(SSE4, SSE5, SSE6, SSE0, [esi + ebx + 2], [esi + ebx], [edi], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
#ifdef SHLUR | |
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
__asm SSE_MOVE [edi + 1], SSE0 | |
#else | |
__asm pmaxub SSE1, SSE0 | |
__asm pminub SSE1, SSE2 | |
__asm SSE_MOVE [edi + 1], SSE1 | |
#endif | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
diagweightr5(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7) | |
diagweightr5(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightr5(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightr5(SSE4, SSE5, SSE6, SSE0, [esi + ebx + 2], [esi + ebx], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
__asm add esi, SSE_INCREMENT | |
#ifdef MODIFYPLUGIN | |
#if ISSE > 1 | |
__asm SSE3_MOVE SSE7, [edi] | |
#endif | |
#if MODIFYPLUGIN > 0 | |
__asm pminub SSE1, SSE0 | |
__asm pmaxub SSE2, SSE0 | |
#endif | |
#if ISSE > 1 | |
__asm pmaxub SSE1, SSE7 | |
#else | |
__asm pmaxub SSE1, [edi] | |
#endif | |
__asm pminub SSE1, SSE2 | |
__asm SSE_MOVE [edi], SSE1 | |
#else // MODIFYPLUGIN | |
#ifdef SHLUR | |
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
__asm SSE_MOVE [edi], SSE0 | |
#else | |
__asm pmaxub SSE1, SSE0 | |
__asm pminub SSE1, SSE2 | |
__asm SSE_MOVE [edi], SSE1 | |
#endif | |
#endif // MODIFYPLUGIN | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
diagweightr5(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7) | |
diagweightr5(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightr5(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightwr5(SSE4, SSE5, SSE6, SSE0, [esi + ebx], [esi + ebx + 2], [edi + 1], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
#ifdef MODIFYPLUGIN | |
#if ISSE > 1 | |
__asm SSE3_MOVE SSE7, [edi] | |
#endif | |
#if MODIFYPLUGIN > 0 | |
__asm pminub SSE1, SSE0 | |
__asm pmaxub SSE2, SSE0 | |
#endif | |
#if ISSE > 1 | |
__asm pmaxub SSE1, SSE7 | |
#else | |
__asm pmaxub SSE1, [edi] | |
#endif | |
__asm pminub SSE1, SSE2 | |
__asm SSE_MOVE [edi], SSE1 | |
#else // MODIFYPLUGIN | |
#ifdef SHLUR | |
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
__asm SSE_MOVE [edi], SSE0 | |
#else | |
__asm pmaxub SSE1, SSE0 | |
__asm pminub SSE1, SSE2 | |
__asm SSE_MOVE [edi], SSE1 | |
#endif | |
#endif // MODIFYPLUGIN | |
#ifdef SHARPEN | |
__asm add esi, spitch | |
#else | |
__asm add esi, eax | |
#endif | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
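// diagweightr6 extends the r5 weight: weight = 2*(clip distance) + (max - min), | |
// saturated, so pairs that are both violated and widely spread are penalised. | |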
#define diagweightr6(min, max, weight, center, bound1, bound2, reg) \ | |
__asm SSE3_MOVE min, bound1 \ | |
__asm SSE3_MOVE reg, bound2 \ | |
__asm SSE_RMOVE max, min \ | |
__asm SSE_RMOVE weight, center \ | |
__asm pminub min, reg \ | |
__asm pmaxub max, reg \ | |
__asm SSE_RMOVE reg, min \ | |
__asm psubusb weight, max \ | |
__asm psubusb reg, center \ | |
__asm pmaxub weight, reg \ | |
__asm SSE_RMOVE reg, max \ | |
__asm paddusb weight, weight \ | |
__asm psubusb reg, min \ | |
__asm paddusb weight, reg | |
#ifdef MODIFYPLUGIN | |
#define diagweightwr6(min, max, weight, center, bound1, bound2, wmem, reg) diagweightr6(min, max, weight, center, bound1, bound2, reg) | |
#else | |
// same as diagweightr6, but in addition bound2 is written to wmem | |
#define diagweightwr6(min, max, weight, center, bound1, bound2, wmem, reg) \ | |
__asm SSE3_MOVE min, bound1 \ | |
__asm SSE3_MOVE reg, bound2 \ | |
__asm SSE_RMOVE max, min \ | |
__asm SSE_MOVE wmem, reg \ | |
__asm SSE_RMOVE weight, center \ | |
__asm pminub min, reg \ | |
__asm pmaxub max, reg \ | |
__asm SSE_RMOVE reg, min \ | |
__asm psubusb weight, max \ | |
__asm psubusb reg, center \ | |
__asm pmaxub weight, reg \ | |
__asm SSE_RMOVE reg, max \ | |
__asm paddusb weight, weight \ | |
__asm psubusb reg, min \ | |
__asm paddusb weight, reg | |
#endif // MODIFYPLUGIN | |
#ifdef SHARPEN | |
void SSE_Repair16(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength) | |
#else | |
void SSE_Repair16(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
#endif | |
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
#ifdef SHARPEN | |
__asm mov spitch, eax | |
__asm mov eax, strength | |
#endif | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
diagweightr6(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7) | |
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightwr6(SSE4, SSE5, SSE6, SSE0, [esi + ebx + 2], [esi + ebx], [edi], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
#ifdef SHLUR | |
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
__asm SSE_MOVE [edi + 1], SSE0 | |
#else | |
__asm pmaxub SSE1, SSE0 | |
__asm pminub SSE1, SSE2 | |
__asm SSE_MOVE [edi + 1], SSE1 | |
#endif | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
diagweightr6(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7) | |
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + ebx + 2], [esi + ebx], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
__asm add esi, SSE_INCREMENT | |
#ifdef MODIFYPLUGIN | |
#if ISSE > 1 | |
__asm SSE3_MOVE SSE7, [edi] | |
#endif | |
#if MODIFYPLUGIN > 0 | |
__asm pminub SSE1, SSE0 | |
__asm pmaxub SSE2, SSE0 | |
#endif | |
#if ISSE > 1 | |
__asm pmaxub SSE1, SSE7 | |
#else | |
__asm pmaxub SSE1, [edi] | |
#endif | |
__asm pminub SSE1, SSE2 | |
__asm SSE_MOVE [edi], SSE1 | |
#else // MODIFYPLUGIN | |
#ifdef SHLUR | |
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
__asm SSE_MOVE [edi], SSE0 | |
#else | |
__asm pmaxub SSE1, SSE0 | |
__asm pminub SSE1, SSE2 | |
__asm SSE_MOVE [edi], SSE1 | |
#endif | |
#endif // MODIFYPLUGIN | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
diagweightr6(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7) | |
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightwr6(SSE4, SSE5, SSE6, SSE0, [esi + ebx], [esi + ebx + 2], [edi + 1], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
#ifdef MODIFYPLUGIN | |
#if ISSE > 1 | |
__asm SSE3_MOVE SSE7, [edi] | |
#endif | |
#if MODIFYPLUGIN > 0 | |
__asm pminub SSE1, SSE0 | |
__asm pmaxub SSE2, SSE0 | |
#endif | |
#if ISSE > 1 | |
__asm pmaxub SSE1, SSE7 | |
#else | |
__asm pmaxub SSE1, [edi] | |
#endif | |
__asm pminub SSE1, SSE2 | |
__asm SSE_MOVE [edi], SSE1 | |
#else // MODIFYPLUGIN | |
#ifdef SHLUR | |
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
__asm SSE_MOVE [edi], SSE0 | |
#else | |
__asm pmaxub SSE1, SSE0 | |
__asm pminub SSE1, SSE2 | |
__asm SSE_MOVE [edi], SSE1 | |
#endif | |
#endif // MODIFYPLUGIN | |
#ifdef SHARPEN | |
__asm add esi, spitch | |
#else | |
__asm add esi, eax | |
#endif | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
#endif // defined(MODIFYPLUGIN) || defined(SHLUR) | |
#ifdef SHLUR | |
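// diagweightr7 weights a pair with (clip distance) + (max - min), saturated; | |
// the violation and the pair's spread count equally, unlike r6 above. | |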
#define diagweightr7(min, max, weight, center, bound1, bound2, reg) \ | |
__asm SSE3_MOVE min, bound1 \ | |
__asm SSE3_MOVE reg, bound2 \ | |
__asm SSE_RMOVE max, min \ | |
__asm SSE_RMOVE weight, center \ | |
__asm pminub min, reg \ | |
__asm pmaxub max, reg \ | |
__asm SSE_RMOVE reg, min \ | |
__asm psubusb weight, max \ | |
__asm psubusb reg, center \ | |
__asm pmaxub weight, reg \ | |
__asm SSE_RMOVE reg, max \ | |
__asm psubusb reg, min \ | |
__asm paddusb weight, reg | |
#ifdef MODIFYPLUGIN | |
#define diagweightwr7(min, max, weight, center, bound1, bound2, wmem, reg) diagweightr7(min, max, weight, center, bound1, bound2, reg) | |
#else | |
// same as diagweightr7, but in addition bound2 is written to wmem | |
#define diagweightwr7(min, max, weight, center, bound1, bound2, wmem, reg) \ | |
__asm SSE3_MOVE min, bound1 \ | |
__asm SSE3_MOVE reg, bound2 \ | |
__asm SSE_RMOVE max, min \ | |
__asm SSE_MOVE wmem, reg \ | |
__asm SSE_RMOVE weight, center \ | |
__asm pminub min, reg \ | |
__asm pmaxub max, reg \ | |
__asm SSE_RMOVE reg, min \ | |
__asm psubusb weight, max \ | |
__asm psubusb reg, center \ | |
__asm pmaxub weight, reg \ | |
__asm SSE_RMOVE reg, max \ | |
__asm psubusb reg, min \ | |
__asm paddusb weight, reg | |
#endif // MODIFYPLUGIN | |
#ifdef SHARPEN | |
void SSE_Repair17(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength) | |
#else | |
void SSE_Repair17(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
#endif | |
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
#ifdef SHARPEN | |
__asm mov spitch, eax | |
__asm mov eax, strength | |
#endif | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
diagweightr7(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7) | |
diagweightr7(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightr7(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightwr7(SSE4, SSE5, SSE6, SSE0, [esi + ebx + 2], [esi + ebx], [edi], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
#ifdef SHLUR | |
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
__asm SSE_MOVE [edi + 1], SSE0 | |
#else | |
__asm pmaxub SSE1, SSE0 | |
__asm pminub SSE1, SSE2 | |
__asm SSE_MOVE [edi + 1], SSE1 | |
#endif | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
diagweightr7(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7) | |
diagweightr7(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightr7(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightr7(SSE4, SSE5, SSE6, SSE0, [esi + ebx + 2], [esi + ebx], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
__asm add esi, SSE_INCREMENT | |
#ifdef MODIFYPLUGIN | |
#if ISSE > 1 | |
__asm SSE3_MOVE SSE7, [edi] | |
#endif | |
#if MODIFYPLUGIN > 0 | |
__asm pminub SSE1, SSE0 | |
__asm pmaxub SSE2, SSE0 | |
#endif | |
#if ISSE > 1 | |
__asm pmaxub SSE1, SSE7 | |
#else | |
__asm pmaxub SSE1, [edi] | |
#endif | |
__asm pminub SSE1, SSE2 | |
__asm SSE_MOVE [edi], SSE1 | |
#else // MODIFYPLUGIN | |
#ifdef SHLUR | |
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
__asm SSE_MOVE [edi], SSE0 | |
#else | |
__asm pmaxub SSE1, SSE0 | |
__asm pminub SSE1, SSE2 | |
__asm SSE_MOVE [edi], SSE1 | |
#endif | |
#endif // MODIFYPLUGIN | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
diagweightr7(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7) | |
diagweightr7(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightr7(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightwr7(SSE4, SSE5, SSE6, SSE0, [esi + ebx], [esi + ebx + 2], [edi + 1], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
#ifdef MODIFYPLUGIN | |
#if ISSE > 1 | |
__asm SSE3_MOVE SSE7, [edi] | |
#endif | |
#if MODIFYPLUGIN > 0 | |
__asm pminub SSE1, SSE0 | |
__asm pmaxub SSE2, SSE0 | |
#endif | |
#if ISSE > 1 | |
__asm pmaxub SSE1, SSE7 | |
#else | |
__asm pmaxub SSE1, [edi] | |
#endif | |
__asm pminub SSE1, SSE2 | |
__asm SSE_MOVE [edi], SSE1 | |
#else // MODIFYPLUGIN | |
#ifdef SHLUR | |
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
__asm SSE_MOVE [edi], SSE0 | |
#else | |
__asm pmaxub SSE1, SSE0 | |
__asm pminub SSE1, SSE2 | |
__asm SSE_MOVE [edi], SSE1 | |
#endif | |
#endif // MODIFYPLUGIN | |
#ifdef SHARPEN | |
__asm add esi, spitch | |
#else | |
__asm add esi, eax | |
#endif | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
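// diagweightr8 weights a pair with (clip distance) + 2*(max - min), saturated, | |
// favouring tight pairs even when the centre violates them slightly. | |
// note: SSE_Repair18a below reuses the diagweightr6 weighting, not this one. | |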
#define diagweightr8(min, max, weight, center, bound1, bound2, reg) \ | |
__asm SSE3_MOVE min, bound1 \ | |
__asm SSE3_MOVE reg, bound2 \ | |
__asm SSE_RMOVE max, min \ | |
__asm SSE_RMOVE weight, center \ | |
__asm pminub min, reg \ | |
__asm pmaxub max, reg \ | |
__asm SSE_RMOVE reg, min \ | |
__asm psubusb weight, max \ | |
__asm psubusb reg, center \ | |
__asm pmaxub weight, reg \ | |
__asm SSE_RMOVE reg, max \ | |
__asm psubusb reg, min \ | |
__asm paddusb weight, reg \ | |
__asm paddusb weight, reg | |
#ifdef MODIFYPLUGIN | |
#define diagweightwr8(min, max, weight, center, bound1, bound2, wmem, reg) diagweightr8(min, max, weight, center, bound1, bound2, reg) | |
#else | |
// same as diagweightr8, but in addition bound2 is written to wmem | |
#define diagweightwr8(min, max, weight, center, bound1, bound2, wmem, reg) \ | |
__asm SSE3_MOVE min, bound1 \ | |
__asm SSE3_MOVE reg, bound2 \ | |
__asm SSE_RMOVE max, min \ | |
__asm SSE_MOVE wmem, reg \ | |
__asm SSE_RMOVE weight, center \ | |
__asm pminub min, reg \ | |
__asm pmaxub max, reg \ | |
__asm SSE_RMOVE reg, min \ | |
__asm psubusb weight, max \ | |
__asm psubusb reg, center \ | |
__asm pmaxub weight, reg \ | |
__asm SSE_RMOVE reg, max \ | |
__asm psubusb reg, min \ | |
__asm paddusb weight, reg \ | |
__asm paddusb weight, reg | |
#endif // MODIFYPLUGIN | |
#ifdef SHARPEN | |
void SSE_Repair18a(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength) | |
#else | |
void SSE_Repair18a(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
#endif | |
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
#ifdef SHARPEN | |
__asm mov spitch, eax | |
__asm mov eax, strength | |
#endif | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
diagweightr6(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7) | |
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightwr6(SSE4, SSE5, SSE6, SSE0, [esi + ebx + 2], [esi + ebx], [edi], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
#ifdef SHLUR | |
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
__asm SSE_MOVE [edi + 1], SSE0 | |
#else | |
__asm pmaxub SSE1, SSE0 | |
__asm pminub SSE1, SSE2 | |
__asm SSE_MOVE [edi + 1], SSE1 | |
#endif | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
diagweightr6(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7) | |
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + ebx + 2], [esi + ebx], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
__asm add esi, SSE_INCREMENT | |
#ifdef MODIFYPLUGIN | |
#if ISSE > 1 | |
__asm SSE3_MOVE SSE7, [edi] | |
#endif | |
#if MODIFYPLUGIN > 0 | |
__asm pminub SSE1, SSE0 | |
__asm pmaxub SSE2, SSE0 | |
#endif | |
#if ISSE > 1 | |
__asm pmaxub SSE1, SSE7 | |
#else | |
__asm pmaxub SSE1, [edi] | |
#endif | |
__asm pminub SSE1, SSE2 | |
__asm SSE_MOVE [edi], SSE1 | |
#else // MODIFYPLUGIN | |
#ifdef SHLUR | |
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
__asm SSE_MOVE [edi], SSE0 | |
#else | |
__asm pmaxub SSE1, SSE0 | |
__asm pminub SSE1, SSE2 | |
__asm SSE_MOVE [edi], SSE1 | |
#endif | |
#endif // MODIFYPLUGIN | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
diagweightr6(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7) | |
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightr6(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diagweightwr6(SSE4, SSE5, SSE6, SSE0, [esi + ebx], [esi + ebx + 2], [edi + 1], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
#ifdef MODIFYPLUGIN | |
#if ISSE > 1 | |
__asm SSE3_MOVE SSE7, [edi] | |
#endif | |
#if MODIFYPLUGIN > 0 | |
__asm pminub SSE1, SSE0 | |
__asm pmaxub SSE2, SSE0 | |
#endif | |
#if ISSE > 1 | |
__asm pmaxub SSE1, SSE7 | |
#else | |
__asm pmaxub SSE1, [edi] | |
#endif | |
__asm pminub SSE1, SSE2 | |
__asm SSE_MOVE [edi], SSE1 | |
#else // MODIFYPLUGIN | |
#ifdef SHLUR | |
sharpen(SSE0, SSE1, SSE2, rshift[eax], shift_mask[eax], SSE7, SSE3) | |
__asm SSE_MOVE [edi], SSE0 | |
#else | |
__asm pmaxub SSE1, SSE0 | |
__asm pminub SSE1, SSE2 | |
__asm SSE_MOVE [edi], SSE1 | |
#endif | |
#endif // MODIFYPLUGIN | |
#ifdef SHARPEN | |
__asm add esi, spitch | |
#else | |
__asm add esi, eax | |
#endif | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
#endif // SHLUR | |
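// diag18 weights a pair by the larger of the two absolute differences between | |
// the centre and either pair member (built from saturated subtractions); | |
// SSE_Repair18 then clips the centre to the range of the pair with the | |
// smallest such weight, assuming merge2weighted keeps the smaller-weight pair. | |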
#define diag18(val1, val2, weight, center, bound1, bound2, reg) \ | |
__asm SSE3_MOVE val1, bound1 \ | |
__asm SSE_RMOVE weight, center \ | |
__asm SSE_RMOVE reg, val1 \ | |
__asm psubusb weight, val1 \ | |
__asm psubusb reg, center \ | |
__asm SSE3_MOVE val2, bound2 \ | |
__asm pmaxub weight, reg \ | |
__asm SSE_RMOVE reg, center \ | |
__asm psubusb reg, val2 \ | |
__asm pmaxub weight, reg \ | |
__asm SSE_RMOVE reg, val2 \ | |
__asm psubusb reg, center \ | |
__asm pmaxub weight, reg | |
#ifdef SHARPEN | |
void SSE_Repair18(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength) | |
#else | |
void SSE_Repair18(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
#endif | |
{ | |
__asm mov eax, hblocks | |
__asm mov ebx, spitch | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, eax | |
#endif | |
__asm mov edx, remainder | |
#if SSE_INCREMENT == 16 | |
__asm add eax, eax | |
#endif | |
__asm mov esi, _sp | |
#ifdef MODIFYPLUGIN | |
__asm lea eax, [eax * 8 + edx] | |
#else | |
__asm lea eax, [eax * 8 + edx + SSE_INCREMENT + 1] | |
#endif | |
__asm sub esi, ebx | |
__asm sub dpitch, eax | |
__asm neg eax | |
__asm mov edi, dp | |
#ifdef MODIFYPLUGIN | |
__asm inc edi | |
__asm lea eax, [ebx + eax] | |
#else | |
__asm lea eax, [ebx + eax + 1] | |
#ifdef SHARPEN | |
__asm mov spitch, eax | |
__asm mov eax, strength | |
#endif | |
__asm align 16 | |
__asm column_loop: | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
diag18(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7) | |
diag18(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diag18(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diag18(SSE4, SSE5, SSE6, SSE0, [esi + ebx + 2], [esi + ebx], SSE7) | |
__asm movd [edi], SSE5 | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
__asm SSE_RMOVE SSE7, SSE1 | |
__asm pminub SSE1, SSE2 | |
__asm pmaxub SSE7, SSE2 | |
#ifdef SHLUR | |
sharpen(SSE0, SSE1, SSE7, rshift[eax], shift_mask[eax], SSE3, SSE4) | |
__asm SSE_MOVE [edi + 1], SSE0 | |
#else | |
__asm pmaxub SSE1, SSE0 | |
__asm pminub SSE1, SSE7 | |
__asm SSE_MOVE [edi + 1], SSE1 | |
#endif | |
// now the pixels in the middle | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT + 1 | |
__asm mov ecx, hblocks | |
#endif // MODIFYPLUGIN | |
__asm align 16 | |
__asm middle_loop: | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
diag18(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7) | |
diag18(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diag18(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diag18(SSE4, SSE5, SSE6, SSE0, [esi + ebx + 2], [esi + ebx], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
__asm add esi, SSE_INCREMENT | |
__asm SSE_RMOVE SSE6, SSE1 | |
#ifdef MODIFYPLUGIN | |
#if ISSE > 1 | |
__asm SSE3_MOVE SSE7, [edi] | |
#endif | |
__asm pminub SSE1, SSE2 | |
__asm pmaxub SSE6, SSE2 | |
#if MODIFYPLUGIN > 0 | |
__asm pminub SSE1, SSE0 | |
__asm pmaxub SSE6, SSE0 | |
#endif | |
#if ISSE > 1 | |
__asm pmaxub SSE1, SSE7 | |
#else | |
__asm pmaxub SSE1, [edi] | |
#endif | |
__asm pminub SSE1, SSE6 | |
__asm SSE_MOVE [edi], SSE1 | |
#else // MODIFYPLUGIN | |
__asm pminub SSE1, SSE2 | |
__asm pmaxub SSE6, SSE2 | |
#ifdef SHLUR | |
sharpen(SSE0, SSE1, SSE6, rshift[eax], shift_mask[eax], SSE3, SSE4) | |
__asm SSE_MOVE [edi], SSE0 | |
#else | |
__asm pmaxub SSE1, SSE0 | |
__asm pminub SSE1, SSE6 | |
__asm SSE_MOVE [edi], SSE1 | |
#endif | |
#endif // MODIFYPLUGIN | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, edx | |
__asm add edi, edx | |
__asm SSE3_MOVE SSE0, [esi + ebx + 1] | |
diag18(SSE1, SSE2, SSE3, SSE0, [esi], [esi + 2*ebx + 2], SSE7) | |
diag18(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx], [esi + 2], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diag18(SSE4, SSE5, SSE6, SSE0, [esi + 2*ebx + 1], [esi + 1], SSE7) | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
diag18(SSE4, SSE5, SSE6, SSE0, [esi + ebx], [esi + ebx + 2], SSE7) | |
#ifndef MODIFYPLUGIN | |
__asm SSE_MOVE [edi + 1], SSE5 | |
#endif | |
merge2weighted(SSE1, SSE2, SSE3, SSE4, SSE5, SSE6) | |
__asm SSE_RMOVE SSE6, SSE1 | |
#ifdef MODIFYPLUGIN | |
#if ISSE > 1 | |
__asm SSE3_MOVE SSE7, [edi] | |
#endif | |
__asm pminub SSE1, SSE2 | |
__asm pmaxub SSE6, SSE2 | |
#if MODIFYPLUGIN > 0 | |
__asm pminub SSE1, SSE0 | |
__asm pmaxub SSE6, SSE0 | |
#endif | |
#if ISSE > 1 | |
__asm pmaxub SSE1, SSE7 | |
#else | |
__asm pmaxub SSE1, [edi] | |
#endif | |
__asm pminub SSE1, SSE6 | |
__asm SSE_MOVE [edi], SSE1 | |
#else // MODIFYPLUGIN | |
__asm pminub SSE1, SSE2 | |
__asm pmaxub SSE6, SSE2 | |
#ifdef SHLUR | |
sharpen(SSE0, SSE1, SSE6, rshift[eax], shift_mask[eax], SSE3, SSE4) | |
__asm SSE_MOVE [edi], SSE0 | |
#else | |
__asm pmaxub SSE1, SSE0 | |
__asm pminub SSE1, SSE6 | |
__asm SSE_MOVE [edi], SSE1 | |
#endif | |
#endif // MODIFYPLUGIN | |
#ifdef SHARPEN | |
__asm add esi, spitch | |
#else | |
__asm add esi, eax | |
#endif | |
__asm add edi, dpitch | |
__asm dec height | |
#ifdef MODIFYPLUGIN | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
#else | |
__asm jnz column_loop | |
#endif | |
} | |
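// mode dispatch table: cleaning_methods[mode] selects the routine for each | |
// RemoveGrain/Repair mode; modes unsupported in a given build fall back to | |
// copy_plane (or do_nothing in the Repair build). note: the BLUR table has | |
// fewer initializers than MAXMODE + 1, so its remaining slots are | |
// zero-initialized and are presumably rejected during argument checking. | |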
#ifdef SHARPEN | |
static void (*cleaning_methods[MAXMODE + 1])(BYTE *dp, int dpitch, const BYTE *sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength) | |
= { copy_plane, SSE_RemoveGrain1, SSE_RemoveGrain2, SSE_RemoveGrain3, SSE_RemoveGrain4, SSE_Repair15, SSE_Repair16, SSE_Repair17, SSE_Repair18a, diag9, copy_plane | |
, copy_plane, copy_plane, copy_plane, copy_plane, copy_plane, SmartRG, SSE_Repair18, copy_plane, copy_plane, copy_plane | |
, SmartAvgRGs, SmartAvgRGf | |
}; | |
#else | |
static void (*cleaning_methods[MAXMODE + 1])(BYTE *dp, int dpitch, const BYTE *sp, int spitch, int hblocks, int remainder, int incpitch, int height) | |
#ifdef MODIFYPLUGIN | |
= { do_nothing, SSE_RemoveGrain1, SSE_RemoveGrain2, SSE_RemoveGrain3, SSE_RemoveGrain4, diag5, diag6, diag7, diag8, diag9 | |
, SSE_RemoveGrain10, SSE_RemoveGrain1, SSE_Repair12, SSE_Repair13, SSE_Repair14, SSE_Repair15, SSE_Repair16, SmartRG, SSE_Repair18}; | |
#elif defined(BLUR) | |
= { copy_plane, SSE_RemoveGrain1, SSE_RemoveGrain2, SSE_RemoveGrain3, SSE_RemoveGrain4, copy_plane, copy_plane, copy_plane, copy_plane, diag9, copy_plane | |
, copy_plane, copy_plane, copy_plane, copy_plane, copy_plane, copy_plane, copy_plane, copy_plane, copy_plane, copy_plane | |
, SmartAvgRGs, SmartAvgRGf | |
}; | |
#else | |
= { copy_plane, SSE_RemoveGrain1, SSE_RemoveGrain2, SSE_RemoveGrain3, SSE_RemoveGrain4, diag5, diag6, diag7, diag8, diag9 | |
, SSE_RemoveGrain10, SSE_RemoveGrain11, SSE_RemoveGrain12, bob_top, bob_bottom, smartbob_top, smartbob_bottom, SmartRG, SSE_Repair18, SSE_RemoveGrain19, SSE_RemoveGrain20 | |
, SmartAvgRGs, SmartAvgRGf, SSE_RemoveGrain23, SSE_RemoveGrain24, nondestructivesharpen, SmartRGC, SmartRGCL, SmartRGCL2}; | |
#endif | |
#endif // SHARPEN | |
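// cleaning_methods maps the user visible mode number to the routine that
// processes one plane; index 0 is a plain copy (do_nothing in the Repair
// build), and negative modes are handled in the constructor below.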
class RemoveGrain : public GenericVideoFilter, public PlanarAccess | |
{ | |
#ifdef MODIFYPLUGIN | |
PClip oclip; | |
#endif | |
int height2[3], hblocks[3], remainder[3], incpitch[3]; | |
#ifdef SHARPEN | |
int strength[3]; | |
void (*cleanf[3])(BYTE *dp, int dpitch, const BYTE *sp, int spitch, int hblocks, int remainder, int incpitch, int height, int strength); | |
#else | |
void (*cleanf[3])(BYTE *dp, int dpitch, const BYTE *sp, int spitch, int hblocks, int remainder, int incpitch, int height); | |
#endif | |
private: | |
PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env) | |
{ | |
PVideoFrame sf = child->GetFrame(n, env); | |
#ifdef MODIFYPLUGIN | |
PVideoFrame of = oclip->GetFrame(n, env); | |
#endif | |
PVideoFrame df = env->NewVideoFrame(vi); | |
int i = planes; | |
do | |
{ | |
BYTE* dp = GetWritePtr(df, i); | |
int dpitch = GetPitch(df, i); | |
#ifdef MODIFYPLUGIN | |
int opitch = GetPitch(of, i); | |
// copy the plane from sp to dp | |
env->BitBlt(dp, dpitch, GetReadPtr(sf, i), GetPitch(sf, i), width[i], height[i]); | |
cleanf[i](dp + dpitch, dpitch, GetReadPtr(of, i) + opitch, opitch, hblocks[i], remainder[i], incpitch[i], height2[i]); | |
#else // MODIFYPLUGIN | |
const BYTE* sp = GetReadPtr(sf, i); | |
int spitch = GetPitch(sf, i); | |
// copy the first line | |
memcpy(dp, sp, width[i]); | |
dp += dpitch; | |
sp += spitch; | |
#ifdef SHARPEN | |
cleanf[i](dp, dpitch, sp, spitch, hblocks[i], remainder[i], incpitch[i], height2[i], strength[i]); | |
#else | |
cleanf[i](dp, dpitch, sp, spitch, hblocks[i], remainder[i], incpitch[i], height2[i]); | |
#endif | |
// copy the last line | |
memcpy(dp + height2[i] * dpitch, sp + height2[i] * spitch, width[i]); | |
#endif // MODIFYPLUGIN | |
} while( --i >= 0 ); | |
SSE_EMMS | |
return df; | |
} | |
public: | |
#ifdef MODIFYPLUGIN | |
RemoveGrain(PClip clip, PClip _oclip, int *mode, bool planar) : GenericVideoFilter(clip), PlanarAccess(vi), oclip(_oclip) | |
#elif defined(SHARPEN) | |
RemoveGrain(PClip clip, int *mode, int *_strength, bool planar) : GenericVideoFilter(clip), PlanarAccess(vi) | |
#else | |
RemoveGrain(PClip clip, int *mode, bool planar) : GenericVideoFilter(clip), PlanarAccess(vi) | |
#endif | |
{ | |
if( vi.IsYV12() + planar == 0 ) | |
#ifdef MODIFYPLUGIN | |
AVSenvironment->ThrowError("Repair: only planar color spaces are supported"); | |
CompareVideoInfo(vi, oclip->GetVideoInfo(), "Repair"); | |

oclip->SetCacheHints(CACHE_NOTHING, 0); | |
#else | |
AVSenvironment->ThrowError("RemoveGrain: only planar color spaces are supported"); | |
#endif | |
child->SetCacheHints(CACHE_NOTHING, 0); | |
if( mode[2] < 0 ) | |
{ | |
planes--; | |
if( mode[1] < 0 ) planes--; | |
} | |
if( mode[1] < 0 ) mode[1] = 0; | |
int i = planes; | |
do | |
{ | |
#ifdef SHARPEN | |
strength[i] = (_strength[i] > MAXSTRENGTH ? MAXSTRENGTH : _strength[i]) * SSE_INCREMENT; | |
#endif | |
if( mode[i] > MAXMODE ) AVSenvironment->ThrowError("RemoveGrain: invalid mode %u", mode[i]); | |
if( mode[i] < 0 ) cleanf[i] = do_nothing; | |
else cleanf[i] = cleaning_methods[mode[i]]; | |
height2[i] = height[i] - 2; | |
incpitch[i] = (SSE_INCREMENT + 2) - width[i]; | |
#ifdef MODIFYPLUGIN | |
unsigned w = width[i] - 3; | |
#else | |
unsigned w = width[i] - 3 - SSE_INCREMENT; | |
#endif | |
hblocks[i] = w / SSE_INCREMENT; | |
remainder[i] = (w & (SSE_INCREMENT - 1)) - (SSE_INCREMENT - 1); | |
//debug_printf("hblocks = %u, remainder = %i\n", hblocks[i], remainder[i]); | |
} while( --i >= 0 ); | |
if( (hblocks[planes] <= 0) || (height2[planes] <= 0) ) | |
AVSenvironment->ThrowError("RemoveGrain: the width or height of the clip is too small"); | |
} | |
//~RemoveGrain(){} | |
}; | |
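// Note on the block arithmetic in the constructor above, which all the SIMD
// loops in this file share: each row is processed in vectors of
// SSE_INCREMENT pixels plus one final, possibly overlapping, vector at the
// right edge. remainder is therefore computed as a value <= 0 that steps the
// pointers back so this final vector ends exactly on the last pixel of the
// row. A sketch of the idea (not compiled):
#if 0
// with w = processable width, as in the constructor above:
int hblocks   = w / SSE_INCREMENT;                 // full vectors per row
int remainder = (w & (SSE_INCREMENT - 1))          // leftover pixels ...
              - (SSE_INCREMENT - 1);               // ... as a back-step <= 0
// after hblocks iterations the pointers are advanced by remainder, so the
// last vector re-processes a few pixels instead of needing a scalar tail.
#endif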
AVSValue __cdecl CreateRemoveGrain(AVSValue args, void* user_data, IScriptEnvironment* env) | |
{ | |
#ifdef MODIFYPLUGIN | |
enum ARGS { CLIP, OCLIP, MY, MU, MV, PLANAR}; | |
#elif defined(SHARPEN) | |
enum ARGS { CLIP, MY, MU, MV, SY, SU, SV, PLANAR}; | |
#else | |
enum ARGS { CLIP, MY, MU, MV, PLANAR}; | |
#endif | |
int mode[3]; | |
mode[0] = args[MY].AsInt(DEFAULT_MODE); | |
mode[1] = args[MU].AsInt(mode[0]); | |
mode[2] = args[MV].AsInt(mode[1]); | |
#ifdef SHARPEN | |
int strength[3]; | |
strength[0] = args[SY].AsInt(DEFAULT_STRENGTH); | |
strength[1] = args[SU].AsInt(strength[0]); | |
strength[2] = args[SV].AsInt(strength[1]); | |
#endif | |
#ifdef MODIFYPLUGIN | |
return new RemoveGrain(args[CLIP].AsClip(), args[OCLIP].AsClip(), mode, args[PLANAR].AsBool(false)); | |
#elif defined(SHARPEN) | |
return new RemoveGrain(args[CLIP].AsClip(), mode, strength, args[PLANAR].AsBool(false)); | |
#else | |
return new RemoveGrain(args[CLIP].AsClip(), mode, args[PLANAR].AsBool(false)); | |
#endif | |
} | |
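// Typical Avisynth usage (illustrative):
//   RemoveGrain(mode=2)           # default mode for all planes
//   RemoveGrain(mode=17, modeU=2) # separate luma and chroma modes
// modeU and modeV default to the preceding mode value.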
#ifdef MODIFYPLUGIN | |
#if ISSE > 1 | |
#define RepairPixel(dest, src1, src2, previous, next, reg1, reg2, reg3, reg4) \ | |
__asm SSE3_MOVE reg1, next \ | |
__asm SSE3_MOVE reg3, previous \ | |
__asm SSE_RMOVE reg2, reg1 \ | |
__asm SSE3_MOVE reg4, src2 \ | |
__asm pminub reg1, reg3 \ | |
__asm pmaxub reg2, reg3 \ | |
__asm pminub reg1, reg4 \ | |
__asm SSE3_MOVE reg3, src1 \ | |
__asm pmaxub reg2, reg4 \ | |
__asm pmaxub reg1, reg3 \ | |
__asm pminub reg1, reg2 \ | |
__asm SSE_MOVE dest, reg1 | |
#else | |
#define RepairPixel(dest, src1, src2, previous, next, reg1, reg2, reg3, reg4) \ | |
__asm SSE3_MOVE reg1, next \ | |
__asm SSE3_MOVE reg3, previous \ | |
__asm SSE_RMOVE reg2, reg1 \ | |
__asm SSE3_MOVE reg4, src2 \ | |
__asm pminub reg1, reg3 \ | |
__asm pmaxub reg2, reg3 \ | |
__asm pminub reg1, reg4 \ | |
__asm pmaxub reg2, reg4 \ | |
__asm pmaxub reg1, src1 \ | |
__asm pminub reg1, reg2 \ | |
__asm SSE_MOVE dest, reg1 | |
#endif | |
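// Scalar equivalent of RepairPixel, derived from the SIMD sequence above
// (a readability sketch, not compiled; min/max are the Windows.h macros):
// the filtered pixel src1 is clamped to the range the original clip spans
// over the previous, current and next frame.
#if 0
static inline BYTE repair_pixel_c(BYTE src1, BYTE src2, BYTE prev, BYTE next)
{
	BYTE lower = min(min(prev, next), src2); // temporal minimum of the source
	BYTE upper = max(max(prev, next), src2); // temporal maximum of the source
	return min(max(src1, lower), upper);     // clamp the filtered pixel
}
#endif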
static void temporal_repair(BYTE *dp, int dpitch, const BYTE *sp1, int spitch1, const BYTE *sp2, int spitch2, const BYTE *pp, int ppitch, const BYTE *np, int npitch, int width, int height) | |
{ | |
int blocks = --width / SSE_INCREMENT; | |
int remainder = (width & (SSE_INCREMENT - 1)) - (SSE_INCREMENT - 1); | |
width -= SSE_INCREMENT - 1; | |
dpitch -= width; | |
spitch1 -= width; | |
spitch2 -= width; | |
ppitch -= width; | |
npitch -= width; | |
__asm mov ebx, pp | |
__asm mov edx, sp1 | |
__asm mov esi, sp2 | |
__asm mov edi, dp | |
__asm mov eax, np | |
__asm mov ecx, blocks | |
__asm align 16 | |
__asm _loop: | |
RepairPixel([edi], [edx], [esi], [ebx], [eax], SSE0, SSE1, SSE2, SSE3) | |
__asm add eax, SSE_INCREMENT | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT | |
__asm add edx, SSE_INCREMENT | |
__asm add ebx, SSE_INCREMENT | |
__asm loop _loop | |
// the last pixels | |
__asm add esi, remainder | |
__asm add edi, remainder | |
__asm mov ecx, blocks | |
__asm add ebx, remainder | |
__asm add eax, remainder | |
RepairPixel([edi], [edx], [esi], [ebx], [eax], SSE0, SSE1, SSE2, SSE3) | |
__asm add esi, spitch2 | |
__asm add edi, dpitch | |
__asm add edx, spitch1 | |
__asm add ebx, ppitch | |
__asm add eax, npitch | |
__asm dec height | |
__asm jnz _loop | |
} | |
#if ISSE > 1 | |
#define BRepairPixel(dest, src1, src2, previous, next, reg1, reg2, reg3, reg4, reg5, reg6) \ | |
__asm SSE3_MOVE reg1, next \ | |
__asm SSE3_MOVE reg3, previous \ | |
__asm SSE_RMOVE reg2, reg1 \ | |
__asm SSE3_MOVE reg4, src2 \ | |
__asm pmaxub reg2, reg3 \ | |
__asm SSE_RMOVE reg5, reg4 \ | |
__asm pminub reg1, reg3 \ | |
__asm SSE_RMOVE reg6, reg2 \ | |
__asm psubusb reg5, reg1 \ | |
__asm psubusb reg6, reg4 \ | |
__asm SSE_RMOVE reg3, reg2 \ | |
__asm paddusb reg5, reg5 \ | |
__asm paddusb reg6, reg6 \ | |
__asm paddusb reg5, reg1 \ | |
__asm psubusb reg3, reg6 \ | |
__asm pminub reg5, reg2 \ | |
__asm pmaxub reg3, reg1 \ | |
__asm SSE3_MOVE reg6, src1 \ | |
__asm pcmpeqb reg1, reg5 \ | |
__asm pcmpeqb reg2, reg3 \ | |
__asm pminub reg5, reg6 \ | |
__asm pmaxub reg1, reg2 \ | |
__asm pmaxub reg5, reg3 \ | |
__asm pminub reg4, reg1 \ | |
__asm psubusb reg5, reg1 \ | |
__asm pmaxub reg4, reg5 \ | |
__asm SSE_MOVE dest, reg4 | |
#else | |
#define BRepairPixel(dest, src1, src2, previous, next, reg1, reg2, reg3, reg4, reg5, reg6) \ | |
__asm SSE3_MOVE reg1, next \ | |
__asm SSE3_MOVE reg3, previous \ | |
__asm SSE_RMOVE reg2, reg1 \ | |
__asm SSE3_MOVE reg4, src2 \ | |
__asm pmaxub reg2, reg3 \ | |
__asm SSE_RMOVE reg5, reg4 \ | |
__asm pminub reg1, reg3 \ | |
__asm SSE_RMOVE reg6, reg2 \ | |
__asm psubusb reg5, reg1 \ | |
__asm psubusb reg6, reg4 \ | |
__asm SSE_RMOVE reg3, reg2 \ | |
__asm paddusb reg5, reg5 \ | |
__asm paddusb reg6, reg6 \ | |
__asm paddusb reg5, reg1 \ | |
__asm psubusb reg3, reg6 \ | |
__asm pminub reg5, reg2 \ | |
__asm pmaxub reg3, reg1 \ | |
__asm pcmpeqb reg1, reg5 \ | |
__asm pcmpeqb reg2, reg3 \ | |
__asm pminub reg5, src1 \ | |
__asm pmaxub reg1, reg2 \ | |
__asm pmaxub reg5, reg3 \ | |
__asm pminub reg4, reg1 \ | |
__asm psubusb reg5, reg1 \ | |
__asm pmaxub reg4, reg5 \ | |
__asm SSE_MOVE dest, reg4 | |
#endif // ISSE > 1 | |
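// BRepairPixel (mode 1) tightens the clamp before applying it: with
// mn = min(prev, next) and mx = max(prev, next) it computes, saturated,
// upper = min(mn + 2*(src2 - mn), mx) and lower = max(mx - 2*(mx - src2), mn),
// clamps the filtered pixel src1 into [lower, upper], and falls back to the
// source pixel src2 whenever a bound degenerates (upper == mn or lower == mx).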
static void btemporal_repair(BYTE *dp, int dpitch, const BYTE *sp1, int spitch1, const BYTE *sp2, int spitch2, const BYTE *pp, int ppitch, const BYTE *np, int npitch, int width, int height) | |
{ | |
int blocks = --width / SSE_INCREMENT; | |
int remainder = (width & (SSE_INCREMENT - 1)) - (SSE_INCREMENT - 1); | |
width -= SSE_INCREMENT - 1; | |
dpitch -= width; | |
spitch1 -= width; | |
spitch2 -= width; | |
ppitch -= width; | |
npitch -= width; | |
__asm mov ebx, pp | |
__asm mov edx, sp1 | |
__asm mov esi, sp2 | |
__asm mov edi, dp | |
__asm mov eax, np | |
__asm mov ecx, blocks | |
__asm align 16 | |
__asm _loop: | |
BRepairPixel([edi], [edx], [esi], [ebx], [eax], SSE0, SSE1, SSE2, SSE3, SSE4, SSE5) | |
__asm add eax, SSE_INCREMENT | |
__asm add esi, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT | |
__asm add edx, SSE_INCREMENT | |
__asm add ebx, SSE_INCREMENT | |
__asm loop _loop | |
// the last pixels | |
__asm add esi, remainder | |
__asm add edi, remainder | |
__asm mov ecx, blocks | |
__asm add ebx, remainder | |
__asm add eax, remainder | |
BRepairPixel([edi], [edx], [esi], [ebx], [eax], SSE0, SSE1, SSE2, SSE3, SSE4, SSE5) | |
__asm add esi, spitch2 | |
__asm add edi, dpitch | |
__asm add edx, spitch1 | |
__asm add ebx, ppitch | |
__asm add eax, npitch | |
__asm dec height | |
__asm jnz _loop | |
} | |
class TemporalRepair : public GenericVideoFilter, public PlanarAccess | |
{ | |
void (*trepair)(BYTE *dp, int dpitch, const BYTE *sp1, int spitch1, const BYTE *sp2, int spitch2, const BYTE *pp, int ppitch, const BYTE *np, int npitch, int width, int height); | |
unsigned last_frame; | |
PClip orig; | |
PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env) | |
{ | |
if( ((unsigned)(n - 1) >= last_frame) ) return child->GetFrame(n, env); | |
PVideoFrame pf = orig->GetFrame(n - 1, env); | |
PVideoFrame sf = orig->GetFrame(n, env); | |
PVideoFrame nf = orig->GetFrame(n + 1, env); | |
PVideoFrame cf = child->GetFrame(n, env); | |
PVideoFrame df = env->NewVideoFrame(vi); | |
int i = planes; | |
do | |
{ | |
trepair(GetWritePtr(df, i), GetPitch(df, i), GetReadPtr(cf, i), GetPitch(cf, i), GetReadPtr(sf, i), GetPitch(sf, i), GetReadPtr(pf, i), GetPitch(pf, i), GetReadPtr(nf, i), GetPitch(nf, i), width[i], height[i]); | |
} while( --i >= 0 ); | |
SSE_EMMS | |
return df; | |
} | |
public: | |
TemporalRepair(PClip clip, PClip oclip, int mode, bool grey, bool planar) : GenericVideoFilter(clip), PlanarAccess(vi, planar && grey), orig(oclip) | |
{ | |
CompareVideoInfo(vi, orig->GetVideoInfo(), "TemporalRepair"); | |
child->SetCacheHints(CACHE_RANGE, 0); | |
orig->SetCacheHints(CACHE_RANGE, 2); | |
trepair = mode ? btemporal_repair : temporal_repair; | |
last_frame = vi.num_frames - 2; | |
if( (int) last_frame < 0 ) last_frame = 0; | |
if( grey ) planes = 0; | |
} | |
//~TemporalRepair(){} | |
}; | |
#define get_lu(lower, upper, previous, current, next, reg1, reg2) \ | |
__asm SSE3_MOVE upper, next \ | |
__asm SSE3_MOVE reg1, previous \ | |
__asm SSE_RMOVE reg2, upper \ | |
__asm SSE3_MOVE lower, current \ | |
__asm pmaxub upper, reg1 \ | |
__asm pminub reg2, reg1 \ | |
__asm psubusb upper, lower \ | |
__asm psubusb lower, reg2 | |
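// get_lu computes, with saturated byte arithmetic, the two one-sided
// temporal deviations of a pixel: lower = current - min(prev, next) and
// upper = max(prev, next) - current. The routines below accumulate their
// maxima over a 3x3 neighbourhood with pmaxub.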
#if ISSE > 1 | |
#define SmoothTRepair(dest, lower, upper, previous, current, next, reg1, reg2) \ | |
__asm SSE3_MOVE reg1, current \ | |
__asm SSE3_MOVE reg2, previous \ | |
__asm paddusb upper, reg1 \ | |
__asm psubusb reg1, lower \ | |
__asm pmaxub upper, reg2 \ | |
__asm SSE3_MOVE lower, next \ | |
__asm pminub reg1, reg2 \ | |
__asm pmaxub upper, lower \ | |
__asm SSE3_MOVE reg2, dest \ | |
__asm pminub reg1, lower \ | |
__asm pminub upper, reg2 \ | |
__asm pmaxub upper, reg1 \ | |
__asm SSE_MOVE dest, upper | |
#else | |
#define SmoothTRepair(dest, lower, upper, previous, current, next, reg1, reg2) \ | |
__asm SSE3_MOVE reg1, current \ | |
__asm SSE3_MOVE reg2, previous \ | |
__asm paddusb upper, reg1 \ | |
__asm psubusb reg1, lower \ | |
__asm pmaxub upper, reg2 \ | |
__asm SSE3_MOVE lower, next \ | |
__asm pminub reg1, reg2 \ | |
__asm pmaxub upper, lower \ | |
__asm pminub reg1, lower \ | |
__asm pminub upper, dest \ | |
__asm pmaxub upper, reg1 \ | |
__asm SSE_MOVE dest, upper | |
#endif | |
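// Scalar sketch of SmoothTRepair (not compiled; min/max are the Windows.h
// macros): dest, the spatially filtered pixel, is clamped to the interval
// spanned by prev, next and current widened by the neighbourhood maxima of
// the get_lu deviations.
#if 0
static inline BYTE smooth_t_repair_c(BYTE dest, BYTE lower_max, BYTE upper_max,
                                     BYTE prev, BYTE cur, BYTE next)
{
	int lo = cur - lower_max; if( lo < 0 ) lo = 0;     // saturated subtract
	int hi = cur + upper_max; if( hi > 255 ) hi = 255; // saturated add
	lo = min(min(lo, (int)prev), (int)next);           // lower clamp bound
	hi = max(max(hi, (int)prev), (int)next);           // upper clamp bound
	return (BYTE) min(max((int)dest, lo), hi);         // clamp dest
}
#endif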
void smooth_temporal_repair1(BYTE *dp, const BYTE *previous, const BYTE *_sp, const BYTE *next, int pitch, int hblocks, int height, int remainder) | |
{ | |
__asm mov eax, hblocks | |
__asm mov ecx, eax | |
__asm mov edx, previous | |
__asm mov esi, _sp | |
__asm shl eax, SSE_SHIFT | |
__asm mov edi, dp | |
__asm add eax, remainder | |
__asm mov ebx, pitch | |
__asm sub pitch, eax | |
__asm lea edi, [edi + ebx + 1] | |
__asm mov eax, next | |
__asm align 16 | |
__asm middle_loop: | |
get_lu(SSE0, SSE1, [edx], [esi], [eax], SSE6, SSE7) | |
get_lu(SSE2, SSE3, [edx + 1], [esi + 1], [eax + 1], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + 2], [esi + 2], [eax + 2], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + 2*ebx], [esi + 2*ebx], [eax + 2*ebx], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + 2*ebx + 1], [esi + 2*ebx + 1], [eax + 2*ebx + 1], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + 2*ebx + 2], [esi + 2*ebx + 2], [eax + 2*ebx + 2], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + ebx], [esi + ebx], [eax + ebx], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + ebx + 2], [esi + ebx + 2], [eax + ebx + 2], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
SmoothTRepair([edi], SSE0, SSE1, [edx + ebx + 1], [esi + ebx + 1], [eax + ebx + 1], SSE6, SSE7) | |
__asm add esi, SSE_INCREMENT | |
__asm add edx, SSE_INCREMENT | |
__asm add eax, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, remainder | |
__asm add edx, remainder | |
__asm add eax, remainder | |
__asm add edi, remainder | |
get_lu(SSE0, SSE1, [edx], [esi], [eax], SSE6, SSE7) | |
get_lu(SSE2, SSE3, [edx + 1], [esi + 1], [eax + 1], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + 2], [esi + 2], [eax + 2], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + 2*ebx], [esi + 2*ebx], [eax + 2*ebx], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + 2*ebx + 1], [esi + 2*ebx + 1], [eax + 2*ebx + 1], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + 2*ebx + 2], [esi + 2*ebx + 2], [eax + 2*ebx + 2], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + ebx], [esi + ebx], [eax + ebx], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + ebx + 2], [esi + ebx + 2], [eax + ebx + 2], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
SmoothTRepair([edi], SSE0, SSE1, [edx + ebx + 1], [esi + ebx + 1], [eax + ebx + 1], SSE6, SSE7) | |
__asm add esi, pitch | |
__asm add edx, pitch | |
__asm add eax, pitch | |
__asm add edi, pitch | |
__asm dec height | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
} | |
#ifdef SMOOTH2 | |
#define get_lu_reg(lower, upper, previous, current, next, reg1, reg2) \ | |
__asm SSE3_MOVE upper, next \ | |
__asm SSE3_MOVE reg1, previous \ | |
__asm SSE_RMOVE reg2, upper \ | |
__asm SSE_RMOVE lower, current \ | |
__asm pmaxub upper, reg1 \ | |
__asm pminub reg2, reg1 \ | |
__asm psubusb upper, lower \ | |
__asm psubusb lower, reg2 | |
#if ISSE > 1 | |
#define SmoothTRepair2(dest, lower, upper, previous, current, next, reg1, reg2, reg3, reg4, reg5) \ | |
__asm SSE3_MOVE reg1, current \ | |
get_lu_reg(reg4, reg5, previous, reg1, next, reg2, reg3) \ | |
__asm pmaxub upper, reg5 \ | |
__asm pmaxub lower, reg4 \ | |
__asm SSE_RMOVE reg2, reg1 \ | |
__asm pmaxub upper, lower \ | |
__asm SSE3_MOVE reg3, dest \ | |
__asm paddusb reg1, upper \ | |
__asm psubusb reg2, upper \ | |
__asm pminub reg1, reg3 \ | |
__asm pmaxub reg1, reg2 \ | |
__asm SSE_MOVE dest, reg1 | |
#else | |
#define SmoothTRepair2(dest, lower, upper, previous, current, next, reg1, reg2, reg3, reg4, reg5) \ | |
__asm SSE3_MOVE reg1, current \ | |
get_lu_reg(reg4, reg5, previous, reg1, next, reg2, reg3) \ | |
__asm pmaxub upper, reg5 \ | |
__asm pmaxub lower, reg4 \ | |
__asm SSE_RMOVE reg2, reg1 \ | |
__asm pmaxub upper, lower \ | |
__asm paddusb reg1, upper \ | |
__asm psubusb reg2, upper \ | |
__asm pminub reg1, dest \ | |
__asm pmaxub reg1, reg2 \ | |
__asm SSE_MOVE dest, reg1 | |
#endif | |
void smooth_temporal_repair2(BYTE *dp, const BYTE *previous, const BYTE *_sp, const BYTE *next, int pitch, int hblocks, int height, int remainder) | |
{ | |
__asm mov eax, hblocks | |
__asm mov ecx, eax | |
__asm mov edx, previous | |
__asm mov esi, _sp | |
__asm shl eax, SSE_SHIFT | |
__asm mov edi, dp | |
__asm add eax, remainder | |
__asm mov ebx, pitch | |
__asm sub pitch, eax | |
__asm lea edi, [edi + ebx + 1] | |
__asm mov eax, next | |
__asm align 16 | |
__asm middle_loop: | |
get_lu(SSE0, SSE1, [edx], [esi], [eax], SSE6, SSE7) | |
get_lu(SSE2, SSE3, [edx + 1], [esi + 1], [eax + 1], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + 2], [esi + 2], [eax + 2], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + 2*ebx], [esi + 2*ebx], [eax + 2*ebx], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + 2*ebx + 1], [esi + 2*ebx + 1], [eax + 2*ebx + 1], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + 2*ebx + 2], [esi + 2*ebx + 2], [eax + 2*ebx + 2], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + ebx], [esi + ebx], [eax + ebx], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + ebx + 2], [esi + ebx + 2], [eax + ebx + 2], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
SmoothTRepair2([edi], SSE0, SSE1, [edx + ebx + 1], [esi + ebx + 1], [eax + ebx + 1], SSE4, SSE5, SSE6, SSE7, SSE3) | |
__asm add esi, SSE_INCREMENT | |
__asm add edx, SSE_INCREMENT | |
__asm add eax, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, remainder | |
__asm add edx, remainder | |
__asm add eax, remainder | |
__asm add edi, remainder | |
get_lu(SSE0, SSE1, [edx], [esi], [eax], SSE6, SSE7) | |
get_lu(SSE2, SSE3, [edx + 1], [esi + 1], [eax + 1], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + 2], [esi + 2], [eax + 2], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + 2*ebx], [esi + 2*ebx], [eax + 2*ebx], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + 2*ebx + 1], [esi + 2*ebx + 1], [eax + 2*ebx + 1], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + 2*ebx + 2], [esi + 2*ebx + 2], [eax + 2*ebx + 2], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + ebx], [esi + ebx], [eax + ebx], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
get_lu(SSE2, SSE3, [edx + ebx + 2], [esi + ebx + 2], [eax + ebx + 2], SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
__asm pmaxub SSE0, SSE2 | |
SmoothTRepair2([edi], SSE0, SSE1, [edx + ebx + 1], [esi + ebx + 1], [eax + ebx + 1], SSE4, SSE5, SSE6, SSE7, SSE3) | |
__asm add esi, pitch | |
__asm add edx, pitch | |
__asm add eax, pitch | |
__asm add edi, pitch | |
__asm dec height | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
} | |
#define get2diff(pdiff, ndiff, previous, current, next, reg1, reg2, reg3) \ | |
__asm SSE3_MOVE reg3, current \ | |
__asm SSE3_MOVE pdiff, previous \ | |
__asm SSE_RMOVE reg1, reg3 \ | |
__asm SSE3_MOVE ndiff, next \ | |
__asm SSE_RMOVE reg2, reg3 \ | |
__asm psubusb reg1, pdiff \ | |
__asm psubusb reg2, ndiff \ | |
__asm psubusb ndiff, reg3 \ | |
__asm psubusb pdiff, reg3 \ | |
__asm pmaxub pdiff, reg1 \ | |
__asm pmaxub ndiff, reg2 | |
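// get2diff yields the absolute temporal differences of a pixel,
// pdiff = |current - previous| and ndiff = |current - next|, each built
// from two saturated subtractions and a maximum.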
#if ISSE > 1 | |
#define SmoothTRepair3(dest, pmax, nmax, previous, current, next, reg1, reg2, reg3, reg4, reg5) \ | |
get2diff(reg4, reg5, previous, current, next, reg2, reg3, reg1) \ | |
__asm pmaxub pmax, reg4 \ | |
__asm pmaxub nmax, reg5 \ | |
__asm SSE_RMOVE reg2, reg1 \ | |
__asm pminub pmax, nmax \ | |
__asm SSE3_MOVE reg3, dest \ | |
__asm paddusb reg1, pmax \ | |
__asm psubusb reg2, pmax \ | |
__asm pminub reg1, reg3 \ | |
__asm pmaxub reg1, reg2 \ | |
__asm SSE_MOVE dest, reg1 | |
#else | |
#define SmoothTRepair3(dest, pmax, nmax, previous, current, next, reg1, reg2, reg3, reg4, reg5) \ | |
get2diff(reg4, reg5, previous, current, next, reg2, reg3, reg1) \ | |
__asm pmaxub pmax, reg4 \ | |
__asm pmaxub nmax, reg5 \ | |
__asm SSE_RMOVE reg2, reg1 \ | |
__asm pminub pmax, nmax \ | |
__asm paddusb reg1, pmax \ | |
__asm psubusb reg2, pmax \ | |
__asm pminub reg1, dest \ | |
__asm pmaxub reg1, reg2 \ | |
__asm SSE_MOVE dest, reg1 | |
#endif | |
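// SmoothTRepair3 takes the neighbourhood maxima of these differences, uses
// t = min(max |cur - prev|, max |cur - next|) as a symmetric tolerance and
// clamps dest into [current - t, current + t] (saturated).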
void smooth_temporal_repair3(BYTE *dp, const BYTE *previous, const BYTE *_sp, const BYTE *next, int pitch, int hblocks, int height, int remainder) | |
{ | |
__asm mov eax, hblocks | |
__asm mov ecx, eax | |
__asm mov edx, previous | |
__asm mov esi, _sp | |
__asm shl eax, SSE_SHIFT | |
__asm mov edi, dp | |
__asm add eax, remainder | |
__asm mov ebx, pitch | |
__asm sub pitch, eax | |
__asm lea edi, [edi + ebx + 1] | |
__asm mov eax, next | |
__asm align 16 | |
__asm middle_loop: | |
get2diff(SSE0, SSE1, [edx], [esi], [eax], SSE5, SSE6, SSE7) | |
get2diff(SSE2, SSE3, [edx + 1], [esi + 1], [eax + 1], SSE5, SSE6, SSE7) | |
__asm pmaxub SSE0, SSE2 | |
__asm pmaxub SSE1, SSE3 | |
get2diff(SSE2, SSE3, [edx + 2], [esi + 2], [eax + 2], SSE5, SSE6, SSE7) | |
__asm pmaxub SSE0, SSE2 | |
__asm pmaxub SSE1, SSE3 | |
get2diff(SSE2, SSE3, [edx + 2*ebx], [esi + 2*ebx], [eax + 2*ebx], SSE5, SSE6, SSE7) | |
__asm pmaxub SSE0, SSE2 | |
__asm pmaxub SSE1, SSE3 | |
get2diff(SSE2, SSE3, [edx + 2*ebx + 1], [esi + 2*ebx + 1], [eax + 2*ebx + 1], SSE5, SSE6, SSE7) | |
__asm pmaxub SSE0, SSE2 | |
__asm pmaxub SSE1, SSE3 | |
get2diff(SSE2, SSE3, [edx + 2*ebx + 2], [esi + 2*ebx + 2], [eax + 2*ebx + 2], SSE5, SSE6, SSE7) | |
__asm pmaxub SSE1, SSE3 | |
get2diff(SSE2, SSE3, [edx + ebx], [esi + ebx], [eax + ebx], SSE5, SSE6, SSE7) | |
__asm pmaxub SSE0, SSE2 | |
__asm pmaxub SSE1, SSE3 | |
get2diff(SSE2, SSE3, [edx + ebx + 2], [esi + ebx + 2], [eax + ebx + 2], SSE5, SSE6, SSE7) | |
__asm pmaxub SSE0, SSE2 | |
__asm pmaxub SSE1, SSE3 | |
SmoothTRepair3([edi], SSE0, SSE1, [edx + ebx + 1], [esi + ebx + 1], [eax + ebx + 1], SSE4, SSE5, SSE6, SSE7, SSE3) | |
__asm add esi, SSE_INCREMENT | |
__asm add edx, SSE_INCREMENT | |
__asm add eax, SSE_INCREMENT | |
__asm add edi, SSE_INCREMENT | |
__asm dec ecx | |
__asm jnz middle_loop | |
// the last pixels | |
__asm add esi, remainder | |
__asm add edx, remainder | |
__asm add eax, remainder | |
__asm add edi, remainder | |
get2diff(SSE0, SSE1, [edx], [esi], [eax], SSE5, SSE6, SSE7) | |
get2diff(SSE2, SSE3, [edx + 1], [esi + 1], [eax + 1], SSE5, SSE6, SSE7) | |
__asm pmaxub SSE0, SSE2 | |
__asm pmaxub SSE1, SSE3 | |
get2diff(SSE2, SSE3, [edx + 2], [esi + 2], [eax + 2], SSE5, SSE6, SSE7) | |
__asm pmaxub SSE0, SSE2 | |
__asm pmaxub SSE1, SSE3 | |
get2diff(SSE2, SSE3, [edx + 2*ebx], [esi + 2*ebx], [eax + 2*ebx], SSE5, SSE6, SSE7) | |
__asm pmaxub SSE0, SSE2 | |
__asm pmaxub SSE1, SSE3 | |
get2diff(SSE2, SSE3, [edx + 2*ebx + 1], [esi + 2*ebx + 1], [eax + 2*ebx + 1], SSE5, SSE6, SSE7) | |
__asm pmaxub SSE0, SSE2 | |
__asm pmaxub SSE1, SSE3 | |
get2diff(SSE2, SSE3, [edx + 2*ebx + 2], [esi + 2*ebx + 2], [eax + 2*ebx + 2], SSE5, SSE6, SSE7) | |
__asm pmaxub SSE0, SSE2 | |
__asm pmaxub SSE1, SSE3 | |
get2diff(SSE2, SSE3, [edx + ebx], [esi + ebx], [eax + ebx], SSE5, SSE6, SSE7) | |
__asm pmaxub SSE0, SSE2 | |
__asm pmaxub SSE1, SSE3 | |
get2diff(SSE2, SSE3, [edx + ebx + 2], [esi + ebx + 2], [eax + ebx + 2], SSE5, SSE6, SSE7) | |
__asm pmaxub SSE0, SSE2 | |
__asm pmaxub SSE1, SSE3 | |
SmoothTRepair3([edi], SSE0, SSE1, [edx + ebx + 1], [esi + ebx + 1], [eax + ebx + 1], SSE4, SSE5, SSE6, SSE7, SSE3) | |
__asm add esi, pitch | |
__asm add edx, pitch | |
__asm add eax, pitch | |
__asm add edi, pitch | |
__asm dec height | |
__asm mov ecx, hblocks | |
__asm jnz middle_loop | |
} | |
#endif // SMOOTH2 | |
class SmoothTemporalRepair : public GenericVideoFilter, public PlanarAccess | |
{ | |
HomogeneousChild oclip; | |
#ifdef SMOOTH2 | |
void (*st_repair)(BYTE *dp, const BYTE *previous, const BYTE *_sp, const BYTE *next,int pitch, int hblocks, int height, int remainder); | |
#else | |
#define st_repair smooth_temporal_repair1 | |
#endif | |
int height2[3], hblocks[3], remainder[3]; | |
unsigned last_frame; | |
PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env) | |
{ | |
if( ((unsigned)(n - 1) >= last_frame) ) return child->GetFrame(n, env); | |
PVideoFrame sf = child->GetFrame(n, env); | |
PVideoFrame pf = oclip->GetFrame(n - 1, env); | |
PVideoFrame of = oclip->GetFrame(n, env); | |
PVideoFrame nf = oclip->GetFrame(n + 1, env); | |
PVideoFrame df = env->NewVideoFrame(vi); | |
int i = planes; | |
do | |
{ | |
BYTE* dp = GetWritePtr(df,i); | |
int pitch = GetPitch(df, i); | |
// copy the plane from sp to dp | |
env->BitBlt(dp, pitch, GetReadPtr(sf, i), pitch, width[i], height[i]); | |
st_repair(dp, GetReadPtr(pf, i), GetReadPtr(of, i), GetReadPtr(nf, i), pitch, hblocks[i], height2[i], remainder[i]); | |
} while( --i >= 0 ); | |
SSE_EMMS | |
return df; | |
} | |
public: | |
SmoothTemporalRepair(PClip clip, PClip _oclip, int mode, bool grey, bool planar, IScriptEnvironment* env) : GenericVideoFilter(clip), PlanarAccess(vi), oclip(_oclip, grey, env) | |
{ | |
if( vi.IsYV12() + planar == 0 ) AVSenvironment->ThrowError("TemporalRepair: only planar color spaces are supported"); | |
CompareVideoInfo(vi, _oclip->GetVideoInfo(), "TemporalRepair"); | |
_oclip->SetCacheHints(CACHE_RANGE, 2); | |
child->SetCacheHints(CACHE_NOTHING, 0); | |
#ifdef SMOOTH2 | |
switch( mode ) | |
{ | |
case 1 : | |
st_repair = smooth_temporal_repair1; | |
break; | |
case 2 : | |
st_repair = smooth_temporal_repair2; | |
break; | |
default : | |
st_repair = smooth_temporal_repair3; | |
} | |
#endif // SMOOTH2 | |
if( grey ) planes = 0; | |
last_frame = vi.num_frames - 2; | |
if( (int) last_frame < 0 ) last_frame = 0; | |
int i = planes; | |
do | |
{ | |
height2[i] = height[i] - 2; | |
// unsigned w = width[i] - 1 - 2*smooth; | |
unsigned w = width[i] - 3; | |
hblocks[i] = w / SSE_INCREMENT; | |
remainder[i] = (w & (SSE_INCREMENT - 1)) - (SSE_INCREMENT - 1); | |
} while( --i >= 0 ); | |
if( (hblocks[planes] <= 0) || (height2[planes] <= 0) ) | |
AVSenvironment->ThrowError("TemporalRepair: the width or height of the clip is too small"); | |
} | |
//~SmoothTemporalRepair(){} | |
}; | |
#define MAXTMODE 4 | |
bool spatial[MAXTMODE + 1] = {false, true, true, true, false }; | |
AVSValue __cdecl CreateTemporalRepair(AVSValue args, void* user_data, IScriptEnvironment* env) | |
{ | |
enum ARGS { CLIP, OCLIP, MODE, SMOOTH, GREY, PLANAR }; | |
PClip clip = args[CLIP].AsClip(); | |
PClip oclip = args[OCLIP].AsClip(); | |
bool grey = args[GREY].AsBool(false); | |
int mode = args[MODE].AsInt(args[SMOOTH].AsInt(0)); | |
if( (unsigned) mode > MAXTMODE ) env->ThrowError("TemporalRepair: illegal mode %i", mode); | |
bool planar = args[PLANAR].AsBool(false); | |
return spatial[mode] ? (AVSValue) new SmoothTemporalRepair(clip, oclip, mode, grey, planar, env) | |
: (AVSValue) new TemporalRepair(clip, oclip, mode, grey, planar); | |
}; | |
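// Typical Avisynth usage (illustrative); the first clip is the filtered
// clip, the second the original source:
//   TemporalRepair(filtered, source)          # mode 0, strict clamping
//   TemporalRepair(filtered, source, mode=2)  # spatially aware variant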
#else // MODIFYPLUGIN | |
class GenericClense : public GenericVideoFilter, public PlanarAccess | |
{ | |
protected: | |
int hblocks[3]; | |
int remainder[3]; | |
int incpitch[3]; | |
public: | |
GenericClense(PClip clip, bool grey, bool planar); | |
}; | |
GenericClense::GenericClense(PClip clip, bool grey, bool planar) : GenericVideoFilter(clip), PlanarAccess(vi, planar && grey) | |
{ | |
if( grey ) planes = 0; | |
int i = planes; | |
do | |
{ | |
int w = width[i]; | |
hblocks[i] = --w / (2*SSE_INCREMENT); | |
remainder[i] = (w & (2*SSE_INCREMENT - 1)) - (2*SSE_INCREMENT - 1); | |
incpitch[i] = 2*SSE_INCREMENT - width[i] + remainder[i]; | |
} while( --i >= 0 ); | |
} | |
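// The Clense kernels below consume 2*SSE_INCREMENT pixels (two vectors)
// per iteration, hence the doubled constants in the block and remainder
// arithmetic above.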
#if SHARPEN == 1 | |
// only sharpen | |
#define simplesharpen(center, min, max, reg1, reg2)\ | |
__asm SSE_RMOVE reg2, center \ | |
__asm psubusb max, center \ | |
__asm psubusb reg2, min \ | |
__asm SSE_RMOVE reg1, max \ | |
__asm SSE_RMOVE min, reg2 \ | |
__asm psubusb max, reg2 \ | |
__asm psubusb min, reg1 \ | |
__asm pminub reg2, max \ | |
__asm pminub reg1, min \ | |
__asm psubusb center, reg2 \ | |
__asm paddusb center, reg1 | |
#elif SHARPEN == 2 | |
#define simplesharpen(center, min, max, reg1, reg2)\ | |
__asm pminub center, max \ | |
__asm pmaxub center, min \ | |
__asm SSE_RMOVE reg2, center \ | |
__asm psubusb max, center \ | |
__asm psubusb reg2, min \ | |
__asm SSE_RMOVE reg1, max \ | |
__asm SSE_RMOVE min, reg2 \ | |
__asm psubusb max, reg2 \ | |
__asm psubusb min, reg1 \ | |
__asm pminub reg2, max \ | |
__asm pminub reg1, min \ | |
__asm psubusb center, reg2 \ | |
__asm paddusb center, reg1 | |
#endif // SHARPEN == 2 | |
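// simplesharpen moves center towards the nearer of the two temporal bounds:
// with hu = upper - center and hl = center - lower (saturated), the pixel is
// shifted towards the nearer bound by |hu - hl|, capped at the distance to
// that bound, so the result never leaves [lower, upper]. The SHARPEN == 2
// variant first clamps center into [lower, upper].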
#if ISSE > 1 | |
static inline void aligned_clense(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, const BYTE *pp, int ppitch, const BYTE *np, int npitch, int hblocks, int remainder, int incpitch, int height) | |
#else | |
static void clense(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, const BYTE *pp, int ppitch, const BYTE *np, int npitch, int hblocks, int remainder, int incpitch, int height) | |
#endif | |
#ifdef SHARPEN | |
#define AClensePixel(daddr, saddr, paddr, naddr, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8) \ | |
__asm SSE_RMOVE reg1, [naddr] \ | |
__asm SSE_RMOVE reg2, [naddr + SSE_INCREMENT] \ | |
__asm SSE_RMOVE reg3, reg1 \ | |
__asm SSE_RMOVE reg5, [paddr] \ | |
__asm SSE_RMOVE reg4, reg2 \ | |
__asm SSE_RMOVE reg6, [paddr + SSE_INCREMENT] \ | |
__asm pminub reg1, reg5 \ | |
__asm pminub reg2, reg6 \ | |
__asm SSE_RMOVE reg7, [saddr] \ | |
__asm pmaxub reg3, reg5 \ | |
__asm pmaxub reg4, reg6 \ | |
__asm SSE_RMOVE reg8, [saddr + SSE_INCREMENT] \ | |
simplesharpen(reg7, reg1, reg3, reg5, reg6) \ | |
simplesharpen(reg8, reg2, reg4, reg5, reg6) \ | |
__asm SSE_RMOVE [daddr], reg7 \ | |
__asm SSE_RMOVE [daddr + SSE_INCREMENT], reg8 | |
#define UClensePixel(daddr, saddr, paddr, naddr, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8) \ | |
__asm SSE3_MOVE reg1, [naddr] \ | |
__asm SSE3_MOVE reg2, [naddr + SSE_INCREMENT] \ | |
__asm SSE_RMOVE reg3, reg1 \ | |
__asm SSE3_MOVE reg5, [paddr] \ | |
__asm SSE_RMOVE reg4, reg2 \ | |
__asm SSE3_MOVE reg6, [paddr + SSE_INCREMENT] \ | |
__asm pminub reg1, reg5 \ | |
__asm pminub reg2, reg6 \ | |
__asm SSE3_MOVE reg7, [saddr] \ | |
__asm pmaxub reg3, reg5 \ | |
__asm pmaxub reg4, reg6 \ | |
__asm SSE3_MOVE reg8, [saddr + SSE_INCREMENT] \ | |
simplesharpen(reg7, reg1, reg3, reg5, reg6) \ | |
simplesharpen(reg8, reg2, reg4, reg5, reg6) \ | |
__asm SSE_MOVE [daddr], reg7 \ | |
__asm SSE_MOVE [daddr + SSE_INCREMENT], reg8 | |
#else // SHARPEN | |
#define AClensePixel(daddr, saddr, paddr, naddr, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8) \ | |
__asm SSE_RMOVE reg1, [naddr] \ | |
__asm SSE_RMOVE reg2, [naddr + SSE_INCREMENT] \ | |
__asm SSE_RMOVE reg3, reg1 \ | |
__asm SSE_RMOVE reg5, [paddr] \ | |
__asm SSE_RMOVE reg4, reg2 \ | |
__asm SSE_RMOVE reg6, [paddr + SSE_INCREMENT] \ | |
__asm pminub reg1, reg5 \ | |
__asm pminub reg2, reg6 \ | |
__asm pmaxub reg1, [saddr] \ | |
__asm pmaxub reg3, reg5 \ | |
__asm pmaxub reg4, reg6 \ | |
__asm pmaxub reg2, [saddr + SSE_INCREMENT] \ | |
__asm pminub reg1, reg3 \ | |
__asm pminub reg2, reg4 \ | |
__asm SSE_RMOVE [daddr], reg1 \ | |
__asm SSE_RMOVE [daddr + SSE_INCREMENT], reg2 | |
#if ISSE > 1 | |
#define UClensePixel(daddr, saddr, paddr, naddr, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8) \ | |
__asm SSE3_MOVE reg1, [naddr] \ | |
__asm SSE3_MOVE reg2, [naddr + SSE_INCREMENT] \ | |
__asm SSE_RMOVE reg3, reg1 \ | |
__asm SSE3_MOVE reg5, [paddr] \ | |
__asm SSE_RMOVE reg4, reg2 \ | |
__asm SSE3_MOVE reg6, [paddr + SSE_INCREMENT] \ | |
__asm pminub reg1, reg5 \ | |
__asm pminub reg2, reg6 \ | |
__asm SSE3_MOVE reg7, [saddr] \ | |
__asm pmaxub reg3, reg5 \ | |
__asm pmaxub reg4, reg6 \ | |
__asm SSE3_MOVE reg8, [saddr + SSE_INCREMENT] \ | |
__asm pmaxub reg1, reg7 \ | |
__asm pmaxub reg2, reg8 \ | |
__asm pminub reg1, reg3 \ | |
__asm pminub reg2, reg4 \ | |
__asm SSE_MOVE [daddr], reg1 \ | |
__asm SSE_MOVE [daddr + SSE_INCREMENT], reg2 | |
#else | |
#define UClensePixel(daddr, saddr, paddr, naddr, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8) \ | |
AClensePixel(daddr, saddr, paddr, naddr, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8) | |
#endif | |
#endif // SHARPEN | |
{ | |
__asm mov eax, incpitch | |
__asm mov ebx, pp | |
__asm add dpitch, eax | |
__asm add spitch, eax | |
__asm add ppitch, eax | |
__asm add npitch, eax | |
__asm mov esi, _sp | |
__asm mov edi, dp | |
__asm mov edx, remainder | |
__asm mov eax, np | |
__asm mov ecx, hblocks | |
__asm align 16 | |
__asm _loop: | |
AClensePixel(edi, esi, ebx, eax, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6, SSE7) | |
__asm add eax, 2*SSE_INCREMENT | |
__asm add esi, 2*SSE_INCREMENT | |
__asm add edi, 2*SSE_INCREMENT | |
__asm add ebx, 2*SSE_INCREMENT | |
#if defined(SHARPEN) && (ISSE > 1) | |
__asm dec ecx | |
__asm jnz _loop | |
#else | |
__asm loop _loop | |
#endif | |
// the last pixels | |
UClensePixel(edi + edx, esi + edx, ebx + edx, eax + edx, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6, SSE7) | |
__asm add esi, spitch | |
__asm add edi, dpitch | |
__asm add ebx, ppitch | |
__asm add eax, npitch | |
__asm dec height | |
__asm mov ecx, hblocks | |
__asm jnz _loop | |
} | |
#if ISSE > 1 | |
static inline void unaligned_clense(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, const BYTE *pp, int ppitch, const BYTE *np, int npitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
__asm mov eax, incpitch | |
__asm mov ebx, pp | |
__asm add dpitch, eax | |
__asm add spitch, eax | |
__asm add ppitch, eax | |
__asm add npitch, eax | |
__asm mov esi, _sp | |
__asm mov edi, dp | |
__asm mov edx, remainder | |
__asm mov eax, np | |
__asm mov ecx, hblocks | |
__asm align 16 | |
__asm _loop: | |
UClensePixel(edi, esi, ebx, eax, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6, SSE7) | |
__asm add eax, 2*SSE_INCREMENT | |
__asm add esi, 2*SSE_INCREMENT | |
__asm add edi, 2*SSE_INCREMENT | |
__asm add ebx, 2*SSE_INCREMENT | |
#if defined(SHARPEN) | |
__asm dec ecx | |
__asm jnz _loop | |
#else | |
__asm loop _loop | |
#endif | |
// the last pixels | |
UClensePixel(edi + edx, esi + edx, ebx + edx, eax + edx, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6, SSE7) | |
__asm add esi, spitch | |
__asm add edi, dpitch | |
__asm add ebx, ppitch | |
__asm add eax, npitch | |
__asm dec height | |
__asm mov ecx, hblocks | |
__asm jnz _loop | |
} | |
static void clense(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, const BYTE *pp, int ppitch, const BYTE *np, int npitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
if( (((unsigned)dp & (SSE_INCREMENT - 1)) + ((unsigned)_sp & (SSE_INCREMENT - 1)) + ((unsigned)pp & (SSE_INCREMENT - 1)) + ((unsigned)np & (SSE_INCREMENT - 1)) | |
#ifdef ALIGNPITCH | |
+ (spitch & (SSE_INCREMENT - 1)) + (ppitch & (SSE_INCREMENT - 1)) + (npitch & (SSE_INCREMENT - 1)) | |
#endif | |
) == 0 ) aligned_clense(dp, dpitch, _sp, spitch, pp, ppitch, np, npitch, hblocks, remainder, incpitch, height); | |
else unaligned_clense(dp, dpitch, _sp, spitch, pp, ppitch, np, npitch, hblocks, remainder, incpitch, height); | |
} | |
#endif // ISSE > 1 | |
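// Scalar sketch of the Clense kernel (not compiled; min/max are the
// Windows.h macros): each pixel of the current frame is clamped to the
// range spanned by the previous and next frame, which is exactly the per
// pixel median of the three (the non-SHARPEN path of A/UClensePixel).
#if 0
static inline BYTE clense_pixel_c(BYTE cur, BYTE prev, BYTE next)
{
	return min(max(cur, min(prev, next)), max(prev, next));
}
#endif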
class Clense : public GenericClense | |
{ | |
PVideoFrame lframe; | |
unsigned lnr; | |
bool reduceflicker; | |
PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env) | |
{ | |
if( !reduceflicker || (lnr != n-1) ) | |
{ | |
if( n == 0 ) return child->GetFrame(n, env); | |
lframe = child->GetFrame(n - 1, env); | |
} | |
PVideoFrame sf = child->GetFrame(n, env); | |
if( n >= vi.num_frames ) return sf; | |
PVideoFrame nf = child->GetFrame(n + 1, env); | |
PVideoFrame df = env->NewVideoFrame(vi, 2*SSE_INCREMENT); | |
int i = planes; | |
do | |
{ | |
clense(GetWritePtr(df, i), GetPitch(df, i), GetReadPtr(sf, i), GetPitch(sf, i), GetReadPtr(lframe, i), GetPitch(lframe, i), GetReadPtr(nf, i), GetPitch(nf, i), hblocks[i], remainder[i], incpitch[i], height[i]); | |
} while( --i >= 0 ); | |
SSE_EMMS | |
lframe = df; | |
lnr = n; | |
return df; | |
} | |
public: | |
Clense(PClip clip, bool grey, bool _reduceflicker, bool planar, int cache) | |
: GenericClense(clip, grey, planar), reduceflicker(_reduceflicker), lframe(0), lnr(-2) | |
{ | |
if( cache >= 0 ) child->SetCacheHints(CACHE_RANGE, cache); | |
} | |
//~Clense(){} | |
}; | |
AVSValue __cdecl CreateClense(AVSValue args, void* user_data, IScriptEnvironment* env) | |
{ | |
enum ARGS { CLIP, GREY, FLICKER, PLANAR, CACHE }; | |
return new Clense(args[CLIP].AsClip(), args[GREY].AsBool(false), args[FLICKER].AsBool(true), args[PLANAR].AsBool(false), args[CACHE].AsInt(2)); | |
}; | |
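// Typical Avisynth usage (illustrative):
//   Clense()   # per pixel median of frames n-1, n and n+1
// With reduceflicker=true (the default) the previously cleaned frame is
// reused as the n-1 reference.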
class BMCClense : public GenericClense | |
{ | |
PClip pclip, nclip; | |
PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env) | |
{ | |
PVideoFrame pf = pclip->GetFrame(n, env); | |
PVideoFrame sf = child->GetFrame(n, env); | |
PVideoFrame nf = nclip->GetFrame(n, env); | |
PVideoFrame df = env->NewVideoFrame(vi, 2*SSE_INCREMENT); | |
int i = planes; | |
do | |
{ | |
clense(GetWritePtr(df, i), GetPitch(df, i), GetReadPtr(sf, i), GetPitch(sf, i), GetReadPtr(pf, i), GetPitch(pf, i), GetReadPtr(nf, i), GetPitch(nf, i), hblocks[i], remainder[i], incpitch[i], height[i]); | |
} while( --i >= 0 ); | |
SSE_EMMS | |
return df; | |
} | |
public: | |
BMCClense(PClip clip, PClip _pclip, PClip _nclip, bool grey, bool planar) : GenericClense(clip, grey, planar), pclip(_pclip), nclip(_nclip) | |
{ | |
child->SetCacheHints(CACHE_RANGE, 0); | |
pclip->SetCacheHints(CACHE_RANGE, 0); | |
nclip->SetCacheHints(CACHE_RANGE, 0); | |
CompareVideoInfo(vi, pclip->GetVideoInfo(), "MCClense"); | |
CompareVideoInfo(vi, nclip->GetVideoInfo(), "MCClense"); | |
} | |
//~BMCClense(){} | |
}; | |
AVSValue __cdecl CreateMCClense(AVSValue args, void* user_data, IScriptEnvironment* env) | |
{ | |
return new BMCClense(args[0].AsClip(), args[1].AsClip(), args[2].AsClip(), args[3].AsBool(false), args[4].AsBool(false)); | |
}; | |
#if ISSE > 1 | |
static void aligned_sclense(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, const BYTE *pp, int ppitch, const BYTE *np, int npitch, int hblocks, int remainder, int incpitch, int height) | |
#else | |
static void sclense(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, const BYTE *pp, int ppitch, const BYTE *np, int npitch, int hblocks, int remainder, int incpitch, int height) | |
#endif | |
#define ASClensePixel(daddr, saddr, paddr, naddr, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8) \ | |
__asm SSE_RMOVE reg1, [paddr] \ | |
__asm SSE_RMOVE reg2, [paddr + SSE_INCREMENT] \ | |
__asm SSE_RMOVE reg3, reg1 \ | |
__asm SSE_RMOVE reg5, [naddr] \ | |
__asm SSE_RMOVE reg4, reg2 \ | |
__asm SSE_RMOVE reg6, [naddr + SSE_INCREMENT] \ | |
__asm pminub reg1, reg5 \ | |
__asm pminub reg2, reg6 \ | |
__asm pmaxub reg3, reg5 \ | |
__asm pmaxub reg4, reg6 \ | |
__asm SSE_RMOVE reg7, reg3 \ | |
__asm SSE_RMOVE reg8, reg4 \ | |
__asm psubusb reg7, reg5 \ | |
__asm psubusb reg8, reg6 \ | |
__asm psubusb reg5, reg1 \ | |
__asm psubusb reg6, reg2 \ | |
__asm psubusb reg1, reg5 \ | |
__asm psubusb reg2, reg6 \ | |
__asm pmaxub reg1, [saddr] \ | |
__asm paddusb reg3, reg7 \ | |
__asm paddusb reg4, reg8 \ | |
__asm pmaxub reg2, [saddr + SSE_INCREMENT] \ | |
__asm pminub reg1, reg3 \ | |
__asm pminub reg2, reg4 \ | |
__asm SSE_RMOVE [daddr], reg1 \ | |
__asm SSE_RMOVE [daddr + SSE_INCREMENT], reg2 | |
#if ISSE > 1 | |
#define USClensePixel(daddr, saddr, paddr, naddr, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8) \ | |
__asm SSE3_MOVE reg1, [paddr] \ | |
__asm SSE3_MOVE reg2, [paddr + SSE_INCREMENT] \ | |
__asm SSE_RMOVE reg3, reg1 \ | |
__asm SSE3_MOVE reg5, [naddr] \ | |
__asm SSE_RMOVE reg4, reg2 \ | |
__asm SSE3_MOVE reg6, [naddr + SSE_INCREMENT] \ | |
__asm pminub reg1, reg5 \ | |
__asm pminub reg2, reg6 \ | |
__asm pmaxub reg3, reg5 \ | |
__asm pmaxub reg4, reg6 \ | |
__asm SSE_RMOVE reg7, reg3 \ | |
__asm SSE_RMOVE reg8, reg4 \ | |
__asm psubusb reg7, reg5 \ | |
__asm psubusb reg8, reg6 \ | |
__asm psubusb reg5, reg1 \ | |
__asm psubusb reg6, reg2 \ | |
__asm psubusb reg1, reg5 \ | |
__asm psubusb reg2, reg6 \ | |
__asm SSE3_MOVE reg5, [saddr] \ | |
__asm paddusb reg3, reg7 \ | |
__asm paddusb reg4, reg8 \ | |
__asm SSE3_MOVE reg6, [saddr + SSE_INCREMENT] \ | |
__asm pmaxub reg1, reg5 \ | |
__asm pmaxub reg2, reg6 \ | |
__asm pminub reg1, reg3 \ | |
__asm pminub reg2, reg4 \ | |
__asm SSE_MOVE [daddr], reg1 \ | |
__asm SSE_MOVE [daddr + SSE_INCREMENT], reg2 | |
#else | |
#define USClensePixel(daddr, saddr, paddr, naddr, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8) \ | |
ASClensePixel(daddr, saddr, paddr, naddr, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8) | |
#endif | |
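// The Backward/ForwardClense kernel extrapolates its bounds from the two
// reference frames n1 (nearer) and n2 (farther): with mn = min(n1, n2) and
// mx = max(n1, n2) the current pixel is clamped, saturated, into
// [mn - (n2 - mn), mx + (mx - n2)]. A scalar sketch (not compiled):
#if 0
static inline BYTE sclense_pixel_c(BYTE cur, BYTE n1, BYTE n2)
{
	int mn = min(n1, n2), mx = max(n1, n2);
	int lo = mn - (n2 - mn); if( lo < 0 )   lo = 0;    // 2*mn - n2, saturated
	int hi = mx + (mx - n2); if( hi > 255 ) hi = 255;  // 2*mx - n2, saturated
	return (BYTE) min(max((int)cur, lo), hi);
}
#endif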
{ | |
__asm mov eax, incpitch | |
__asm mov ebx, pp | |
__asm add dpitch, eax | |
__asm add spitch, eax | |
__asm add ppitch, eax | |
__asm add npitch, eax | |
__asm mov esi, _sp | |
__asm mov edi, dp | |
__asm mov edx, remainder | |
__asm mov eax, np | |
__asm mov ecx, hblocks | |
__asm align 16 | |
__asm _loop: | |
ASClensePixel(edi, esi, ebx, eax, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6, SSE7) | |
__asm add eax, 2*SSE_INCREMENT | |
__asm add esi, 2*SSE_INCREMENT | |
__asm add edi, 2*SSE_INCREMENT | |
__asm add ebx, 2*SSE_INCREMENT | |
__asm loop _loop | |
// the last pixels | |
USClensePixel(edi + edx, esi + edx, ebx + edx, eax + edx, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6, SSE7) | |
__asm add esi, spitch | |
__asm add edi, dpitch | |
__asm add ebx, ppitch | |
__asm add eax, npitch | |
__asm dec height | |
__asm mov ecx, hblocks | |
__asm jnz _loop | |
} | |
#if ISSE > 1 | |
static void unaligned_sclense(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, const BYTE *pp, int ppitch, const BYTE *np, int npitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
__asm mov eax, incpitch | |
__asm mov ebx, pp | |
__asm add dpitch, eax | |
__asm add spitch, eax | |
__asm add ppitch, eax | |
__asm add npitch, eax | |
__asm mov esi, _sp | |
__asm mov edi, dp | |
__asm mov edx, remainder | |
__asm mov eax, np | |
__asm mov ecx, hblocks | |
__asm align 16 | |
__asm _loop: | |
USClensePixel(edi, esi, ebx, eax, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6, SSE7) | |
__asm add eax, 2*SSE_INCREMENT | |
__asm add esi, 2*SSE_INCREMENT | |
__asm add edi, 2*SSE_INCREMENT | |
__asm add ebx, 2*SSE_INCREMENT | |
#if ISSE > 1 | |
__asm dec ecx | |
__asm jnz _loop | |
#else | |
__asm loop _loop | |
#endif | |
// the last pixels | |
USClensePixel(edi + edx, esi + edx, ebx + edx, eax + edx, SSE0, SSE1, SSE2, SSE3, SSE4, SSE5, SSE6, SSE7) | |
__asm add esi, spitch | |
__asm add edi, dpitch | |
__asm add ebx, ppitch | |
__asm add eax, npitch | |
__asm dec height | |
__asm mov ecx, hblocks | |
__asm jnz _loop | |
} | |
static void sclense(BYTE *dp, int dpitch, const BYTE *_sp, int spitch, const BYTE *pp, int ppitch, const BYTE *np, int npitch, int hblocks, int remainder, int incpitch, int height) | |
{ | |
if( (((unsigned)dp & (SSE_INCREMENT - 1)) + ((unsigned)_sp & (SSE_INCREMENT - 1)) + ((unsigned)pp & (SSE_INCREMENT - 1)) + ((unsigned)np & (SSE_INCREMENT - 1)) | |
#ifdef ALIGNPITCH | |
+ (spitch & (SSE_INCREMENT - 1)) + (ppitch & (SSE_INCREMENT - 1)) + (npitch & (SSE_INCREMENT - 1)) | |
#endif | |
) == 0 ) aligned_sclense(dp, dpitch, _sp, spitch, pp, ppitch, np, npitch, hblocks, remainder, incpitch, height); | |
else unaligned_sclense(dp, dpitch, _sp, spitch, pp, ppitch, np, npitch, hblocks, remainder, incpitch, height); | |
} | |
#endif | |
class BackwardClense : public GenericClense | |
{ | |
PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env) | |
{ | |
PVideoFrame sf = child->GetFrame(n, env); | |
if( n < 2 ) return sf; | |
PVideoFrame next1 = child->GetFrame(n - 1, env); | |
PVideoFrame next2 = child->GetFrame(n - 2, env); | |
PVideoFrame df = env->NewVideoFrame(vi, 2*SSE_INCREMENT); | |
int i = planes; | |
do | |
{ | |
sclense(GetWritePtr(df, i), GetPitch(df, i), GetReadPtr(sf, i), GetPitch(sf, i), GetReadPtr(next1, i), GetPitch(next1, i), GetReadPtr(next2, i), GetPitch(next2, i), hblocks[i], remainder[i], incpitch[i], height[i]); | |
} while( --i >= 0 ); | |
SSE_EMMS | |
return df; | |
} | |
public: | |
BackwardClense(PClip clip, bool grey, bool planar, int cache) : GenericClense(clip, grey, planar) | |
{ | |
if( cache >= 0 ) child->SetCacheHints(CACHE_RANGE, cache); | |
} | |
}; | |
class ForwardClense : public BackwardClense | |
{ | |
int lastnr; | |
PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env) | |
{ | |
PVideoFrame sf = child->GetFrame(n, env); | |
if( n >= lastnr ) return sf; | |
PVideoFrame next1 = child->GetFrame(n + 1, env); | |
PVideoFrame next2 = child->GetFrame(n + 2, env); | |
PVideoFrame df = env->NewVideoFrame(vi, 2*SSE_INCREMENT); | |
int i = planes; | |
do | |
{ | |
sclense(GetWritePtr(df, i), GetPitch(df, i), GetReadPtr(sf, i), GetPitch(sf, i), GetReadPtr(next1, i), GetPitch(next1, i), GetReadPtr(next2, i), GetPitch(next2, i), hblocks[i], remainder[i], incpitch[i], height[i]); | |
} while( --i >= 0 ); | |
SSE_EMMS | |
return df; | |
} | |
public: | |
ForwardClense(PClip clip, bool grey, bool planar, int cache) : BackwardClense(clip, grey, planar, cache), lastnr(vi.num_frames - 2) | |
{} | |
}; | |
char clenseargs[] = "c[grey]b[planar]b[cache]i"; | |
AVSValue __cdecl CreateBackwardClense(AVSValue args, void* user_data, IScriptEnvironment* env) | |
{ | |
enum ARGS { CLIP, GREY, PLANAR, CACHE }; | |
return new BackwardClense(args[CLIP].AsClip(), args[GREY].AsBool(false), args[PLANAR].AsBool(false), args[CACHE].AsInt(2)); | |
}; | |
AVSValue __cdecl CreateForwardClense(AVSValue args, void* user_data, IScriptEnvironment* env) | |
{ | |
enum ARGS { CLIP, GREY, PLANAR, CACHE }; | |
return new ForwardClense(args[CLIP].AsClip(), args[GREY].AsBool(false), args[PLANAR].AsBool(false), args[CACHE].AsInt(2)); | |
}; | |
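// Typical Avisynth usage (illustrative):
//   BackwardClense()   # clamps frame n against frames n-1 and n-2
//   ForwardClense()    # clamps frame n against frames n+1 and n+2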
#endif // MODIFYPLUGIN | |
extern "C" __declspec(dllexport) const char* __stdcall AvisynthPluginInit2(IScriptEnvironment* env) | |
{ | |
#ifdef MODIFYPLUGIN | |
#ifdef DEBUG_NAME | |
env->AddFunction("DRepair", "cc[mode]i[modeU]i[modeV]i[planar]b", CreateRemoveGrain, 0); | |
env->AddFunction("DTemporalRepair", "cc[mode]i[smooth]i[grey]b[planar]b", CreateTemporalRepair, 0); | |
#else | |
env->AddFunction("Repair", "cc[mode]i[modeU]i[modeV]i[planar]b", CreateRemoveGrain, 0); | |
env->AddFunction("TemporalRepair", "cc[mode]i[smooth]i[grey]b[planar]b", CreateTemporalRepair, 0); | |
#endif | |
#elif SHARPEN == 1 | |
#ifdef DEBUG_NAME | |
env->AddFunction("DRSharpen", "c[mode]i[modeU]i[modeV]i[strength]i[strengthU]i[strengthV]i[planar]b", CreateRemoveGrain, 0); | |
env->AddFunction("DMotionSharpen", "c[grey]b[reduceflicker]b[planar]b[cache]i", CreateClense, 0); | |
#else | |
env->AddFunction("RSharpen", "c[mode]i[modeU]i[modeV]i[strength]i[strengthU]i[strengthV]i[planar]b", CreateRemoveGrain, 0); | |
env->AddFunction("MotionSharpen", "c[grey]b[reduceflicker]b[planar]b[cache]i", CreateClense, 0); | |
#endif | |
#elif SHARPEN > 1 | |
#ifdef DEBUG_NAME | |
env->AddFunction("DDenoiseSharpen", "c[mode]i[modeU]i[modeV]i[strength]i[strengthU]i[strengthV]i[planar]b", CreateRemoveGrain, 0); | |
env->AddFunction("DClenseSharpen", "c[grey]b[reduceflicker]b[planar]b[cache]i", CreateClense, 0); | |
#else | |
env->AddFunction("DenoiseSharpen", "c[mode]i[modeU]i[modeV]i[strength]i[strengthU]i[strengthV]i[planar]b", CreateRemoveGrain, 0); | |
env->AddFunction("ClenseSharpen", "c[grey]b[reduceflicker]b[planar]b[cache]i", CreateClense, 0); | |
#endif | |
#elif BLUR == 1 | |
#ifdef DEBUG_NAME | |
env->AddFunction("DRBlur", "c[mode]i[modeU]i[modeV]i[planar]b", CreateRemoveGrain, 0); | |
#else | |
env->AddFunction("RBlur", "c[mode]i[modeU]i[modeV]i[planar]b", CreateRemoveGrain, 0); | |
#endif | |
#elif BLUR > 1 | |
#ifdef DEBUG_NAME | |
env->AddFunction("DDenoiseBlur", "c[mode]i[modeU]i[modeV]i[planar]b", CreateRemoveGrain, 0); | |
#else | |
env->AddFunction("DenoiseBlur", "c[mode]i[modeU]i[modeV]i[planar]b", CreateRemoveGrain, 0); | |
#endif | |
#else // MODIFYPLUGIN | |
#ifdef DEBUG_NAME | |
env->AddFunction("DRemoveGrain", "c[mode]i[modeU]i[modeV]i[planar]b", CreateRemoveGrain, 0); | |
env->AddFunction("DClense", "c[grey]b[reduceflicker]b[planar]b[cache]i", CreateClense, 0); | |
env->AddFunction("DMCClense", "ccc[grey]b[planar]b", CreateMCClense, 0); | |
env->AddFunction("DBackwardClense", clenseargs, CreateBackwardClense, 0); | |
env->AddFunction("DForwardClense", clenseargs, CreateForwardClense, 0); | |
#else | |
env->AddFunction("RemoveGrain", "c[mode]i[modeU]i[modeV]i[planar]b", CreateRemoveGrain, 0); | |
env->AddFunction("Clense", "c[grey]b[reduceflicker]b[planar]b[cache]i", CreateClense, 0); | |
env->AddFunction("MCClense", "ccc[grey]b[planar]b", CreateMCClense, 0); | |
env->AddFunction("BackwardClense", clenseargs, CreateBackwardClense, 0); | |
env->AddFunction("ForwardClense", clenseargs, CreateForwardClense, 0); | |
#endif | |
#endif // MODIFYPLUGIN | |
AVSenvironment = env; | |
if( (CPUFLAGS & env->GetCPUFlags()) != CPUFLAGS ) | |
#if ISSE > 1 | |
env->ThrowError("RemoveGrain needs an SSE2 capable cpu!\n"); | |
#else | |
env->ThrowError("RemoveGrain needs an SSE capable cpu!\n"); | |
#endif | |
#if 0 | |
debug_printf(LOGO); | |
#endif | |
return "RemoveGrain: remove grain from film"; | |
} | |