justinruggles · July 30, 2012 13:32
diff --git a/gistfile1.diff b/gistfile1.diff
 From 926c0121c9fd3b7234f175648db87bdb6e3c15e5 Mon Sep 17 00:00:00 2001
 From: Justin Ruggles <[email protected]>
 Date: Fri, 20 Jul 2012 19:53:40 -0400
 Subject: [PATCH] dsputil: x86: use cpuflags for apply_window_int16()

 ---
 libavcodec/x86/dsputil_mmx.c    |   12 +++---
 libavcodec/x86/dsputil_yasm.asm |   79 +++++++++++++++++---------------------
 2 files changed, 41 insertions(+), 50 deletions(-)

 diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
 index d9c8e96..a23e167 100644
 --- a/libavcodec/x86/dsputil_mmx.c
 +++ b/libavcodec/x86/dsputil_mmx.c
 @@ -2490,13 +2490,13 @@ int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                               const int16_t *v3,
                                               int order, int mul);
 
 -void ff_apply_window_int16_mmxext    (int16_t *output, const int16_t *input,
 +void ff_apply_window_int16_mmx2      (int16_t *output, const int16_t *input,
                                       const int16_t *window, unsigned int len);
 -void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
 +void ff_apply_window_int16_ba_mmx2   (int16_t *output, const int16_t *input,
                                       const int16_t *window, unsigned int len);
 void ff_apply_window_int16_sse2      (int16_t *output, const int16_t *input,
                                       const int16_t *window, unsigned int len);
 -void ff_apply_window_int16_sse2_ba   (int16_t *output, const int16_t *input,
 +void ff_apply_window_int16_ba_sse2   (int16_t *output, const int16_t *input,
                                       const int16_t *window, unsigned int len);
 void ff_apply_window_int16_ssse3     (int16_t *output, const int16_t *input,
                                       const int16_t *window, unsigned int len);
 @@ -2729,9 +2729,9 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
     c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
 
     if (avctx->flags & CODEC_FLAG_BITEXACT) {
 -        c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
 +        c->apply_window_int16 = ff_apply_window_int16_ba_mmx2;
     } else {
 -        c->apply_window_int16 = ff_apply_window_int16_mmxext;
 +        c->apply_window_int16 = ff_apply_window_int16_mmx2;
     }
 #endif
 }
 @@ -2913,7 +2913,7 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
         c->vector_clip_int32 = ff_vector_clip_int32_sse2;
     }
     if (avctx->flags & CODEC_FLAG_BITEXACT) {
 -        c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
 +        c->apply_window_int16 = ff_apply_window_int16_ba_sse2;
     } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
         c->apply_window_int16 = ff_apply_window_int16_sse2;
     }
 diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
 index ea07644..291e7ad 100644
 --- a/libavcodec/x86/dsputil_yasm.asm
 +++ b/libavcodec/x86/dsputil_yasm.asm
 @@ -208,24 +208,22 @@ SCALARPRODUCT_LOOP 0
 ;                            const int16_t *window, unsigned int len)
 ;-----------------------------------------------------------------------------
 
 -%macro REVERSE_WORDS_MMXEXT 1-2
 -    pshufw   %1, %1, 0x1B
 -%endmacro
 -
 -%macro REVERSE_WORDS_SSE2 1-2
 +%macro REVERSE_WORDS 1-2
 +%if cpuflag(ssse3) && notcpuflag(atom)
 +    pshufb  %1, %2
 +%elif cpuflag(sse2)
     pshuflw  %1, %1, 0x1B
     pshufhw  %1, %1, 0x1B
     pshufd   %1, %1, 0x4E
 -%endmacro
 -
 -%macro REVERSE_WORDS_SSSE3 2
 -    pshufb  %1, %2
 +%elif cpuflag(mmx2)
 +    pshufw   %1, %1, 0x1B
 +%endif
 %endmacro
 
 ; dst = (dst * src) >> 15
 ; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
 ; in from the pmullw result.
 -%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
 +%macro MUL16FIXED 3 ; dst, src, temp
     mova    %3, %1
     pmulhw  %1, %2
     pmullw  %3, %2
 @@ -234,22 +232,26 @@ SCALARPRODUCT_LOOP 0
     por     %1, %3
 %endmacro
 
 -; dst = ((dst * src) + (1<<14)) >> 15
 -%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
 -    pmulhrsw   %1, %2
 -%endmacro
 -
 -%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
 -cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
 +%macro APPLY_WINDOW_INT16 1-2 ; %1=mmxext/sse2 bit exact version, %2=suffix
 +cglobal apply_window_int16%2, 4,5,6, output, input, window, offset, offset2
     lea     offset2q, [offsetq-mmsize]
 -%if %2
 +%if cpuflag(ssse3) && notcpuflag(atom)
 +    mova          m2, [pb_revwords]
 +%elif %1
     mova          m5, [pd_16384]
 -%elifidn %1, ssse3
 -    mova          m5, [pb_revwords]
 -    ALIGN 16
 %endif
 .loop:
 -%if %2
 +%if cpuflag(ssse3)
 +    ; This version does the 16x16->16 multiplication in-place without expanding
 +    ; to 32-bit. The ssse3 version is bit-identical.
 +    mova          m0, [windowq+offset2q]
 +    mova          m1, [ inputq+offset2q]
 +    pmulhrsw      m1, m0
 +    REVERSE_WORDS m0, m2
 +    pmulhrsw      m0, [ inputq+offsetq ]
 +    mova  [outputq+offset2q], m1
 +    mova  [outputq+offsetq ], m0
 +%elif %1
     ; This version expands 16-bit to 32-bit, multiplies by the window,
     ; adds 16384 for rounding, right shifts 15, then repacks back to words to
     ; save to the output. The window is reversed for the second half.
 @@ -285,16 +287,6 @@ cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
     psrad         m2, 15
     packssdw      m0, m2
     mova  [outputq+offsetq], m0
 -%elif %3
 -    ; This version does the 16x16->16 multiplication in-place without expanding
 -    ; to 32-bit. The ssse3 version is bit-identical.
 -    mova          m0, [windowq+offset2q]
 -    mova          m1, [ inputq+offset2q]
 -    pmulhrsw      m1, m0
 -    REVERSE_WORDS m0, m5
 -    pmulhrsw      m0, [ inputq+offsetq ]
 -    mova  [outputq+offset2q], m1
 -    mova  [outputq+offsetq ], m0
 %else
     ; This version does the 16x16->16 multiplication in-place without expanding
     ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
 @@ -314,21 +306,20 @@ cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
     REP_RET
 %endmacro
 
 -INIT_MMX
 -%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
 -%define MUL16FIXED MUL16FIXED_MMXEXT
 -APPLY_WINDOW_INT16 mmxext,     0, 0
 -APPLY_WINDOW_INT16 mmxext_ba,  1, 0
 -INIT_XMM
 -%define REVERSE_WORDS REVERSE_WORDS_SSE2
 -APPLY_WINDOW_INT16 sse2,       0, 0
 -APPLY_WINDOW_INT16 sse2_ba,    1, 0
 -APPLY_WINDOW_INT16 ssse3_atom, 0, 1
 -%define REVERSE_WORDS REVERSE_WORDS_SSSE3
 -APPLY_WINDOW_INT16 ssse3,      0, 1
 +INIT_MMX mmx2
 +APPLY_WINDOW_INT16  0
 +APPLY_WINDOW_INT16  1, _ba
 +INIT_XMM sse2
 +APPLY_WINDOW_INT16 0
 +APPLY_WINDOW_INT16 1, _ba
 +INIT_XMM ssse3
 +APPLY_WINDOW_INT16 1
 +INIT_XMM ssse3,atom
 +APPLY_WINDOW_INT16 1
 
 
 ; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
 +INIT_MMX
 cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
     movq    mm0, [topq]
     movq    mm2, mm0
 -- 
 1.7.1
	From 926c0121c9fd3b7234f175648db87bdb6e3c15e5 Mon Sep 17 00:00:00 2001
	From: Justin Ruggles <[email protected]>
	Date: Fri, 20 Jul 2012 19:53:40 -0400
	Subject: [PATCH] dsputil: x86: use cpuflags for apply_window_int16()

	---
	libavcodec/x86/dsputil_mmx.c | 12 +++---
	libavcodec/x86/dsputil_yasm.asm | 79 +++++++++++++++++---------------------
	2 files changed, 41 insertions(+), 50 deletions(-)

	diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
	index d9c8e96..a23e167 100644
	--- a/libavcodec/x86/dsputil_mmx.c
	+++ b/libavcodec/x86/dsputil_mmx.c
	@@ -2490,13 +2490,13 @@ int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
	const int16_t *v3,
	int order, int mul);

	-void ff_apply_window_int16_mmxext (int16_t *output, const int16_t *input,
	+void ff_apply_window_int16_mmx2 (int16_t *output, const int16_t *input,
	const int16_t *window, unsigned int len);
	-void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
	+void ff_apply_window_int16_ba_mmx2 (int16_t *output, const int16_t *input,
	const int16_t *window, unsigned int len);
	void ff_apply_window_int16_sse2 (int16_t *output, const int16_t *input,
	const int16_t *window, unsigned int len);
	-void ff_apply_window_int16_sse2_ba (int16_t *output, const int16_t *input,
	+void ff_apply_window_int16_ba_sse2 (int16_t *output, const int16_t *input,
	const int16_t *window, unsigned int len);
	void ff_apply_window_int16_ssse3 (int16_t *output, const int16_t *input,
	const int16_t *window, unsigned int len);
	@@ -2729,9 +2729,9 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
	c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;

	if (avctx->flags & CODEC_FLAG_BITEXACT) {
	- c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
	+ c->apply_window_int16 = ff_apply_window_int16_ba_mmx2;
	} else {
	- c->apply_window_int16 = ff_apply_window_int16_mmxext;
	+ c->apply_window_int16 = ff_apply_window_int16_mmx2;
	}
	#endif
	}
	@@ -2913,7 +2913,7 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
	c->vector_clip_int32 = ff_vector_clip_int32_sse2;
	}
	if (avctx->flags & CODEC_FLAG_BITEXACT) {
	- c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
	+ c->apply_window_int16 = ff_apply_window_int16_ba_sse2;
	} else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
	c->apply_window_int16 = ff_apply_window_int16_sse2;
	}
	diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm
	index ea07644..291e7ad 100644
	--- a/libavcodec/x86/dsputil_yasm.asm
	+++ b/libavcodec/x86/dsputil_yasm.asm
	@@ -208,24 +208,22 @@ SCALARPRODUCT_LOOP 0
	; const int16_t *window, unsigned int len)
	;-----------------------------------------------------------------------------

	-%macro REVERSE_WORDS_MMXEXT 1-2
	- pshufw %1, %1, 0x1B
	-%endmacro
	-
	-%macro REVERSE_WORDS_SSE2 1-2
	+%macro REVERSE_WORDS 1-2
	+%if cpuflag(ssse3) && notcpuflag(atom)
	+ pshufb %1, %2
	+%elif cpuflag(sse2)
	pshuflw %1, %1, 0x1B
	pshufhw %1, %1, 0x1B
	pshufd %1, %1, 0x4E
	-%endmacro
	-
	-%macro REVERSE_WORDS_SSSE3 2
	- pshufb %1, %2
	+%elif cpuflag(mmx2)
	+ pshufw %1, %1, 0x1B
	+%endif
	%endmacro

	; dst = (dst * src) >> 15
	; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
	; in from the pmullw result.
	-%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
	+%macro MUL16FIXED 3 ; dst, src, temp
	mova %3, %1
	pmulhw %1, %2
	pmullw %3, %2
	@@ -234,22 +232,26 @@ SCALARPRODUCT_LOOP 0
	por %1, %3
	%endmacro

	-; dst = ((dst * src) + (1<<14)) >> 15
	-%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
	- pmulhrsw %1, %2
	-%endmacro
	-
	-%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
	-cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
	+%macro APPLY_WINDOW_INT16 1-2 ; %1=mmxext/sse2 bit exact version, %2=suffix
	+cglobal apply_window_int16%2, 4,5,6, output, input, window, offset, offset2
	lea offset2q, [offsetq-mmsize]
	-%if %2
	+%if cpuflag(ssse3) && notcpuflag(atom)
	+ mova m2, [pb_revwords]
	+%elif %1
	mova m5, [pd_16384]
	-%elifidn %1, ssse3
	- mova m5, [pb_revwords]
	- ALIGN 16
	%endif
	.loop:
	-%if %2
	+%if cpuflag(ssse3)
	+ ; This version does the 16x16->16 multiplication in-place without expanding
	+ ; to 32-bit. The ssse3 version is bit-identical.
	+ mova m0, [windowq+offset2q]
	+ mova m1, [ inputq+offset2q]
	+ pmulhrsw m1, m0
	+ REVERSE_WORDS m0, m2
	+ pmulhrsw m0, [ inputq+offsetq ]
	+ mova [outputq+offset2q], m1
	+ mova [outputq+offsetq ], m0
	+%elif %1
	; This version expands 16-bit to 32-bit, multiplies by the window,
	; adds 16384 for rounding, right shifts 15, then repacks back to words to
	; save to the output. The window is reversed for the second half.
	@@ -285,16 +287,6 @@ cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
	psrad m2, 15
	packssdw m0, m2
	mova [outputq+offsetq], m0
	-%elif %3
	- ; This version does the 16x16->16 multiplication in-place without expanding
	- ; to 32-bit. The ssse3 version is bit-identical.
	- mova m0, [windowq+offset2q]
	- mova m1, [ inputq+offset2q]
	- pmulhrsw m1, m0
	- REVERSE_WORDS m0, m5
	- pmulhrsw m0, [ inputq+offsetq ]
	- mova [outputq+offset2q], m1
	- mova [outputq+offsetq ], m0
	%else
	; This version does the 16x16->16 multiplication in-place without expanding
	; to 32-bit. The mmxext and sse2 versions do not use rounding, and
	@@ -314,21 +306,20 @@ cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
	REP_RET
	%endmacro

	-INIT_MMX
	-%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
	-%define MUL16FIXED MUL16FIXED_MMXEXT
	-APPLY_WINDOW_INT16 mmxext, 0, 0
	-APPLY_WINDOW_INT16 mmxext_ba, 1, 0
	-INIT_XMM
	-%define REVERSE_WORDS REVERSE_WORDS_SSE2
	-APPLY_WINDOW_INT16 sse2, 0, 0
	-APPLY_WINDOW_INT16 sse2_ba, 1, 0
	-APPLY_WINDOW_INT16 ssse3_atom, 0, 1
	-%define REVERSE_WORDS REVERSE_WORDS_SSSE3
	-APPLY_WINDOW_INT16 ssse3, 0, 1
	+INIT_MMX mmx2
	+APPLY_WINDOW_INT16 0
	+APPLY_WINDOW_INT16 1, _ba
	+INIT_XMM sse2
	+APPLY_WINDOW_INT16 0
	+APPLY_WINDOW_INT16 1, _ba
	+INIT_XMM ssse3
	+APPLY_WINDOW_INT16 1
	+INIT_XMM ssse3,atom
	+APPLY_WINDOW_INT16 1


	; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
	+INIT_MMX
	cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
	movq mm0, [topq]
	movq mm2, mm0
	--
	1.7.1