Created
July 30, 2012 13:32
-
-
Save justinruggles/3206921 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From 926c0121c9fd3b7234f175648db87bdb6e3c15e5 Mon Sep 17 00:00:00 2001 | |
From: Justin Ruggles <[email protected]> | |
Date: Fri, 20 Jul 2012 19:53:40 -0400 | |
Subject: [PATCH] dsputil: x86: use cpuflags for apply_window_int16() | |
--- | |
libavcodec/x86/dsputil_mmx.c | 12 +++--- | |
libavcodec/x86/dsputil_yasm.asm | 79 +++++++++++++++++--------------------- | |
2 files changed, 41 insertions(+), 50 deletions(-) | |
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c | |
index d9c8e96..a23e167 100644 | |
--- a/libavcodec/x86/dsputil_mmx.c | |
+++ b/libavcodec/x86/dsputil_mmx.c | |
@@ -2490,13 +2490,13 @@ int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, | |
const int16_t *v3, | |
int order, int mul); | |
-void ff_apply_window_int16_mmxext (int16_t *output, const int16_t *input, | |
+void ff_apply_window_int16_mmx2 (int16_t *output, const int16_t *input, | |
const int16_t *window, unsigned int len); | |
-void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input, | |
+void ff_apply_window_int16_ba_mmx2 (int16_t *output, const int16_t *input, | |
const int16_t *window, unsigned int len); | |
void ff_apply_window_int16_sse2 (int16_t *output, const int16_t *input, | |
const int16_t *window, unsigned int len); | |
-void ff_apply_window_int16_sse2_ba (int16_t *output, const int16_t *input, | |
+void ff_apply_window_int16_ba_sse2 (int16_t *output, const int16_t *input, | |
const int16_t *window, unsigned int len); | |
void ff_apply_window_int16_ssse3 (int16_t *output, const int16_t *input, | |
const int16_t *window, unsigned int len); | |
@@ -2729,9 +2729,9 @@ static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx, | |
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2; | |
if (avctx->flags & CODEC_FLAG_BITEXACT) { | |
- c->apply_window_int16 = ff_apply_window_int16_mmxext_ba; | |
+ c->apply_window_int16 = ff_apply_window_int16_ba_mmx2; | |
} else { | |
- c->apply_window_int16 = ff_apply_window_int16_mmxext; | |
+ c->apply_window_int16 = ff_apply_window_int16_mmx2; | |
} | |
#endif | |
} | |
@@ -2913,7 +2913,7 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, | |
c->vector_clip_int32 = ff_vector_clip_int32_sse2; | |
} | |
if (avctx->flags & CODEC_FLAG_BITEXACT) { | |
- c->apply_window_int16 = ff_apply_window_int16_sse2_ba; | |
+ c->apply_window_int16 = ff_apply_window_int16_ba_sse2; | |
} else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) { | |
c->apply_window_int16 = ff_apply_window_int16_sse2; | |
} | |
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm | |
index ea07644..291e7ad 100644 | |
--- a/libavcodec/x86/dsputil_yasm.asm | |
+++ b/libavcodec/x86/dsputil_yasm.asm | |
@@ -208,24 +208,22 @@ SCALARPRODUCT_LOOP 0 | |
; const int16_t *window, unsigned int len) | |
;----------------------------------------------------------------------------- | |
-%macro REVERSE_WORDS_MMXEXT 1-2 | |
- pshufw %1, %1, 0x1B | |
-%endmacro | |
- | |
-%macro REVERSE_WORDS_SSE2 1-2 | |
+%macro REVERSE_WORDS 1-2 | |
+%if cpuflag(ssse3) && notcpuflag(atom) | |
+ pshufb %1, %2 | |
+%elif cpuflag(sse2) | |
pshuflw %1, %1, 0x1B | |
pshufhw %1, %1, 0x1B | |
pshufd %1, %1, 0x4E | |
-%endmacro | |
- | |
-%macro REVERSE_WORDS_SSSE3 2 | |
- pshufb %1, %2 | |
+%elif cpuflag(mmx2) | |
+ pshufw %1, %1, 0x1B | |
+%endif | |
%endmacro | |
; dst = (dst * src) >> 15 | |
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back | |
; in from the pmullw result. | |
-%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp | |
+%macro MUL16FIXED 3 ; dst, src, temp | |
mova %3, %1 | |
pmulhw %1, %2 | |
pmullw %3, %2 | |
@@ -234,22 +232,26 @@ SCALARPRODUCT_LOOP 0 | |
por %1, %3 | |
%endmacro | |
-; dst = ((dst * src) + (1<<14)) >> 15 | |
-%macro MUL16FIXED_SSSE3 3 ; dst, src, unused | |
- pmulhrsw %1, %2 | |
-%endmacro | |
- | |
-%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3 | |
-cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2 | |
+%macro APPLY_WINDOW_INT16 1-2 ; %1=nonzero for the mmxext/sse2 bit-exact version, %2=optional function name suffix | |
+cglobal apply_window_int16%2, 4,5,6, output, input, window, offset, offset2 | |
lea offset2q, [offsetq-mmsize] | |
-%if %2 | |
+%if cpuflag(ssse3) && notcpuflag(atom) | |
+ mova m2, [pb_revwords] | |
+%elif %1 | |
mova m5, [pd_16384] | |
-%elifidn %1, ssse3 | |
- mova m5, [pb_revwords] | |
- ALIGN 16 | |
%endif | |
.loop: | |
-%if %2 | |
+%if cpuflag(ssse3) | |
+ ; This version does the 16x16->16 multiplication in-place without expanding | |
+ ; to 32-bit. pmulhrsw rounds, so the output is bit-identical to the bit-exact version. | |
+ mova m0, [windowq+offset2q] | |
+ mova m1, [ inputq+offset2q] | |
+ pmulhrsw m1, m0 | |
+ REVERSE_WORDS m0, m2 | |
+ pmulhrsw m0, [ inputq+offsetq ] | |
+ mova [outputq+offset2q], m1 | |
+ mova [outputq+offsetq ], m0 | |
+%elif %1 | |
; This version expands 16-bit to 32-bit, multiplies by the window, | |
; adds 16384 for rounding, right shifts 15, then repacks back to words to | |
; save to the output. The window is reversed for the second half. | |
@@ -285,16 +287,6 @@ cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2 | |
psrad m2, 15 | |
packssdw m0, m2 | |
mova [outputq+offsetq], m0 | |
-%elif %3 | |
- ; This version does the 16x16->16 multiplication in-place without expanding | |
- ; to 32-bit. The ssse3 version is bit-identical. | |
- mova m0, [windowq+offset2q] | |
- mova m1, [ inputq+offset2q] | |
- pmulhrsw m1, m0 | |
- REVERSE_WORDS m0, m5 | |
- pmulhrsw m0, [ inputq+offsetq ] | |
- mova [outputq+offset2q], m1 | |
- mova [outputq+offsetq ], m0 | |
%else | |
; This version does the 16x16->16 multiplication in-place without expanding | |
; to 32-bit. The mmxext and sse2 versions do not use rounding, and | |
@@ -314,21 +306,20 @@ cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2 | |
REP_RET | |
%endmacro | |
-INIT_MMX | |
-%define REVERSE_WORDS REVERSE_WORDS_MMXEXT | |
-%define MUL16FIXED MUL16FIXED_MMXEXT | |
-APPLY_WINDOW_INT16 mmxext, 0, 0 | |
-APPLY_WINDOW_INT16 mmxext_ba, 1, 0 | |
-INIT_XMM | |
-%define REVERSE_WORDS REVERSE_WORDS_SSE2 | |
-APPLY_WINDOW_INT16 sse2, 0, 0 | |
-APPLY_WINDOW_INT16 sse2_ba, 1, 0 | |
-APPLY_WINDOW_INT16 ssse3_atom, 0, 1 | |
-%define REVERSE_WORDS REVERSE_WORDS_SSSE3 | |
-APPLY_WINDOW_INT16 ssse3, 0, 1 | |
+INIT_MMX mmx2 | |
+APPLY_WINDOW_INT16 0 | |
+APPLY_WINDOW_INT16 1, _ba | |
+INIT_XMM sse2 | |
+APPLY_WINDOW_INT16 0 | |
+APPLY_WINDOW_INT16 1, _ba | |
+INIT_XMM ssse3 | |
+APPLY_WINDOW_INT16 1 | |
+INIT_XMM ssse3,atom | |
+APPLY_WINDOW_INT16 1 | |
; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) | |
+INIT_MMX | |
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top | |
movq mm0, [topq] | |
movq mm2, mm0 | |
-- | |
1.7.1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment