Created
July 21, 2012 21:05
-
-
Save justinruggles/3157187 to your computer and use it in GitHub Desktop.
CLIPD
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From ac7c4b9b628f14f9bd8f457a9d15b0fee31e44bb Mon Sep 17 00:00:00 2001 | |
From: Justin Ruggles <[email protected]> | |
Date: Fri, 20 Jul 2012 15:24:40 -0400 | |
Subject: [PATCH 1/4] dsputil: x86: convert PMINSD, PMAXSD, and CLIPD macros to use cpuflags | |
--- | |
libavcodec/x86/dsputil_mmx.c | 6 ++-- | |
libavcodec/x86/dsputil_yasm.asm | 67 +++++++++++++++++++-------------------- | |
libavutil/x86/x86util.asm | 34 ++++++++++++------- | |
3 files changed, 57 insertions(+), 50 deletions(-) | |
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c | |
index 5eb4a24..d9c8e96 100644 | |
--- a/libavcodec/x86/dsputil_mmx.c | |
+++ b/libavcodec/x86/dsputil_mmx.c | |
@@ -2530,8 +2530,8 @@ void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src, | |
int32_t min, int32_t max, unsigned int len); | |
void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src, | |
int32_t min, int32_t max, unsigned int len); | |
-void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, | |
- int32_t min, int32_t max, unsigned int len); | |
+void ff_vector_clip_int32_sse2_atom(int32_t *dst, const int32_t *src, | |
+ int32_t min, int32_t max, unsigned int len); | |
void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src, | |
int32_t min, int32_t max, unsigned int len); | |
@@ -2908,7 +2908,7 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, | |
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2; | |
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; | |
if (mm_flags & AV_CPU_FLAG_ATOM) { | |
- c->vector_clip_int32 = ff_vector_clip_int32_int_sse2; | |
+ c->vector_clip_int32 = ff_vector_clip_int32_sse2_atom; | |
} else { | |
c->vector_clip_int32 = ff_vector_clip_int32_sse2; | |
} | |
diff --git a/libavcodec/x86/dsputil_yasm.asm b/libavcodec/x86/dsputil_yasm.asm | |
index 313e774..6fa2ef2 100644 | |
--- a/libavcodec/x86/dsputil_yasm.asm | |
+++ b/libavcodec/x86/dsputil_yasm.asm | |
@@ -1054,50 +1054,50 @@ emu_edge mmx | |
; int32_t max, unsigned int len) | |
;----------------------------------------------------------------------------- | |
-; %1 = number of xmm registers used | |
-; %2 = number of inline load/process/store loops per asm loop | |
-; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop | |
-; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2) | |
-; %5 = suffix | |
-%macro VECTOR_CLIP_INT32 4-5 | |
-cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len | |
-%if %4 | |
- cvtsi2ss m4, minm | |
- cvtsi2ss m5, maxm | |
+; %1 = number of inline load/process/store loops per asm loop | |
+; %2 = process 4*mmsize (%2=0) or 8*mmsize (%2=1) bytes per loop | |
+ | |
+%macro VECTOR_CLIP_INT32 2 | |
+cglobal vector_clip_int32, 5,5,11, dst, src, min, max, len | |
+%if notcpuflag(sse4) && cpuflag(sse2) && notcpuflag(atom) | |
+ cvtsi2ss m4, minm | |
+ cvtsi2ss m5, maxm | |
+ %assign is_float 1 | |
%else | |
movd m4, minm | |
movd m5, maxm | |
+ %assign is_float 0 | |
%endif | |
SPLATD m4 | |
SPLATD m5 | |
.loop: | |
%assign %%i 1 | |
-%rep %2 | |
+%rep %1 | |
mova m0, [srcq+mmsize*0*%%i] | |
mova m1, [srcq+mmsize*1*%%i] | |
mova m2, [srcq+mmsize*2*%%i] | |
mova m3, [srcq+mmsize*3*%%i] | |
-%if %3 | |
+%if %2 | |
mova m7, [srcq+mmsize*4*%%i] | |
mova m8, [srcq+mmsize*5*%%i] | |
mova m9, [srcq+mmsize*6*%%i] | |
mova m10, [srcq+mmsize*7*%%i] | |
%endif | |
- CLIPD m0, m4, m5, m6 | |
- CLIPD m1, m4, m5, m6 | |
- CLIPD m2, m4, m5, m6 | |
- CLIPD m3, m4, m5, m6 | |
-%if %3 | |
- CLIPD m7, m4, m5, m6 | |
- CLIPD m8, m4, m5, m6 | |
- CLIPD m9, m4, m5, m6 | |
- CLIPD m10, m4, m5, m6 | |
+ CLIPD m0, m4, m5, is_float, m6 | |
+ CLIPD m1, m4, m5, is_float, m6 | |
+ CLIPD m2, m4, m5, is_float, m6 | |
+ CLIPD m3, m4, m5, is_float, m6 | |
+%if %2 | |
+ CLIPD m7, m4, m5, is_float, m6 | |
+ CLIPD m8, m4, m5, is_float, m6 | |
+ CLIPD m9, m4, m5, is_float, m6 | |
+ CLIPD m10, m4, m5, is_float, m6 | |
%endif | |
mova [dstq+mmsize*0*%%i], m0 | |
mova [dstq+mmsize*1*%%i], m1 | |
mova [dstq+mmsize*2*%%i], m2 | |
mova [dstq+mmsize*3*%%i], m3 | |
-%if %3 | |
+%if %2 | |
mova [dstq+mmsize*4*%%i], m7 | |
mova [dstq+mmsize*5*%%i], m8 | |
mova [dstq+mmsize*6*%%i], m9 | |
@@ -1105,28 +1105,27 @@ cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len | |
%endif | |
%assign %%i %%i+1 | |
%endrep | |
- add srcq, mmsize*4*(%2+%3) | |
- add dstq, mmsize*4*(%2+%3) | |
- sub lend, mmsize*(%2+%3) | |
+ add srcq, mmsize*4*(%1+%2) | |
+ add dstq, mmsize*4*(%1+%2) | |
+ sub lend, mmsize*(%1+%2) | |
jg .loop | |
REP_RET | |
+ %undef CLIPD | |
%endmacro | |
INIT_MMX mmx | |
%define SPLATD SPLATD_MMX | |
-%define CLIPD CLIPD_MMX | |
-VECTOR_CLIP_INT32 0, 1, 0, 0 | |
-INIT_XMM sse2 | |
+VECTOR_CLIP_INT32 1, 0 | |
+INIT_XMM sse2,atom | |
%define SPLATD SPLATD_SSE2 | |
-VECTOR_CLIP_INT32 6, 1, 0, 0, _int | |
-%define CLIPD CLIPD_SSE2 | |
-VECTOR_CLIP_INT32 6, 2, 0, 1 | |
+VECTOR_CLIP_INT32 1, 0 | |
+INIT_XMM sse2 | |
+VECTOR_CLIP_INT32 2, 0 | |
INIT_XMM sse4 | |
-%define CLIPD CLIPD_SSE41 | |
%ifdef m8 | |
-VECTOR_CLIP_INT32 11, 1, 1, 0 | |
+VECTOR_CLIP_INT32 1, 1 | |
%else | |
-VECTOR_CLIP_INT32 6, 1, 0, 0 | |
+VECTOR_CLIP_INT32 1, 0 | |
%endif | |
;----------------------------------------------------------------------------- | |
diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm | |
index 941ec76..447bde4 100644 | |
--- a/libavutil/x86/x86util.asm | |
+++ b/libavutil/x86/x86util.asm | |
@@ -584,37 +584,45 @@ | |
pminsw %1, %3 | |
%endmacro | |
-%macro PMINSD_MMX 3 ; dst, src, tmp | |
+%macro PMINSD 2-3 ; dst, src, tmp | |
+%if cpuflag(sse4) && mmsize >= 16 | |
+ pminsd %1, %2 | |
+%else | |
mova %3, %2 | |
pcmpgtd %3, %1 | |
pxor %1, %2 | |
pand %1, %3 | |
pxor %1, %2 | |
+%endif | |
%endmacro | |
-%macro PMAXSD_MMX 3 ; dst, src, tmp | |
+%macro PMAXSD 2-3 ; dst, src, tmp | |
+%if cpuflag(sse4) && mmsize >= 16 | |
+ pmaxsd %1, %2 | |
+%else | |
mova %3, %1 | |
pcmpgtd %3, %2 | |
pand %1, %3 | |
pandn %3, %2 | |
por %1, %3 | |
+%endif | |
%endmacro | |
-%macro CLIPD_MMX 3-4 ; src/dst, min, max, tmp | |
- PMINSD_MMX %1, %3, %4 | |
- PMAXSD_MMX %1, %2, %4 | |
-%endmacro | |
- | |
-%macro CLIPD_SSE2 3-4 ; src/dst, min (float), max (float), unused | |
+; %1 = src/dst | |
+; %2 = min | |
+; %3 = max | |
+; %4 = min/max format: 0=dwords, 1=floats (requires SSE2) | |
+; %5 = tmp (only used by the non-SSE4 PMINSD/PMAXSD fallback when %4 == 0) | |
+%macro CLIPD 3-5 0, 0 | |
+%if %4 == 1 | |
cvtdq2ps %1, %1 | |
minps %1, %3 | |
maxps %1, %2 | |
cvtps2dq %1, %1 | |
-%endmacro | |
- | |
-%macro CLIPD_SSE41 3-4 ; src/dst, min, max, unused | |
- pminsd %1, %3 | |
- pmaxsd %1, %2 | |
+%else | |
+ PMINSD %1, %3, %5 | |
+ PMAXSD %1, %2, %5 | |
+%endif | |
%endmacro | |
%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32 | |
-- | |
1.7.1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment