[XviD-devel] [PATCH] calc_cbp_sse2 optimization
Mat Hostetter
mat at curl.com
Sun Apr 18 16:48:54 CEST 2004
This change (against 1.0.0-rc4) speeds up calc_cbp_sse2 from 131
cycles to 112 cycles on the Pentium 4 (for the in-cache case).
This change uses pcmpgtb/pmovmskb to extract a zero/nonzero mask,
rather than the longer sequence used previously, and eliminates all
conditional branches. I also changed a movdqu to movdqa; if there's a
reason the load might not be 16-byte aligned (some bogus platform that
can't align static arrays mod 16?), please let me know.
I am new to XviD, so I don't know what your standard practice is for
correctness validation and benchmarking. I therefore wrote my own test
and benchmark for this procedure. My test runs the 258048 coeff[] arrays
I consider "interesting" through both calc_cbp_sse2 and calc_cbp_plain.
The two always return the same result, so I am pretty confident this
patch is correct.
I chose calc_cbp_sse2 at random just to get a feel for XviD's sources.
If someone can point me at some more important code you'd like
optimized, and tell me how you benchmark it, I may be able to
contribute (I'm a professional compiler programmer). I'm sure
you've done lots of optimizations already but another pair of eyes
never hurts. :-)
-Mat
--- src/bitstream/x86_asm/cbp_sse2.asm~ 2004-04-04 16:35:53.000000000 -0400
+++ src/bitstream/x86_asm/cbp_sse2.asm 2004-04-18 10:37:02.000000000 -0400
@@ -52,13 +52,17 @@
por xmm1, [edx+(%1)*128+112]
por xmm0, xmm1 ; xmm0 = xmm1 = 128 bits worth of info
- psadbw xmm0, xmm6 ; contains 2 dwords with sums
- movhlps xmm1, xmm0 ; move high dword from xmm0 to low xmm1
- por xmm0, xmm1 ; combine
- movd ecx, xmm0 ; if ecx set, values were found
- test ecx, ecx
+
+ pcmpeqb xmm0, xmm6 ; byte == 0 -> 0xFF (xmm6 is zero); pcmpgtb is a signed compare and would miss bytes >= 0x80
%endmacro
+%macro LOOP_SSE2_2 1
+ LOOP_SSE2 %1
+ pmovmskb ecx, xmm0 ; ecx = 0xFFFF iff all 16 bytes of the OR were zero
+ cmp ecx, 0xFFFF ; CF = 1 iff ecx < 0xFFFF, i.e. some byte was nonzero
+ adc eax, eax ; cbp = (cbp << 1) | CF
+%endmacro
+
;=============================================================================
; Data (Read Only)
;=============================================================================
@@ -87,45 +91,18 @@
cglobal calc_cbp_sse2
calc_cbp_sse2:
mov edx, [esp+4] ; coeff[]
- xor eax, eax ; cbp = 0
-
- movdqu xmm7, [ignore_dc] ; mask to ignore dc value
+ movdqa xmm7, [ignore_dc] ; mask to ignore dc value (movdqa faults unless ignore_dc is 16-byte aligned)
pxor xmm6, xmm6 ; zero
LOOP_SSE2 0
- test ecx, ecx
- jz .blk2
- or eax, (1<<5)
-
-.blk2
- LOOP_SSE2 1
- test ecx, ecx
- jz .blk3
- or eax, (1<<4)
-
-.blk3
- LOOP_SSE2 2
- test ecx, ecx
- jz .blk4
- or eax, (1<<3)
-
-.blk4
- LOOP_SSE2 3
- test ecx, ecx
- jz .blk5
- or eax, (1<<2)
-
-.blk5
- LOOP_SSE2 4
- test ecx, ecx
- jz .blk6
- or eax, (1<<1)
-
-.blk6
- LOOP_SSE2 5
- test ecx, ecx
- jz .finished
- or eax, (1<<0)
+ pmovmskb eax, xmm0 ; eax = 0xFFFF iff block 0 is all zero; also initialises eax
+ sub eax, 0xFFFF ; 0 when all zero, negative when some byte was nonzero
+ shr eax, 31 ; keep the sign bit: eax = 1 iff block 0 has a nonzero coefficient

+ LOOP_SSE2_2 1
+ LOOP_SSE2_2 2
+ LOOP_SSE2_2 3
+ LOOP_SSE2_2 4
+ LOOP_SSE2_2 5
-.finished
- ret
+ ret
More information about the XviD-devel
mailing list