[XviD-devel] [PATCH] calc_cbp_sse2 optimization
Mat Hostetter
mat at curl.com
Mon Apr 19 23:37:48 CEST 2004
>>>>> "mat" == Mat Hostetter <mat at curl.com> writes:
>>>>> "syskin" == Radek Czyz <syskin at ihug.com.au> writes:
syskin> Mat Hostetter wrote:
>>> This change (against 1.0.0-rc4) speeds up calc_cbp_sse2 from 131
>>> cycles to 112 cycles on the Pentium 4 (for the in-cache case).
syskin> The most unbelievable thing appears to have happened: it's
syskin> b0rked...
mat> I'll figure out a fix.
OK, I fixed it, and this time I tested it even more thoroughly.
Hopefully my credibility's not shot too badly. :-)
I've appended a diff against 1.0.0-rc4 that incorporates all
improvements and fixes so far.
This speeds up calc_cbp_sse2 from 131 cycles to 102. I know cbp
performance doesn't matter much, but hey, I'm just getting my feet
wet, and the source file is now 30% smaller too. :-)
-Mat
--- src/bitstream/x86_asm/cbp_sse2.asm~ 2004-04-04 16:35:53.000000000 -0400
+++ src/bitstream/x86_asm/cbp_sse2.asm 2004-04-19 17:21:35.000000000 -0400
@@ -41,7 +41,7 @@
%macro LOOP_SSE2 1
movdqa xmm0, [edx+(%1)*128]
- pand xmm0, xmm7
+ pshuflw xmm0, xmm0, 11100101b ; overwrite ignored DC coeff with an AC coeff
movdqa xmm1, [edx+(%1)*128+16]
por xmm0, [edx+(%1)*128+32]
@@ -52,27 +52,17 @@
por xmm1, [edx+(%1)*128+112]
por xmm0, xmm1 ; xmm0 = xmm1 = 128 bits worth of info
- psadbw xmm0, xmm6 ; contains 2 dwords with sums
- movhlps xmm1, xmm0 ; move high dword from xmm0 to low xmm1
- por xmm0, xmm1 ; combine
- movd ecx, xmm0 ; if ecx set, values were found
- test ecx, ecx
-%endmacro
-
-;=============================================================================
-; Data (Read Only)
-;=============================================================================
-
-%ifdef FORMAT_COFF
-SECTION .rodata data
-%else
-SECTION .rodata data align=16
-%endif
-ALIGN 16
-ignore_dc:
- dw 0, -1, -1, -1, -1, -1, -1, -1
+ pcmpeqb xmm0, xmm7 ; each 0 byte becomes 0xFF, else 0.
+%endmacro
%macro LOOP_SSE2_2 1
+ LOOP_SSE2 %1
+ pmovmskb ecx, xmm0 ; one mask bit per byte of xmm0: ecx = 0xFFFF iff input all zeros, else 0 <= ecx < 0xFFFF
+ cmp ecx, 0xFFFF ; unsigned borrow: carry set iff ecx < 0xFFFF, i.e. iff input not all zeros
+ rcl eax, 1 ; eax = (eax << 1) | carry, appending this block's bit. faster than "adc eax, eax"
+%endmacro
+
;=============================================================================
; Code
;=============================================================================
@@ -87,45 +77,17 @@
cglobal calc_cbp_sse2
calc_cbp_sse2:
mov edx, [esp+4] ; coeff[]
- xor eax, eax ; cbp = 0
-
- movdqu xmm7, [ignore_dc] ; mask to ignore dc value
- pxor xmm6, xmm6 ; zero
-
+ pxor xmm7, xmm7 ; xmm7 = 0: comparand for the pcmpeqb at the end of LOOP_SSE2
+
LOOP_SSE2 0
- test ecx, ecx
- jz .blk2
- or eax, (1<<5)
-
-.blk2
- LOOP_SSE2 1
- test ecx, ecx
- jz .blk3
- or eax, (1<<4)
-
-.blk3
- LOOP_SSE2 2
- test ecx, ecx
- jz .blk4
- or eax, (1<<3)
-
-.blk4
- LOOP_SSE2 3
- test ecx, ecx
- jz .blk5
- or eax, (1<<2)
-
-.blk5
- LOOP_SSE2 4
- test ecx, ecx
- jz .blk6
- or eax, (1<<1)
-
-.blk6
- LOOP_SSE2 5
- test ecx, ecx
- jz .finished
- or eax, (1<<0)
+ pmovmskb eax, xmm0 ; block 0: 0xFFFF iff input all zeros, else 0 <= eax < 0xFFFF
+ sub eax, 0xFFFF ; zero iff input all zeros, else negative (sign bit set)
+ shr eax, 31 ; sign bit -> bit 0: eax = 1 iff block 0 not all zeros; the five rcl's below move it up to bit 5
+
+ LOOP_SSE2_2 1
+ LOOP_SSE2_2 2
+ LOOP_SSE2_2 3
+ LOOP_SSE2_2 4
+ LOOP_SSE2_2 5
-.finished
- ret
+ ret
More information about the XviD-devel
mailing list