[XviD-devel] [PATCH] calc_cbp_sse2 optimization

Mon Apr 19 23:37:48 CEST 2004

>>>>> "mat" == Mat Hostetter <mat at curl.com> writes:

>>>>> "syskin" == Radek Czyz <syskin at ihug.com.au> writes:
 syskin> Mat Hostetter wrote:
 >>> This change (against 1.0.0-rc4) speeds up calc_cbp_sse2 from 131
 >>> cycles to 112 cycles on the Pentium 4 (for the in-cache case).

 syskin> The most unbelievable thing appears to have happened: it's
 syskin> b0rked...

 mat> I'll figure out a fix.

OK, I fixed it.  I tested this even more thoroughly.
Hopefully my credibility's not shot too bad.  :-)

I've appended a diff against 1.0.0-rc4 that incorporates all
improvements and fixes so far.

This speeds up calc_cbp_sse2 from 131 cycles to 102.  I know cbp
performance doesn't matter much, but hey, I'm just getting my feet
wet, and the source file is now 30% smaller too.  :-)

-Mat

--- src/bitstream/x86_asm/cbp_sse2.asm~	2004-04-04 16:35:53.000000000 -0400
+++ src/bitstream/x86_asm/cbp_sse2.asm	2004-04-19 17:21:35.000000000 -0400
@@ -41,7 +41,7 @@
 
 %macro LOOP_SSE2 1
   movdqa xmm0, [edx+(%1)*128]
-  pand xmm0, xmm7
+  pshuflw xmm0, xmm0, 11100101b   ; overwrite ignored DC coeff with an AC coeff
   movdqa xmm1, [edx+(%1)*128+16]
 
   por xmm0, [edx+(%1)*128+32]
@@ -52,27 +52,17 @@
   por xmm1, [edx+(%1)*128+112]
 
   por xmm0, xmm1        ; xmm0 = xmm1 = 128 bits worth of info
-  psadbw xmm0, xmm6     ; contains 2 dwords with sums
-  movhlps xmm1, xmm0    ; move high dword from xmm0 to low xmm1
-  por xmm0, xmm1        ; combine
-  movd ecx, xmm0        ; if ecx set, values were found
-  test ecx, ecx
-%endmacro
-
-;=============================================================================
-; Data (Read Only)
-;=============================================================================
-
-%ifdef FORMAT_COFF
-SECTION .rodata data
-%else
-SECTION .rodata data align=16
-%endif
 
-ALIGN 16
-ignore_dc:
-  dw 0, -1, -1, -1, -1, -1, -1, -1
+  pcmpeqb xmm0, xmm7    ; each 0 byte becomes 0xFF, else 0.
+%endmacro
 
+%macro LOOP_SSE2_2 1
+  LOOP_SSE2 %1          ; xmm0 = 0xFF in each byte lane that was zero, else 0x00
+  pmovmskb ecx, xmm0    ; ecx = 16-bit lane mask: 0xFFFF iff input all zeros, else 0 <= ecx < 0xFFFF
+  cmp ecx, 0xFFFF       ; unsigned compare: CF = 1 iff ecx < 0xFFFF, i.e. input not all zeros
+  rcl eax, 1            ; eax = (eax << 1) | CF.  faster than "adc eax, eax"
+%endmacro        
+        
 ;=============================================================================
 ; Code
 ;=============================================================================
@@ -87,45 +77,17 @@
 cglobal calc_cbp_sse2
 calc_cbp_sse2:
   mov edx, [esp+4]         ; coeff[]
-  xor eax, eax             ; cbp = 0
-
-  movdqu xmm7, [ignore_dc] ; mask to ignore dc value
-  pxor xmm6, xmm6          ; zero
-
+  pxor xmm7, xmm7          ; xmm7 = 0, the comparand for pcmpeqb in LOOP_SSE2
+                
   LOOP_SSE2 0
-  test ecx, ecx
-  jz .blk2
-  or eax, (1<<5)
-
-.blk2
-  LOOP_SSE2 1
-  test ecx, ecx
-  jz .blk3
-  or eax, (1<<4)
-
-.blk3
-  LOOP_SSE2 2
-  test ecx, ecx
-  jz .blk4
-  or eax, (1<<3)
-
-.blk4
-  LOOP_SSE2 3
-  test ecx, ecx
-  jz .blk5
-  or eax, (1<<2)
-
-.blk5
-  LOOP_SSE2 4
-  test ecx, ecx
-  jz .blk6
-  or eax, (1<<1)
-
-.blk6
-  LOOP_SSE2 5
-  test ecx, ecx
-  jz .finished
-  or eax, (1<<0)
+  pmovmskb eax, xmm0    ; eax = byte-lane mask: 0xFFFF iff input all zeros, else 0 <= eax < 0xFFFF
+  sub eax, 0xFFFF       ; eax in [-0xFFFF, 0]: negative iff input not all zeros
+  shr eax, 31           ; keep only the sign bit: zero if input all zeros, else one
+
+  LOOP_SSE2_2 1         ; each invocation: eax = (eax << 1) | (block not all zeros)
+  LOOP_SSE2_2 2
+  LOOP_SSE2_2 3
+  LOOP_SSE2_2 4
+  LOOP_SSE2_2 5         ; after this, block 0's flag is in bit 5 and block 5's in bit 0
 
-.finished
-	ret
+  ret                   ; eax = cbp