[XviD-devel] [PATCH] calc_cbp_sse2 optimization

Mat Hostetter mat at curl.com
Sun Apr 18 16:48:54 CEST 2004


This change (against 1.0.0-rc4) speeds up calc_cbp_sse2 from 131
cycles to 112 cycles on the Pentium 4 (for the in-cache case).

This change uses pcmpgtb/pmovmskb to extract a zero/nonzero mask,
rather than the longer sequence used previously, and eliminates all
conditional branches.  I also changed a movdqu to movdqa; if there's a
reason the load might be unaligned (some bogus platform that can't
align static arrays mod 16?) please let me know.

I am new to XviD so I don't know what your standard practice is for
correctness validation and benchmarking.  So I wrote my own test and
benchmark for this proc.  My test tries the 258048 coeff[] arrays I
consider "interesting" with both calc_cbp_sse2 and calc_cbp_plain.
I always get the same result so I am pretty confident this patch is
correct.

I chose calc_cbp_sse2 at random just to get a feel for XviD's sources.
If someone can point me at some more important code you'd like
optimized, and tell me how you benchmark it, I may be able to
contribute (I'm a professional compiler programmer).  I'm sure
you've done lots of optimizations already but another pair of eyes
never hurts.  :-)

-Mat


--- src/bitstream/x86_asm/cbp_sse2.asm~	2004-04-04 16:35:53.000000000 -0400
+++ src/bitstream/x86_asm/cbp_sse2.asm	2004-04-18 10:37:02.000000000 -0400
@@ -52,13 +52,18 @@
   por xmm1, [edx+(%1)*128+112]
 
   por xmm0, xmm1        ; xmm0 = xmm1 = 128 bits worth of info
-  psadbw xmm0, xmm6     ; contains 2 dwords with sums
-  movhlps xmm1, xmm0    ; move high dword from xmm0 to low xmm1
-  por xmm0, xmm1        ; combine
-  movd ecx, xmm0        ; if ecx set, values were found
-  test ecx, ecx
+
+  pcmpeqb xmm0, xmm6    ; 0xFF per zero byte (not pcmpgtb: it is signed and misses bytes >= 0x80)
 %endmacro
 
+; OR together block %1 and shift its "has nonzero coefficient" flag into bit 0 of eax
+%macro LOOP_SSE2_2 1
+  LOOP_SSE2 %1
+  pmovmskb ecx, xmm0    ; ecx = 0xFFFF iff every byte of the block is zero
+  cmp ecx, 0xFFFF       ; CF = 1 iff at least one byte is nonzero
+  adc eax, eax          ; cbp = (cbp << 1) | CF
+%endmacro
+
 ;=============================================================================
 ; Data (Read Only)
 ;=============================================================================
@@ -87,45 +92,18 @@
 cglobal calc_cbp_sse2
 calc_cbp_sse2:
   mov edx, [esp+4]         ; coeff[]
-  xor eax, eax             ; cbp = 0
-
-  movdqu xmm7, [ignore_dc] ; mask to ignore dc value
+  movdqa xmm7, [ignore_dc] ; mask to ignore dc value (aligned load: ignore_dc must be 16-byte aligned)
   pxor xmm6, xmm6          ; zero
 
   LOOP_SSE2 0
-  test ecx, ecx
-  jz .blk2
-  or eax, (1<<5)
-
-.blk2
-  LOOP_SSE2 1
-  test ecx, ecx
-  jz .blk3
-  or eax, (1<<4)
-
-.blk3
-  LOOP_SSE2 2
-  test ecx, ecx
-  jz .blk4
-  or eax, (1<<3)
-
-.blk4
-  LOOP_SSE2 3
-  test ecx, ecx
-  jz .blk5
-  or eax, (1<<2)
-
-.blk5
-  LOOP_SSE2 4
-  test ecx, ecx
-  jz .blk6
-  or eax, (1<<1)
-
-.blk6
-  LOOP_SSE2 5
-  test ecx, ecx
-  jz .finished
-  or eax, (1<<0)
+  pmovmskb eax, xmm0       ; eax = 0xFFFF iff block 0 is all zero
+  sub eax, 0xFFFF          ; 0 if all zero, negative otherwise
+  shr eax, 31              ; cbp = sign bit = 1 iff block 0 has a nonzero coefficient
+
+  LOOP_SSE2_2 1            ; each call shifts cbp left, so block 0 ends up in bit 5
+  LOOP_SSE2_2 2
+  LOOP_SSE2_2 3
+  LOOP_SSE2_2 4
+  LOOP_SSE2_2 5            ; block 5 lands in bit 0
 
-.finished
-	ret
+  ret


More information about the XviD-devel mailing list