[XviD-devel] [BUG] dev16 sse2

skal skal at planet-d.net
Mon Nov 3 17:43:50 CET 2003


	Hi all,

On Mon, 2003-11-03 at 16:50, Edouard Gomez wrote:

> I could finally  test SSE2 code. SSE2 code seems to  be faster than mmx,
> xmm (good point).

> But i have seen that:
> SSE2 - sad8 0.044 usec sad=3776
> SSE2 - sad16 0.044 usec sad=27214
> SSE2 - sad16bi 0.348 usec sad=26274
> SSE2 - dev16 0.086 usec sad=4208
> *** CRC ERROR! ***
> 
> dev16 should have returned: MMXEXT - dev16 0.094 usec sad=334


	Oops... my fault, sorry. I should have tested this:
	a 'pshufd' was missing.
	Here's the corrected version.

	bye!
Skal


-------------- next part --------------
;
; extern uint32_t dev16_sse2(const uint8_t * const cur, const uint32_t stride);
;

; MEAN_16x16_SSE2: accumulate the SAD of two consecutive 16-byte rows
; against a reference vector.
;   in:   eax  = pointer to the current row (read with unaligned loads)
;         ecx  = stride in bytes
;         xmm7 = reference bytes (all zero, or the replicated mean)
;   out:  xmm6 += the two psadbw partial sums of each row
;         eax  advanced by 2*stride
;   clobbers: xmm0, xmm1
%macro MEAN_16x16_SSE2 0
  movdqu  xmm0, [eax]         ; first row
  psadbw  xmm0, xmm7          ; |row - ref|, one partial sum per 8-byte half
  movdqu  xmm1, [eax+ecx]     ; second row
  psadbw  xmm1, xmm7
  paddusw xmm6, xmm0
  paddusw xmm6, xmm1
  lea     eax, [eax+2*ecx]    ; step to the next pair of rows
%endmacro


; dev16_sse2: sum of absolute deviations from the mean over a 16x16 byte block.
;   in:   [esp+4] = source pointer, [esp+8] = stride in bytes
;   out:  eax = sum over the 256 bytes of |byte - mean|, with mean = sum >> 8
;   clobbers: ecx, xmm0, xmm1, xmm6, xmm7
; Two passes over the same 16 rows: the first measures SAD against zero
; (i.e. the plain byte sum), the second measures SAD against the mean.
align 16
dev16_sse2:
  mov ecx, [esp+ 8]           ; stride
  mov eax, [esp+ 4]           ; row pointer for pass 1

  pxor xmm7, xmm7             ; reference = 0, so psadbw yields byte sums
  pxor xmm6, xmm6             ; running total

  ; Pass 1: 8 iterations x 2 rows = 16 rows.
%rep 8
  MEAN_16x16_SSE2
%endrep

  ; psadbw leaves one partial sum in the low word of each 64-bit half.
  ; Bring dword 2 down to dword 0 and add, so word 0 holds the full sum
  ; (at most 256*255 = 65280, which fits in 16 bits).
  pshufd   xmm7, xmm6, 10b
  paddusw  xmm7, xmm6
  pxor     xmm6, xmm6             ; restart the running total for pass 2

  mov eax, [esp+ 4]               ; rewind row pointer to the top of the block

  psrlw    xmm7, 8                ; word 0 = sum / 256 = mean
  pshuflw  xmm7, xmm7, 0          ; copy mean into the four low words
  packuswb xmm7, xmm7             ; narrow words to bytes: low dword = 4 x mean
  pshufd   xmm7, xmm7, 00000000b  ; broadcast low dword: 16 x mean

  ; Pass 2: same 16 rows, now measured against the mean.
%rep 8
  MEAN_16x16_SSE2
%endrep

  ; Fold the two partial sums as before and return word 0.
  pshufd   xmm7, xmm6, 10b
  paddusw  xmm7, xmm6
  pextrw eax, xmm7, 0
  ret



More information about the XviD-devel mailing list