[XviD-devel] [BUG] dev16 sse2
skal
skal at planet-d.net
Mon Nov 3 17:43:50 CET 2003
Hi all,
On Mon, 2003-11-03 at 16:50, Edouard Gomez wrote:
> I could finally test SSE2 code. SSE2 code seems to be faster than mmx,
> xmm (good point).
> But i have seen that:
> SSE2 - sad8 0.044 usec sad=3776
> SSE2 - sad16 0.044 usec sad=27214
> SSE2 - sad16bi 0.348 usec sad=26274
> SSE2 - dev16 0.086 usec sad=4208
> *** CRC ERROR! ***
>
> dev16 should have returned: MMXEXT - dev16 0.094 usec sad=334
oops... my fault. Sorry. I should have tested this:
a 'pshufd' was missing.
Here's the correct version.
bye!
Skal
-------------- next part --------------
;
; extern uint32_t dev16_sse2(const uint8_t * const cur, const uint32_t stride);
;
; MEAN_16x16_SSE2 -- accumulate two 16-byte rows into the running total.
;   In:    eax  = address of the current row
;          ecx  = stride
;          xmm7 = byte pattern the rows are compared against by psadbw
;                 (the caller passes either all-zero or the replicated mean)
;   Out:   xmm6 += psadbw results of both rows (one partial sum per qword)
;          eax  advanced by 2*stride
;   Clobb: xmm0, xmm1
%macro MEAN_16x16_SSE2 0
        movdqu  xmm0, [eax]             ; first row (unaligned 16-byte load)
        movdqu  xmm1, [eax + ecx]       ; second row
        psadbw  xmm0, xmm7              ; sum of |byte - xmm7 byte| per 8 bytes
        psadbw  xmm1, xmm7
        paddusw xmm6, xmm0              ; accumulate first row, then second
        paddusw xmm6, xmm1
        lea     eax, [eax + 2*ecx]      ; move on to the next pair of rows
%endmacro
align 16
;-----------------------------------------------------------------------
; uint32_t dev16_sse2(const uint8_t * const cur, const uint32_t stride)
;
; In:    [esp+4] = cur, [esp+8] = stride (arguments are read from the stack)
; Out:   eax = sum over 16 rows of 16 bytes of |byte - mean|,
;              where mean = (sum of all 256 bytes) >> 8
; Clobb: eax, ecx, xmm0, xmm1, xmm6, xmm7
;
; Two passes over the same block: pass 1 sums the bytes (SAD against zero),
; pass 2 sums the absolute differences against the replicated mean.
;-----------------------------------------------------------------------
dev16_sse2:
        mov     eax, [esp + 4]          ; eax = cur
        mov     ecx, [esp + 8]          ; ecx = stride
        pxor    xmm7, xmm7              ; compare against zero => plain byte sum
        pxor    xmm6, xmm6              ; clear accumulator

        ; Pass 1: 8 macro expansions x 2 rows = 16 rows.
%rep 8
        MEAN_16x16_SSE2
%endrep

        ; Fold the two per-qword partial sums: imm 10b puts source dword 2
        ; (the high-qword sum) into dword 0, then add the low-qword sum.
        pshufd  xmm7, xmm6, 00000010b
        paddusw xmm7, xmm6              ; word 0 = total of all 256 bytes
        psrlw   xmm7, 8                 ; word 0 = total / 256 = mean
        pshuflw xmm7, xmm7, 0           ; copy mean into the low four words
        packuswb xmm7, xmm7             ; words -> bytes: low dword = 4 x mean
        pshufd  xmm7, xmm7, 0           ; broadcast low dword: 16 x mean

        pxor    xmm6, xmm6              ; restart accumulator for pass 2
        mov     eax, [esp + 4]          ; rewind to the first row

        ; Pass 2: same 16 rows, now SAD against the mean.
%rep 8
        MEAN_16x16_SSE2
%endrep

        pshufd  xmm7, xmm6, 00000010b   ; same high/low fold as above
        paddusw xmm7, xmm6
        pextrw  eax, xmm7, 0            ; return word 0, zero-extended into eax
        ret
More information about the XviD-devel
mailing list