[XviD-devel] need sad8bi for direct mode

peter ross xvid-devel@xvid.org
Mon, 22 Jul 2002 14:51:34 +1000


>From: Christoph Lampert <chl@math.uni-bonn.de>
>Reply-To: xvid-devel@xvid.org
>To: xvid-devel@xvid.org
>Subject: [XviD-devel] need  sad8bi  for direct mode
>Date: Sun, 21 Jul 2002 11:56:23 +0200 (CEST)
>
>Hi,
>
>I notice that direct mode for b-frames needs an interpolated SAD for
>8x8 blocks if it is to work together with inter4v mode.
>I'll add a C version, but MMX/XMM would of course be better.
>
>Also, it definitely needs halfpel interpolation, so currently the
>only way to really test direct-mode b-frames is "with halfpel, but
>without inter4v".
>

Done! They work in my rand() test environment, but please run some tests of your own.
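
For reference, what these routines are supposed to compute is the SAD of the
current block against the rounded average of the two references (pavgb rounds
up, i.e. (a + b + 1) >> 1). A plain C sketch of that, roughly what I check the
asm output against (sadNbi_c is just my name for it, not anything in CVS):

#include <stdint.h>

/* interpolated SAD over a size x size block (size = 8 or 16);
   the average uses pavgb-style rounding, i.e. (a + b + 1) >> 1 */
uint32_t
sadNbi_c(const uint8_t *cur, const uint8_t *ref1, const uint8_t *ref2,
         uint32_t stride, int size)
{
    uint32_t sad = 0;
    int x, y;

    for (y = 0; y < size; y++) {
        for (x = 0; x < size; x++) {
            int avg = (ref1[x] + ref2[x] + 1) >> 1;
            int d = cur[x] - avg;
            sad += (d < 0) ? -d : d;
        }
        cur  += stride;
        ref1 += stride;
        ref2 += stride;
    }
    return sad;
}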

cglobal  sad16bi_mmx
;===========================================================================
;
; uint32_t sad16bi_mmx(const uint8_t * const cur,
;                      const uint8_t * const ref1,
;                      const uint8_t * const ref2,
;                      const uint32_t stride);
;
;===========================================================================

%macro SADBI_16x16_MMX 0
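    ; one 16-pixel row: pavgb averages ref1/ref2 (rounding up), the
    ; psubusb/por pair forms |cur - avg|, and the words are accumulated in mm6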
    movq mm0, [eax]		; src
    movq mm2, [eax+8]

    movq mm1, [edx]		; ref1
    movq mm3, [edx+8]
    pavgb mm1, [ebx]    ; ref2
    lea edx, [edx+ecx]
    pavgb mm3, [ebx+8]
    lea ebx, [ebx+ecx]

    movq mm4, mm0
    lea eax,[eax+ecx]
    psubusb mm0, mm1
    movq mm5, mm2
    psubusb mm2, mm3

    psubusb mm1, mm4
    por mm0, mm1
    psubusb mm3, mm5
    por mm2, mm3

    movq mm1,mm0
    movq mm3,mm2

    punpcklbw mm0,mm7
    punpckhbw mm1,mm7
    punpcklbw mm2,mm7
    punpckhbw mm3,mm7

    paddusw mm0,mm1
    paddusw mm2,mm3
    paddusw mm6,mm0
    paddusw mm6,mm2
%endmacro

align 16
sad16bi_mmx:
    push ebx
    mov eax, [esp+4+ 4] ; Src
    mov edx, [esp+4+ 8] ; Ref1
    mov ebx, [esp+4+12] ; Ref2
    mov ecx, [esp+4+16] ; Stride

    pxor mm6, mm6 ; sad accumulator
    pxor mm7, mm7 ; zero register for byte->word unpacking
.Loop:
    SADBI_16x16_MMX
    SADBI_16x16_MMX
    SADBI_16x16_MMX
    SADBI_16x16_MMX
    SADBI_16x16_MMX
    SADBI_16x16_MMX
    SADBI_16x16_MMX
    SADBI_16x16_MMX

    SADBI_16x16_MMX
    SADBI_16x16_MMX
    SADBI_16x16_MMX
    SADBI_16x16_MMX
    SADBI_16x16_MMX
    SADBI_16x16_MMX
    SADBI_16x16_MMX
    SADBI_16x16_MMX

    pmaddwd mm6, [mmx_one] ; collapse the four word sums into two dwords
    movq mm7, mm6
    psrlq mm7, 32
    paddd mm6, mm7         ; fold the high dword into the low dword

    movd eax, mm6

    pop ebx

    ret



cglobal  sad8bi_mmx
;===========================================================================
;
; uint32_t sad8bi_mmx(const uint8_t * const cur,
;                     const uint8_t * const ref1,
;                     const uint8_t * const ref2,
;                     const uint32_t stride);
;
;===========================================================================

%macro SADBI_8x8_MMX 0
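    ; two 8-pixel rows per invocation: pavgb averages ref1/ref2 (rounding up),
    ; psubusb/por forms |cur - avg|, and the words are accumulated in mm6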
    movq mm0, [eax]		; src
    movq mm2, [eax+ecx]

    movq mm1, [edx]		; ref1
    movq mm3, [edx+ecx]
    pavgb mm1, [ebx]    ; ref2
    lea edx, [edx+2*ecx]
    pavgb mm3, [ebx+ecx]
    lea ebx, [ebx+2*ecx]

    movq mm4, mm0
    lea eax,[eax+2*ecx]
    psubusb mm0, mm1
    movq mm5, mm2
    psubusb mm2, mm3

    psubusb mm1, mm4
    por mm0, mm1
    psubusb mm3, mm5
    por mm2, mm3

    movq mm1,mm0
    movq mm3,mm2

    punpcklbw mm0,mm7
    punpckhbw mm1,mm7
    punpcklbw mm2,mm7
    punpckhbw mm3,mm7

    paddusw mm0,mm1
    paddusw mm2,mm3
    paddusw mm6,mm0
    paddusw mm6,mm2
%endmacro

align 16
sad8bi_mmx:
    push ebx
    mov eax, [esp+4+ 4] ; Src
    mov edx, [esp+4+ 8] ; Ref1
    mov ebx, [esp+4+12] ; Ref2
    mov ecx, [esp+4+16] ; Stride

    pxor mm6, mm6 ; sad accumulator
    pxor mm7, mm7 ; zero register for byte->word unpacking
.Loop:
    SADBI_8x8_MMX
    SADBI_8x8_MMX
    SADBI_8x8_MMX
    SADBI_8x8_MMX

    pmaddwd mm6, [mmx_one] ; collapse the four word sums into two dwords
    movq mm7, mm6
    psrlq mm7, 32
    paddd mm6, mm7         ; fold the high dword into the low dword

    movd eax, mm6

    pop ebx

    ret



cglobal  sad8bi_xmm
;===========================================================================
;
; uint32_t sad8bi_xmm(const uint8_t * const cur,
;                     const uint8_t * const ref1,
;                     const uint8_t * const ref2,
;                     const uint32_t stride);
;
;===========================================================================

%macro SADBI_8x8_XMM 0
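    ; two 8-pixel rows per invocation: pavgb averages ref1/ref2,
    ; psadbw does the SAD against cur, results accumulated in mm5/mm6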
    movq mm0, [eax]
    movq mm1, [eax+ecx]

    movq mm2, [edx]
    movq mm3, [edx+ecx]

    pavgb mm2, [ebx]
    lea edx, [edx+2*ecx]

    pavgb mm3, [ebx+ecx]
    lea ebx, [ebx+2*ecx]

    psadbw mm0, mm2
    lea eax, [eax+2*ecx]

    psadbw mm1, mm3
    paddusw mm5,mm0

    paddusw mm6,mm1
%endmacro

align 16
sad8bi_xmm:
    push ebx
    mov eax, [esp+4+ 4] ; Src
    mov edx, [esp+4+ 8] ; Ref1
    mov ebx, [esp+4+12] ; Ref2
    mov ecx, [esp+4+16] ; Stride

    pxor mm5, mm5 ; accum1
    pxor mm6, mm6 ; accum2
.Loop:
    SADBI_8x8_XMM
    SADBI_8x8_XMM
    SADBI_8x8_XMM
    SADBI_8x8_XMM

    paddusw mm6,mm5
    movd eax, mm6
    pop ebx
    ret
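
For completeness, the rand() test environment mentioned above boils down to a
cross-check of the asm routines against the C reference on random buffers; a
rough sketch only (main(), the buffer sizes and the iteration count are mine,
the prototypes are the ones from the comment blocks above):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* prototypes as given in the comment blocks above */
uint32_t sad16bi_mmx(const uint8_t * const cur, const uint8_t * const ref1,
                     const uint8_t * const ref2, const uint32_t stride);
uint32_t sad8bi_mmx (const uint8_t * const cur, const uint8_t * const ref1,
                     const uint8_t * const ref2, const uint32_t stride);
uint32_t sad8bi_xmm (const uint8_t * const cur, const uint8_t * const ref1,
                     const uint8_t * const ref2, const uint32_t stride);

/* C reference sketch from earlier in this mail */
uint32_t sadNbi_c(const uint8_t *cur, const uint8_t *ref1, const uint8_t *ref2,
                  uint32_t stride, int size);

int main(void)
{
    enum { STRIDE = 32, ROWS = 16, ITER = 1000 };
    static uint8_t cur[STRIDE * ROWS], ref1[STRIDE * ROWS], ref2[STRIDE * ROWS];
    int i, n;

    for (n = 0; n < ITER; n++) {
        /* fill all three buffers with random pixels */
        for (i = 0; i < STRIDE * ROWS; i++) {
            cur[i]  = rand() & 0xff;
            ref1[i] = rand() & 0xff;
            ref2[i] = rand() & 0xff;
        }
        /* every asm routine must match the C reference exactly */
        if (sad16bi_mmx(cur, ref1, ref2, STRIDE) != sadNbi_c(cur, ref1, ref2, STRIDE, 16) ||
            sad8bi_mmx (cur, ref1, ref2, STRIDE) != sadNbi_c(cur, ref1, ref2, STRIDE, 8)  ||
            sad8bi_xmm (cur, ref1, ref2, STRIDE) != sadNbi_c(cur, ref1, ref2, STRIDE, 8)) {
            printf("mismatch at iteration %d\n", n);
            return 1;
        }
    }
    printf("all %d iterations ok\n", ITER);
    return 0;
}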


