[XviD-devel] need sad8bi for direct mode

peter ross xvid-devel@xvid.org
Tue, 23 Jul 2002 15:54:56 +1000


>From: "Michael Militzer" <michael@xvid.org>
>Reply-To: xvid-devel@xvid.org
>To: <xvid-devel@xvid.org>
>Subject: Re: [XviD-devel] need sad8bi for direct mode
>Date: Mon, 22 Jul 2002 13:46:58 +0200
>
>Hi,
>
>just a sidenote: I see that you use pavgb instructions within the mmx code.
>AFAIK pavgb is an xmm instruction, so I suppose the mmx version won't run on
>"mmx-only" machines...
>

oops. it should now be fixed.
okay, so we now have:

sad[16,8]bi_c
sad[16,8]bi_mmx
sad[16,8]bi_xmm

todo:
sad[16,8]bi_3dn    ; k6-2/3 have pavgusb, but not psadbw
sad16bi_sse2
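
for reference, here's a rough plain-C sketch of what the 16x16 bi
routine computes: the rounded average (ref1 + ref2 + 1) >> 1 of the two
references, then a sum of absolute differences against cur. the function
name is made up and this is only an illustration of the semantics, not
the actual sad16bi_c source:

#include <stdint.h>
#include <stdlib.h>

/* illustrative only: rounded average of the two reference blocks,
   then sum of absolute differences against the source block */
uint32_t sad16bi_sketch(const uint8_t *cur,
                        const uint8_t *ref1,
                        const uint8_t *ref2,
                        const uint32_t stride)
{
    uint32_t sad = 0;
    int x, y;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++) {
            int avg = (ref1[x] + ref2[x] + 1) >> 1;
            sad += abs(avg - cur[x]);
        }
        cur  += stride;
        ref1 += stride;
        ref2 += stride;
    }
    return sad;
}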

-- pete

;===========================================================================
;
; uint32_t sad16bi_mmx(const uint8_t * const cur,
;                      const uint8_t * const ref1,
;                      const uint8_t * const ref2,
;                      const uint32_t stride);
;
;===========================================================================
%macro SADBI_16x16_MMX 2    ; SADBI_16x16_MMX( int_ptr_offset, bool_increment_ptr )
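    ; averages 8 pixels from ref1/ref2 with rounding and adds |avg - cur| to mm6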

    movq mm0, [edx+%1]
    movq mm2, [ebx+%1]
    movq mm1, mm0
    movq mm3, mm2

%if %2 != 0
    add edx, ecx
%endif

    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    punpcklbw mm2, mm7
    punpckhbw mm3, mm7

%if %2 != 0
    add ebx, ecx
%endif

    paddusw mm0, mm2            ; mm01 = ref1 + ref2
    paddusw mm1, mm3
    paddusw mm0, [mmx_one]      ; mm01 += 1
    paddusw mm1, [mmx_one]
    psrlw mm0, 1                ; mm01 >>= 1
    psrlw mm1, 1

    movq mm2, [eax+%1]
    movq mm3, mm2
    punpcklbw mm2, mm7          ; mm23 = src
    punpckhbw mm3, mm7

%if %2 != 0
    add eax, ecx
%endif

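    ; unsigned absolute difference: |a - b| = (a -sat b) | (b -sat a)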
    movq mm4, mm0
    movq mm5, mm1
    psubusw mm0, mm2
    psubusw mm1, mm3
    psubusw mm2, mm4
    psubusw mm3, mm5
    por mm0, mm2                ; mm01 = ABS(mm01 - mm23)
    por mm1, mm3

    paddusw mm6,mm0             ; mm6 += mm01
    paddusw mm6,mm1

%endmacro

align 16
sad16bi_mmx:
    push ebx
    mov eax, [esp+4+ 4] ; Src
    mov edx, [esp+4+ 8] ; Ref1
    mov ebx, [esp+4+12] ; Ref2
    mov ecx, [esp+4+16] ; Stride

    pxor mm6, mm6   ; mm6 = sad accumulator
    pxor mm7, mm7   ; mm7 = zero, used for byte->word unpacking
.Loop:
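    ; 16 rows, two 8-pixel halves per row (offsets 0 and 8), fully unrolled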
    SADBI_16x16_MMX 0, 0
    SADBI_16x16_MMX 8, 1
    SADBI_16x16_MMX 0, 0
    SADBI_16x16_MMX 8, 1
    SADBI_16x16_MMX 0, 0
    SADBI_16x16_MMX 8, 1
    SADBI_16x16_MMX 0, 0
    SADBI_16x16_MMX 8, 1
    SADBI_16x16_MMX 0, 0
    SADBI_16x16_MMX 8, 1
    SADBI_16x16_MMX 0, 0
    SADBI_16x16_MMX 8, 1
    SADBI_16x16_MMX 0, 0
    SADBI_16x16_MMX 8, 1
    SADBI_16x16_MMX 0, 0
    SADBI_16x16_MMX 8, 1

    SADBI_16x16_MMX 0, 0
    SADBI_16x16_MMX 8, 1
    SADBI_16x16_MMX 0, 0
    SADBI_16x16_MMX 8, 1
    SADBI_16x16_MMX 0, 0
    SADBI_16x16_MMX 8, 1
    SADBI_16x16_MMX 0, 0
    SADBI_16x16_MMX 8, 1
    SADBI_16x16_MMX 0, 0
    SADBI_16x16_MMX 8, 1
    SADBI_16x16_MMX 0, 0
    SADBI_16x16_MMX 8, 1
    SADBI_16x16_MMX 0, 0
    SADBI_16x16_MMX 8, 1
    SADBI_16x16_MMX 0, 0
    SADBI_16x16_MMX 8, 1

    pmaddwd mm6, [mmx_one]      ; collapse the four word sums into two dwords
    movq mm7, mm6
    psrlq mm7, 32
    paddd mm6, mm7              ; final sum in the low dword of mm6

    movd eax, mm6
    pop ebx
    ret

;===========================================================================
;
; uint32_t sad8bi_mmx(const uint8_t * const cur,
;                     const uint8_t * const ref1,
;                     const uint8_t * const ref2,
;                     const uint32_t stride);
;
;===========================================================================
align 16
sad8bi_mmx:
    push ebx
    mov eax, [esp+4+ 4] ; Src
    mov edx, [esp+4+ 8] ; Ref1
    mov ebx, [esp+4+12] ; Ref2
    mov ecx, [esp+4+16] ; Stride

    pxor mm6, mm6   ; mm6 = sad accumulator
    pxor mm7, mm7   ; mm7 = zero, used for byte->word unpacking
.Loop:
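    ; 8 rows of 8 pixels, reusing the 16x16 macro (offset 0, pointers advanced every call)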
    SADBI_16x16_MMX 0, 1
    SADBI_16x16_MMX 0, 1
    SADBI_16x16_MMX 0, 1
    SADBI_16x16_MMX 0, 1
    SADBI_16x16_MMX 0, 1
    SADBI_16x16_MMX 0, 1
    SADBI_16x16_MMX 0, 1
    SADBI_16x16_MMX 0, 1

    pmaddwd mm6, [mmx_one]      ; collapse the four word sums into two dwords
    movq mm7, mm6
    psrlq mm7, 32
    paddd mm6, mm7              ; final sum in the low dword of mm6

    movd eax, mm6
    pop ebx
    ret


