[XviD-devel] SAD vs MSE for ME (warning! acronym overflow!)

skal xvid-devel@xvid.org
04 Sep 2002 17:35:08 +0200


--=-QnEAzBT0gUWiYswfva1q
Content-Type: text/plain
Content-Transfer-Encoding: 7bit



> Gruel wrote:
>No, actually MSE/SSE would be the better choice (at least that's what
>theory says). MAD/SAD is only chosen because it's faster to compute and
>experiment (not mine) showed that it's not much of a difference.

	Ok, fine. I think there's something to do to reduce the HPel/QPel
refinement steps for ME, using SSE, then...
	More on this later (I need to test the idea first).
	Do you have any 'theoretical' pointers?

	BTW, I attach some ASM funcs you might find useful...
	They compute the mean and the mean of squared pixels of blocks.

	later,

		Skal



--=-QnEAzBT0gUWiYswfva1q
Content-Disposition: attachment; filename=skl_img_x86.asm
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; name=skl_img_x86.asm; charset=ISO-8859-1

;//////////////////////////////////////////////////////////////////////
; [BITS 32]

%include "../../include/skl_syst/skl_nasm.h"

; all signatures are: uint32_t Func(uint8_t *Source, int32_t stride)

cglobal Skl_Mean_16x16_MMX
cglobal Skl_Mean_8x8_MMX
cglobal Skl_Mean_16x16_SSE
cglobal Skl_Mean_8x8_SSE
cglobal Skl_Sqr_16x16_MMX
cglobal Skl_Sqr_8x8_MMX

DATA

align 16
; Four words of 1: pmaddwd against this sums adjacent word pairs,
; collapsing 4 packed word sums into 2 dword sums.
MMX_UNIT: dw 1, 1, 1, 1

TEXT

;//////////////////////////////////////////////////////////////////////
; Skl_Mean_16x16_MMX
;//////////////////////////////////////////////////////////////////////

;-----------------------------------------------------------------------
; MEAN_16x16_MMX — accumulate one 16-pixel row into the running sums.
; In:      eax = row pointer, ecx = stride (BpS), mm7 = 0
; Accum:   mm5 += low nibbles' words, mm6 += high nibbles' words
;          (4 packed 16-bit partial sums each)
; Clobbers mm0-mm3; advances eax by one row.
;-----------------------------------------------------------------------
%macro MEAN_16x16_MMX 0
    movq mm0, [eax]       ; left 8 pixels of the row
    movq mm2, [eax+8]     ; right 8 pixels
    lea eax,[eax+ecx]     ; eax -> next row (lea keeps flags intact)
    movq mm1, mm0
    movq mm3, mm2
    punpcklbw mm0,mm7     ; zero-extend bytes 0..3 to words
    punpcklbw mm2,mm7
    punpckhbw mm1,mm7     ; zero-extend bytes 4..7 to words
    punpckhbw mm3,mm7
    paddw mm5, mm0
    paddw mm6, mm1
    paddw mm5, mm2
    paddw mm6, mm3
%endmacro

;-----------------------------------------------------------------------
; uint32_t Skl_Mean_16x16_MMX(uint8_t *Src, int32_t BpS)
; Returns the (truncated) mean of a 16x16 pixel block: sum(256 px) >> 8.
; cdecl, x86-32: args on the stack, result in eax.
; Clobbers: eax, ecx, mm0-mm7, flags.
;-----------------------------------------------------------------------
align 16
Skl_Mean_16x16_MMX:
    emms
    mov eax, [esp+ 4] ; Src
    mov ecx, [esp+ 8] ; BpS

    pxor mm5, mm5 ; accum for low-half words
    pxor mm6, mm6 ; accum for high-half words
    pxor mm7, mm7 ; constant zero for byte->word unpacking

.Loop:
    ; Fully unrolled: one invocation per row, 16 rows.
    ; Each 16-bit lane sums at most 32 pixels (<= 32*255 = 8160),
    ; so the word accumulators cannot overflow.
    MEAN_16x16_MMX
    MEAN_16x16_MMX
    MEAN_16x16_MMX
    MEAN_16x16_MMX
    MEAN_16x16_MMX
    MEAN_16x16_MMX
    MEAN_16x16_MMX
    MEAN_16x16_MMX

    MEAN_16x16_MMX
    MEAN_16x16_MMX
    MEAN_16x16_MMX
    MEAN_16x16_MMX
    MEAN_16x16_MMX
    MEAN_16x16_MMX
    MEAN_16x16_MMX
    MEAN_16x16_MMX

    paddusw mm6, mm5        ; 4 word sums, <= 16320 each -> no saturation
    pmaddwd mm6, [MMX_UNIT] ; collapse word pairs -> 2 dword partial sums
    movq mm7, mm6
    psrlq mm7, 32           ; bring high dword down
    paddd mm6, mm7          ; low dword = total sum of 256 pixels

    movd eax, mm6
    shr eax, 8              ; /256 -> mean

    emms
    ret


;//////////////////////////////////////////////////////////////////////
; Skl_Mean_8x8_MMX
;//////////////////////////////////////////////////////////////////////

;-----------------------------------------------------------------------
; MEAN_8x8_MMX — accumulate TWO 8-pixel rows into the running sums.
; In:      eax = row pointer, ecx = stride (BpS), mm7 = 0
; Accum:   mm5 += low-half words, mm6 += high-half words
; Clobbers mm0-mm3; advances eax by two rows.
;-----------------------------------------------------------------------
%macro MEAN_8x8_MMX  0
    movq mm0, [eax]       ; row 0 (8 pixels)
    movq mm2, [eax+ecx]   ; row 1
    lea eax,[eax+2*ecx]   ; eax -> two rows down
    movq mm1, mm0
    movq mm3, mm2
    punpcklbw mm0,mm7     ; zero-extend bytes 0..3 to words
    punpcklbw mm2,mm7
    punpckhbw mm1,mm7     ; zero-extend bytes 4..7 to words
    punpckhbw mm3,mm7
    paddw mm5, mm0
    paddw mm6, mm1
    paddw mm5, mm2
    paddw mm6, mm3
%endmacro

;-----------------------------------------------------------------------
; uint32_t Skl_Mean_8x8_MMX(uint8_t *Src, int32_t BpS)
; Returns the (truncated) mean of an 8x8 pixel block: sum(64 px) >> 6.
; cdecl, x86-32: args on the stack, result in eax.
; Clobbers: eax, ecx, mm0-mm7, flags.
;-----------------------------------------------------------------------
align 16
Skl_Mean_8x8_MMX:
    emms
    mov eax, [esp+ 4] ; Src
    mov ecx, [esp+ 8] ; BpS

    pxor mm5, mm5 ; accum for low-half words
    pxor mm6, mm6 ; accum for high-half words
    pxor mm7, mm7 ; constant zero for byte->word unpacking

.Loop:
    ; Fully unrolled: each invocation handles two rows (8 rows total).
    ; Each 16-bit lane sums at most 16 pixels -> no overflow possible.
    MEAN_8x8_MMX
    MEAN_8x8_MMX
    MEAN_8x8_MMX
    MEAN_8x8_MMX

    paddw mm6, mm5          ; 4 word sums, <= 4080 each
    pmaddwd mm6, [MMX_UNIT] ; collapse word pairs -> 2 dword partial sums
    movq mm7, mm6
    psrlq mm7, 32           ; bring high dword down
    paddd mm6, mm7          ; low dword = total sum of 64 pixels

    movd eax, mm6
    shr eax,6               ; /64 -> mean
    emms
    ret

;//////////////////////////////////////////////////////////////////////
; Skl_Mean_16x16_SSE
;//////////////////////////////////////////////////////////////////////

;-----------------------------------------------------------------------
; MEAN_16x16_SSE — accumulate one 16-pixel row using psadbw.
; In:      eax = row pointer, ecx = stride (BpS), mm7 = 0
; Accum:   mm5 += sum of left 8 bytes, mm6 += sum of right 8 bytes
;          (psadbw vs zero leaves the sum in the low word, rest zero)
; Clobbers mm0, mm1; advances eax by one row.
;-----------------------------------------------------------------------
%macro MEAN_16x16_SSE 0
    movq mm0, [eax]       ; left 8 pixels
    movq mm1, [eax+8]     ; right 8 pixels
    psadbw mm0, mm7       ; sum of abs diffs vs 0 == byte sum
    psadbw mm1, mm7
    add eax, ecx          ; eax -> next row
    paddw mm5, mm0
    paddw mm6, mm1
%endmacro

;-----------------------------------------------------------------------
; uint32_t Skl_Mean_16x16_SSE(uint8_t *Src, int32_t BpS)
; Returns the (truncated) mean of a 16x16 pixel block: sum(256 px) >> 8.
; Requires integer-SSE (psadbw: SSE or AMD MMX extensions).
; cdecl, x86-32: args on the stack, result in eax.
; Clobbers: eax, ecx, mm0, mm1, mm5-mm7, flags.
; Note: measured around 97 cycles by the original author.
;-----------------------------------------------------------------------
align 16
Skl_Mean_16x16_SSE:
    emms
    mov eax, [esp+ 4] ; Src
    mov ecx, [esp+ 8] ; BpS

    pxor mm5, mm5 ; accum, left halves
    pxor mm6, mm6 ; accum, right halves
    pxor mm7, mm7 ; constant zero for psadbw

    ; Fully unrolled: one invocation per row, 16 rows.
    MEAN_16x16_SSE
    MEAN_16x16_SSE
    MEAN_16x16_SSE
    MEAN_16x16_SSE
    MEAN_16x16_SSE
    MEAN_16x16_SSE
    MEAN_16x16_SSE
    MEAN_16x16_SSE

    MEAN_16x16_SSE
    MEAN_16x16_SSE
    MEAN_16x16_SSE
    MEAN_16x16_SSE
    MEAN_16x16_SSE
    MEAN_16x16_SSE
    MEAN_16x16_SSE
    MEAN_16x16_SSE

    ; psadbw keeps the whole sum in word 0, so only the low word of
    ; each accumulator is live (max 16*2040 = 32640 -> no overflow).
    paddusw mm6,mm5
    pmaddwd mm6, [MMX_UNIT] ; collapse -> dword 0 = total sum
    movq mm7, mm6
    psrlq mm7, 32           ; was psllq: only worked because dword 1 is 0;
    paddd mm6, mm7          ; shift RIGHT, like the sibling functions

    movd eax, mm6
    shr eax, 8              ; /256 -> mean

    emms
    ret

;//////////////////////////////////////////////////////////////////////
; Skl_Mean_8x8_SSE
;//////////////////////////////////////////////////////////////////////

;-----------------------------------------------------------------------
; MEAN_8x8_SSE — accumulate TWO 8-pixel rows using psadbw.
; In:      eax = row pointer, ecx = stride (BpS),
;          edx = 2*stride (set by caller), mm7 = 0
; Accum:   mm6 += byte sums of both rows (sum lives in the low word)
; Clobbers mm0, mm1; advances eax by two rows.
;-----------------------------------------------------------------------
%macro MEAN_8x8_SSE  0
    movq mm0, [eax]       ; row 0
    movq mm1, [eax+ecx]   ; row 1
    psadbw mm0, mm7       ; sum of abs diffs vs 0 == byte sum
    add eax, edx          ; eax -> two rows down
    psadbw mm1, mm7
    paddw mm6, mm0
    paddw mm6, mm1
%endmacro

;-----------------------------------------------------------------------
; uint32_t Skl_Mean_8x8_SSE(uint8_t *Src, int32_t BpS)
; Returns the (truncated) mean of an 8x8 pixel block: sum(64 px) >> 6.
; Requires integer-SSE (psadbw: SSE or AMD MMX extensions).
; cdecl, x86-32: args on the stack, result in eax.
; Clobbers: eax, ecx, edx, mm0, mm1, mm6, mm7, flags.
; Note: measured around 34 cycles by the original author.
;-----------------------------------------------------------------------
align 16
Skl_Mean_8x8_SSE:
    emms
;SKL_RDTSC_IN
    mov eax, [esp+ 4] ; Src
    mov ecx, [esp+ 8] ; BpS

    pxor mm6, mm6 ; accum
    pxor mm7, mm7 ; constant zero for psadbw
    lea edx, [ecx+ecx] ; edx = 2*BpS, consumed by MEAN_8x8_SSE
    ; (a second, redundant "pxor mm6, mm6" was removed here)

    MEAN_8x8_SSE
    MEAN_8x8_SSE
    MEAN_8x8_SSE
    MEAN_8x8_SSE

    ; psadbw keeps the whole sum in word 0 (max 8*2040 = 16320).
    pmaddwd mm6, [MMX_UNIT] ; collapse -> dword 0 = total sum
    movq mm7, mm6
    psrlq mm7, 32
    paddd mm6, mm7

    movd eax, mm6
    shr eax,6               ; /64 -> mean
;SKL_RDTSC_OUT
    emms
    ret

;//////////////////////////////////////////////////////////////////////
; Skl_Sqr_16x16_MMX
;//////////////////////////////////////////////////////////////////////

;-----------------------------------------------------------------------
; SQR_16x16_MMX — accumulate squared pixels of one 16-pixel row.
; In:      eax = row pointer, ecx = stride (BpS), mm6 = 0
; Accum:   mm7 += sum of squares (2 packed dword partial sums)
; Clobbers mm0-mm3; advances eax by one row.
;-----------------------------------------------------------------------
%macro SQR_16x16_MMX 0
    movq mm0, [eax]       ; left 8 pixels
    movq mm2, [eax+8]     ; right 8 pixels
    lea eax,[eax+ecx]     ; eax -> next row
    movq mm1, mm0
    movq mm3, mm2
    punpcklbw mm0, mm6    ; zero-extend bytes to words
    punpcklbw mm2, mm6
    punpckhbw mm1, mm6
    punpckhbw mm3, mm6
    pmaddwd mm0, mm0      ; square each word, sum pairs -> dwords
    pmaddwd mm1, mm1
    pmaddwd mm2, mm2
    pmaddwd mm3, mm3
    paddd mm7, mm0
    paddd mm7, mm1
    paddd mm7, mm2
    paddd mm7, mm3
%endmacro

;-----------------------------------------------------------------------
; uint32_t Skl_Sqr_16x16_MMX(uint8_t *Src, int32_t BpS)
; Returns the mean of the squared pixels of a 16x16 block:
; (sum of 256 squared pixels) >> 8. Max sum is 256*255^2 < 2^25,
; so 32-bit dword accumulation cannot overflow.
; cdecl, x86-32: args on the stack, result in eax.
; Clobbers: eax, ecx, mm0-mm3, mm6, mm7, flags.
;-----------------------------------------------------------------------
align 16
Skl_Sqr_16x16_MMX:
    emms
    mov eax, [esp+ 4] ; Src
    mov ecx, [esp+ 8] ; BpS

    pxor mm7, mm7 ; dword accumulator
    pxor mm6, mm6 ; constant zero for byte->word unpacking

.Loop:
    ; Fully unrolled: one invocation per row, 16 rows.
    SQR_16x16_MMX
    SQR_16x16_MMX
    SQR_16x16_MMX
    SQR_16x16_MMX
    SQR_16x16_MMX
    SQR_16x16_MMX
    SQR_16x16_MMX
    SQR_16x16_MMX

    SQR_16x16_MMX
    SQR_16x16_MMX
    SQR_16x16_MMX
    SQR_16x16_MMX
    SQR_16x16_MMX
    SQR_16x16_MMX
    SQR_16x16_MMX
    SQR_16x16_MMX

    movq mm6, mm7
    psrlq mm7, 32           ; bring high dword down
    paddd mm6, mm7          ; low dword = total sum of squares

    movd eax, mm6
    shr eax, 8              ; /256 -> mean of squares
    emms
    ret


;//////////////////////////////////////////////////////////////////////
; Skl_Sqr_8x8_MMX
;//////////////////////////////////////////////////////////////////////

;-----------------------------------------------------------------------
; SQR_8x8_MMX — accumulate squared pixels of TWO 8-pixel rows.
; In:      eax = row pointer, ecx = stride (BpS), mm6 = 0
; Accum:   mm7 += sum of squares (2 packed dword partial sums)
; Clobbers mm0-mm3; advances eax by two rows.
;-----------------------------------------------------------------------
%macro SQR_8x8_MMX  0
    movq mm0, [eax]       ; row 0
    movq mm2, [eax+ecx]   ; row 1
    lea eax,[eax+2*ecx]   ; eax -> two rows down
    movq mm1, mm0
    movq mm3, mm2
    punpcklbw mm0, mm6    ; zero-extend bytes to words
    punpcklbw mm2, mm6
    punpckhbw mm1, mm6
    punpckhbw mm3, mm6
    pmaddwd mm0, mm0      ; square each word, sum pairs -> dwords
    pmaddwd mm1, mm1
    pmaddwd mm2, mm2
    pmaddwd mm3, mm3
    paddd mm7, mm0
    paddd mm7, mm1
    paddd mm7, mm2
    paddd mm7, mm3
%endmacro

;-----------------------------------------------------------------------
; uint32_t Skl_Sqr_8x8_MMX(uint8_t *Src, int32_t BpS)
; Returns the mean of the squared pixels of an 8x8 block:
; (sum of 64 squared pixels) >> 6. Max sum is 64*255^2 < 2^23,
; so 32-bit dword accumulation cannot overflow.
; cdecl, x86-32: args on the stack, result in eax.
; Clobbers: eax, ecx, mm0-mm3, mm6, mm7, flags.
;-----------------------------------------------------------------------
align 16
Skl_Sqr_8x8_MMX:
    emms

    mov eax, [esp+ 4] ; Src
    mov ecx, [esp+ 8] ; BpS

    pxor mm7, mm7 ; dword accumulator
    pxor mm6, mm6 ; constant zero for byte->word unpacking

.Loop:
    ; Fully unrolled: each invocation handles two rows (8 rows total).
    SQR_8x8_MMX
    SQR_8x8_MMX
    SQR_8x8_MMX
    SQR_8x8_MMX

    movq mm6, mm7
    psrlq mm7, 32           ; bring high dword down
    paddd mm6, mm7          ; low dword = total sum of squares
    movd eax, mm6
    shr eax,6               ; /64 -> mean of squares
    emms
    ret

;//////////////////////////////////////////////////////////////////////

--=-QnEAzBT0gUWiYswfva1q--