[XviD-devel] SAD vs MSE for ME (warning! acronym overflow!)
skal
xvid-devel@xvid.org
04 Sep 2002 17:35:08 +0200
--=-QnEAzBT0gUWiYswfva1q
Content-Type: text/plain
Content-Transfer-Encoding: 7bit
> Gruel wrote:
>No, actually MSE/SSE would be the better choice (at least that's what
>theory says). MAD/SAD is only chosen because it's faster to compute and
>experiment (not mine) showed that it's not much of a difference.
OK, fine. In that case, I think something can be done to reduce the
HPel/QPel refinement steps for ME by using SSE...
More on this later (I need to test the idea first).
Do you have any 'theoretical' pointers?
BTW, I attach some ASM funcs you might find useful...
They compute the mean and the mean of the squared pixel values of 8x8 and 16x16 blocks.
later,
Skal
--=-QnEAzBT0gUWiYswfva1q
Content-Disposition: attachment; filename=skl_img_x86.asm
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; name=skl_img_x86.asm; charset=ISO-8859-1
;//////////////////////////////////////////////////////////////////////
; [BITS 32]
%include "../../include/skl_syst/skl_nasm.h"
; all signatures are: uint32_t Func(uint8 *Source, int32_t stride)
; Every routine below reads Source from [esp+4] and stride from [esp+8],
; and returns its result in eax.
; cglobal / DATA / TEXT are not defined in this file; presumably they are
; macros supplied by skl_nasm.h (symbol export and section switching) -
; TODO confirm against that header.
cglobal Skl_Mean_16x16_MMX
cglobal Skl_Mean_8x8_MMX
cglobal Skl_Mean_16x16_SSE
cglobal Skl_Mean_8x8_SSE
cglobal Skl_Sqr_16x16_MMX
cglobal Skl_Sqr_8x8_MMX
DATA
align 16
MMX_UNIT: dw 1, 1, 1, 1 ; four words of 1: pmaddwd against this adds adjacent word pairs into dwords
TEXT
;//////////////////////////////////////////////////////////////////////
; Skl_Mean_16x16_MMX
;
; uint32_t Skl_Mean_16x16_MMX(uint8 *Source, int32_t stride)
;   In:    [esp+4] = Source (top-left byte of the block)
;          [esp+8] = stride (byte offset from one row to the next)
;   Out:   eax = (sum of the 16x16 = 256 bytes) >> 8
;   Clobb: eax, ecx, flags, mm0-mm3, mm5-mm7 (emms is issued on exit)
;//////////////////////////////////////////////////////////////////////

; Adds one 16-byte row to the word accumulators and steps eax to the
; next row.  Expects mm7 = 0.
;   mm5 += words unpacked from bytes 0-3 and 8-11
;   mm6 += words unpacked from bytes 4-7 and 12-15
%macro MEAN_16x16_MMX 0
        movq      mm0, [eax]            ; bytes 0-7 of the row
        movq      mm2, [eax+8]          ; bytes 8-15 of the row
        lea       eax, [eax+ecx]        ; next row
        movq      mm1, mm0
        movq      mm3, mm2
        punpcklbw mm0, mm7              ; zero-extend bytes to words
        punpcklbw mm2, mm7
        punpckhbw mm1, mm7
        punpckhbw mm3, mm7
        paddw     mm5, mm0
        paddw     mm6, mm1
        paddw     mm5, mm2
        paddw     mm6, mm3
%endmacro

align 16
Skl_Mean_16x16_MMX:
        emms
        mov       eax, [esp+ 4]         ; Src
        mov       ecx, [esp+ 8]         ; BpS

        pxor      mm5, mm5              ; accums
        pxor      mm6, mm6              ; accums
        pxor      mm7, mm7              ; zero

%rep 16                                 ; fully unrolled: one expansion per row
        MEAN_16x16_MMX
%endrep

        ; Each word lane of mm5/mm6 holds at most 32*255 = 8160, so the
        ; merged lanes stay <= 16320: no saturation occurs and the signed
        ; multiply in pmaddwd cannot see a negative word.
        paddusw   mm6, mm5
        pmaddwd   mm6, [MMX_UNIT]       ; collapse: 4 words -> 2 dwords
        movq      mm7, mm6
        psrlq     mm7, 32               ; bring the high dword down
        paddd     mm6, mm7              ; low dword = total sum
        movd      eax, mm6
        shr       eax, 8                ; divide by 256 pixels
        emms
        ret
;//////////////////////////////////////////////////////////////////////
; Skl_Mean_8x8_MMX
;
; uint32_t Skl_Mean_8x8_MMX(uint8 *Source, int32_t stride)
;   In:    [esp+4] = Source (top-left byte of the block)
;          [esp+8] = stride (byte offset from one row to the next)
;   Out:   eax = (sum of the 8x8 = 64 bytes) >> 6
;   Clobb: eax, ecx, flags, mm0-mm3, mm5-mm7 (emms is issued on exit)
;//////////////////////////////////////////////////////////////////////

; Adds two consecutive 8-byte rows to the word accumulators and steps
; eax forward by two rows.  Expects mm7 = 0.
;   mm5 += words unpacked from bytes 0-3 of both rows
;   mm6 += words unpacked from bytes 4-7 of both rows
%macro MEAN_8x8_MMX 0
        movq      mm0, [eax]            ; row r
        movq      mm2, [eax+ecx]        ; row r+1
        lea       eax, [eax+2*ecx]      ; skip both rows
        movq      mm1, mm0
        movq      mm3, mm2
        punpcklbw mm0, mm7              ; zero-extend bytes to words
        punpcklbw mm2, mm7
        punpckhbw mm1, mm7
        punpckhbw mm3, mm7
        paddw     mm5, mm0
        paddw     mm6, mm1
        paddw     mm5, mm2
        paddw     mm6, mm3
%endmacro

align 16
Skl_Mean_8x8_MMX:
        emms
        mov       eax, [esp+ 4]         ; Src
        mov       ecx, [esp+ 8]         ; BpS

        pxor      mm5, mm5              ; accums
        pxor      mm6, mm6              ; accums
        pxor      mm7, mm7              ; zero

%rep 4                                  ; 4 expansions x 2 rows = 8 rows
        MEAN_8x8_MMX
%endrep

        ; Word lanes are at most 16*255 = 4080 after the merge, so the
        ; signed multiply in pmaddwd is safe.
        paddw     mm6, mm5
        pmaddwd   mm6, [MMX_UNIT]       ; collapse: 4 words -> 2 dwords
        movq      mm7, mm6
        psrlq     mm7, 32               ; bring the high dword down
        paddd     mm6, mm7              ; low dword = total sum
        movd      eax, mm6
        shr       eax, 6                ; divide by 64 pixels
        emms
        ret
;//////////////////////////////////////////////////////////////////////
; Skl_Mean_16x16_SSE
;
; uint32_t Skl_Mean_16x16_SSE(uint8 *Source, int32_t stride)
;   In:    [esp+4] = Source (top-left byte of the block)
;          [esp+8] = stride (byte offset from one row to the next)
;   Out:   eax = (sum of the 16x16 = 256 bytes) >> 8
;   Clobb: eax, ecx, flags, mm0, mm1, mm5-mm7 (emms is issued on exit)
;   Needs: psadbw on MMX registers
;//////////////////////////////////////////////////////////////////////

; Adds one 16-byte row to the accumulators and steps eax to the next
; row.  Expects mm7 = 0, so psadbw yields the plain byte sum of each
; 8-byte half in the low word of its destination (upper bits zero).
%macro MEAN_16x16_SSE 0
        movq      mm0, [eax]            ; bytes 0-7 of the row
        movq      mm1, [eax+8]          ; bytes 8-15 of the row
        psadbw    mm0, mm7              ; sum of bytes 0-7
        psadbw    mm1, mm7              ; sum of bytes 8-15
        add       eax, ecx              ; next row
        paddw     mm5, mm0
        paddw     mm6, mm1
%endmacro

align 16
Skl_Mean_16x16_SSE:
        emms
        mov       eax, [esp+ 4]         ; Src
        mov       ecx, [esp+ 8]         ; BpS

        pxor      mm5, mm5              ; accum
        pxor      mm6, mm6              ; accum
        pxor      mm7, mm7              ; zero

%rep 16                                 ; fully unrolled: one expansion per row
        MEAN_16x16_SSE
%endrep

        ; mm5 and mm6 each hold at most 128*255 = 32640 in their low word
        ; and zero elsewhere.  Their total can reach 65280, which fits an
        ; unsigned word but is negative as a signed word.  It must
        ; therefore NOT be passed through pmaddwd (signed multiply): just
        ; read the low dword, whose upper half is zero.
        paddw     mm6, mm5              ; no wrap: 65280 <= 0xFFFF
        movd      eax, mm6              ; eax = total sum
        shr       eax, 8                ; divide by 256 pixels
        emms
        ret
;//////////////////////////////////////////////////////////////////////
; Skl_Mean_8x8_SSE
;
; uint32_t Skl_Mean_8x8_SSE(uint8 *Source, int32_t stride)
;   In:    [esp+4] = Source (top-left byte of the block)
;          [esp+8] = stride (byte offset from one row to the next)
;   Out:   eax = (sum of the 8x8 = 64 bytes) >> 6
;   Clobb: eax, ecx, edx, flags, mm0, mm1, mm6, mm7 (emms is issued on exit)
;   Needs: psadbw on MMX registers
;//////////////////////////////////////////////////////////////////////

; Adds two consecutive 8-byte rows to mm6 and steps eax forward by two
; rows.  Expects mm7 = 0 and edx = 2*stride.
%macro MEAN_8x8_SSE 0
        movq      mm0, [eax]            ; row r
        movq      mm1, [eax+ecx]        ; row r+1
        psadbw    mm0, mm7              ; byte sum of row r
        add       eax, edx              ; skip both rows
        psadbw    mm1, mm7              ; byte sum of row r+1
        paddw     mm6, mm0
        paddw     mm6, mm1
%endmacro

align 16
Skl_Mean_8x8_SSE:
        emms
;SKL_RDTSC_IN
        mov       eax, [esp+ 4]         ; Src
        mov       ecx, [esp+ 8]         ; BpS
        pxor      mm6, mm6              ; accum
        pxor      mm7, mm7              ; zero
        lea       edx, [ecx+ecx]        ; edx = 2*BpS
        pxor      mm6, mm6              ; redundant re-zeroing, kept from the original (marked there as a NOP)

%rep 4                                  ; 4 expansions x 2 rows = 8 rows
        MEAN_8x8_SSE
%endrep

        ; mm6 holds the total (<= 64*255 = 16320) in its low word and zero
        ; everywhere else, so the low dword already is the sum.
        movd      eax, mm6
        shr       eax, 6                ; divide by 64 pixels
;SKL_RDTSC_OUT
        emms
        ret
;//////////////////////////////////////////////////////////////////////
; Skl_Sqr_16x16_MMX
;
; uint32_t Skl_Sqr_16x16_MMX(uint8 *Source, int32_t stride)
;   In:    [esp+4] = Source (top-left byte of the block)
;          [esp+8] = stride (byte offset from one row to the next)
;   Out:   eax = (sum of the squares of the 16x16 = 256 bytes) >> 8
;   Clobb: eax, ecx, flags, mm0-mm3, mm6, mm7 (emms is issued on exit)
;//////////////////////////////////////////////////////////////////////

; Adds the squares of one 16-byte row to the two dword lanes of mm7 and
; steps eax to the next row.  Expects mm6 = 0.
%macro SQR_16x16_MMX 0
        movq      mm0, [eax]            ; bytes 0-7 of the row
        movq      mm2, [eax+8]          ; bytes 8-15 of the row
        lea       eax, [eax+ecx]        ; next row
        movq      mm1, mm0
        movq      mm3, mm2
        punpcklbw mm0, mm6              ; zero-extend bytes to words
        punpcklbw mm2, mm6
        punpckhbw mm1, mm6
        punpckhbw mm3, mm6
        pmaddwd   mm0, mm0              ; x*x, adjacent pairs summed into dwords
        pmaddwd   mm1, mm1
        pmaddwd   mm2, mm2
        pmaddwd   mm3, mm3
        paddd     mm7, mm0
        paddd     mm7, mm1
        paddd     mm7, mm2
        paddd     mm7, mm3
%endmacro

align 16
Skl_Sqr_16x16_MMX:
        emms
        mov       eax, [esp+ 4]         ; Src
        mov       ecx, [esp+ 8]         ; BpS

        pxor      mm7, mm7              ; accum
        pxor      mm6, mm6              ; zero

%rep 16                                 ; fully unrolled: one expansion per row
        SQR_16x16_MMX
%endrep

        movq      mm6, mm7
        psrlq     mm7, 32               ; bring the high dword down
        paddd     mm6, mm7              ; low dword = total sum of squares
        movd      eax, mm6
        shr       eax, 8                ; divide by 256 pixels
        emms
        ret
;//////////////////////////////////////////////////////////////////////
; Skl_Sqr_8x8_MMX
;
; uint32_t Skl_Sqr_8x8_MMX(uint8 *Source, int32_t stride)
;   In:    [esp+4] = Source (top-left byte of the block)
;          [esp+8] = stride (byte offset from one row to the next)
;   Out:   eax = (sum of the squares of the 8x8 = 64 bytes) >> 6
;   Clobb: eax, ecx, flags, mm0-mm3, mm6, mm7 (emms is issued on exit)
;//////////////////////////////////////////////////////////////////////

; Adds the squares of two consecutive 8-byte rows to the two dword
; lanes of mm7 and steps eax forward by two rows.  Expects mm6 = 0.
%macro SQR_8x8_MMX 0
        movq      mm0, [eax]            ; row r
        movq      mm2, [eax+ecx]        ; row r+1
        lea       eax, [eax+2*ecx]      ; skip both rows
        movq      mm1, mm0
        movq      mm3, mm2
        punpcklbw mm0, mm6              ; zero-extend bytes to words
        punpcklbw mm2, mm6
        punpckhbw mm1, mm6
        punpckhbw mm3, mm6
        pmaddwd   mm0, mm0              ; x*x, adjacent pairs summed into dwords
        pmaddwd   mm1, mm1
        pmaddwd   mm2, mm2
        pmaddwd   mm3, mm3
        paddd     mm7, mm0
        paddd     mm7, mm1
        paddd     mm7, mm2
        paddd     mm7, mm3
%endmacro

align 16
Skl_Sqr_8x8_MMX:
        emms
        mov       eax, [esp+ 4]         ; Src
        mov       ecx, [esp+ 8]         ; BpS

        pxor      mm7, mm7              ; accum
        pxor      mm6, mm6              ; zero

%rep 4                                  ; 4 expansions x 2 rows = 8 rows
        SQR_8x8_MMX
%endrep

        movq      mm6, mm7
        psrlq     mm7, 32               ; bring the high dword down
        paddd     mm6, mm7              ; low dword = total sum of squares
        movd      eax, mm6
        shr       eax, 6                ; divide by 64 pixels
        emms
        ret
;//////////////////////////////////////////////////////////////////////
--=-QnEAzBT0gUWiYswfva1q--