[XviD-devel] need sad8bi for direct mode
Michael Militzer
xvid-devel@xvid.org
Mon, 22 Jul 2002 13:46:58 +0200
Hi,
just a sidenote: I see that you use pavgb instructions within the mmx code.
AFAIK pavgb is an XMM instruction, so I suppose the mmx version won't run on
"mmx-only" machines...
Michael
----- Original Message -----
From: "peter ross" <suxen_drol@hotmail.com>
To: <xvid-devel@xvid.org>
Sent: Monday, July 22, 2002 6:51 AM
Subject: Re: [XviD-devel] need sad8bi for direct mode
> >From: Christoph Lampert <chl@math.uni-bonn.de>
> >Reply-To: xvid-devel@xvid.org
> >To: xvid-devel@xvid.org
> >Subject: [XviD-devel] need sad8bi for direct mode
> >Date: Sun, 21 Jul 2002 11:56:23 +0200 (CEST)
> >
> >Hi,
> >
> >I notice that direct mode for bframes needs interpolated sad for
> >8x8 blocks, if it should work together with inter4v mode.
> >I'll add a C-version, but MMX/XMM would of course be better.
> >
> >Also, it definitely needs halfpel interpolation, so currently the
> >only way to really test direct mode b-frames is "with halfpel, but
> >without inter4v".
> >
>
> done! they work in my rand() test environment, but please do some tests
>
> cglobal sad16bi_mmx
>
> ;===========================================================================
> ;
> ; uint32_t sad16bi_mmx(const uint8_t * const cur,
> ; const uint8_t * const ref1,
> ; const uint8_t * const ref2,
> ; const uint32_t stride);
> ;
>
> ;===========================================================================
>
> %macro SADBI_16x16_MMX 0
> movq mm0, [eax] ; src
> movq mm2, [eax+8]
>
> movq mm1, [edx] ; ref1
> movq mm3, [edx+8]
> pavgb mm1, [ebx] ; ref2
> lea edx,[edx+ecx]
> pavgb mm3, [ebx+8]
> lea ebx,[ebx+ecx]
>
> movq mm4, mm0
> lea eax,[eax+ecx]
> psubusb mm0, mm1
> movq mm5, mm2
> psubusb mm2, mm3
>
> psubusb mm1, mm4
> por mm0, mm1
> psubusb mm3, mm5
> por mm2, mm3
>
> movq mm1,mm0
> movq mm3,mm2
>
> punpcklbw mm0,mm7
> punpckhbw mm1,mm7
> punpcklbw mm2,mm7
> punpckhbw mm3,mm7
>
> paddusw mm0,mm1
> paddusw mm2,mm3
> paddusw mm6,mm0
> paddusw mm6,mm2
> %endmacro
>
> align 16
> sad16bi_mmx:
> push ebx
> mov eax, [esp+4+ 4] ; Src
> mov edx, [esp+4+ 8] ; Ref1
> mov ebx, [esp+4+12] ; Ref2
> mov ecx, [esp+4+16] ; Stride
>
> pxor mm6, mm6 ; accum2
> pxor mm7, mm7
> .Loop
> SADBI_16x16_MMX
> SADBI_16x16_MMX
> SADBI_16x16_MMX
> SADBI_16x16_MMX
> SADBI_16x16_MMX
> SADBI_16x16_MMX
> SADBI_16x16_MMX
> SADBI_16x16_MMX
>
> SADBI_16x16_MMX
> SADBI_16x16_MMX
> SADBI_16x16_MMX
> SADBI_16x16_MMX
> SADBI_16x16_MMX
> SADBI_16x16_MMX
> SADBI_16x16_MMX
> SADBI_16x16_MMX
>
> pmaddwd mm6, [mmx_one] ; collapse
> movq mm7, mm6
> psrlq mm7, 32
> paddd mm6, mm7
>
> movd eax, mm6
>
> pop ebx
>
> ret
>
>
>
> cglobal sad8bi_mmx
>
> ;===========================================================================
> ;
> ; uint32_t sad8bi_mmx(const uint8_t * const cur,
> ; const uint8_t * const ref1,
> ; const uint8_t * const ref2,
> ; const uint32_t stride);
> ;
>
> ;===========================================================================
>
> %macro SADBI_8x8_MMX 0
> movq mm0, [eax] ; src
> movq mm2, [eax+ecx]
>
> movq mm1, [edx] ; ref1
> movq mm3, [edx+ecx]
> pavgb mm1, [ebx] ; ref2
> lea edx,[edx+2*ecx]
> pavgb mm3, [ebx+ecx]
> lea ebx,[ebx+2*ecx]
>
> movq mm4, mm0
> lea eax,[eax+2*ecx]
> psubusb mm0, mm1
> movq mm5, mm2
> psubusb mm2, mm3
>
> psubusb mm1, mm4
> por mm0, mm1
> psubusb mm3, mm5
> por mm2, mm3
>
> movq mm1,mm0
> movq mm3,mm2
>
> punpcklbw mm0,mm7
> punpckhbw mm1,mm7
> punpcklbw mm2,mm7
> punpckhbw mm3,mm7
>
> paddusw mm0,mm1
> paddusw mm2,mm3
> paddusw mm6,mm0
> paddusw mm6,mm2
> %endmacro
>
> align 16
> sad8bi_mmx:
> push ebx
> mov eax, [esp+4+ 4] ; Src
> mov edx, [esp+4+ 8] ; Ref1
> mov ebx, [esp+4+12] ; Ref2
> mov ecx, [esp+4+16] ; Stride
>
> pxor mm6, mm6 ; accum2
> pxor mm7, mm7
> .Loop
> SADBI_8x8_MMX
> SADBI_8x8_MMX
> SADBI_8x8_MMX
> SADBI_8x8_MMX
>
> pmaddwd mm6, [mmx_one] ; collapse
> movq mm7, mm6
> psrlq mm7, 32
> paddd mm6, mm7
>
> movd eax, mm6
>
> pop ebx
>
> ret
>
>
>
> cglobal sad8bi_xmm
>
> ;===========================================================================
> ;
> ; uint32_t sad8bi_xmm(const uint8_t * const cur,
> ; const uint8_t * const ref1,
> ; const uint8_t * const ref2,
> ; const uint32_t stride);
> ;
>
> ;===========================================================================
>
> %macro SADBI_8x8_XMM 0
> movq mm0, [eax]
> movq mm1, [eax+ecx]
>
> movq mm2, [edx]
> movq mm3, [edx+ecx]
>
> pavgb mm2, [ebx]
> lea edx, [edx+2*ecx]
>
> pavgb mm3, [ebx+ecx]
> lea ebx, [ebx+2*ecx]
>
> psadbw mm0, mm2
> lea eax, [eax+2*ecx]
>
> psadbw mm1, mm3
> paddusw mm5,mm0
>
> paddusw mm6,mm1
> %endmacro
>
> align 16
> sad8bi_xmm:
> push ebx
> mov eax, [esp+4+ 4] ; Src
> mov edx, [esp+4+ 8] ; Ref1
> mov ebx, [esp+4+12] ; Ref2
> mov ecx, [esp+4+16] ; Stride
>
> pxor mm5, mm5 ; accum1
> pxor mm6, mm6 ; accum2
> .Loop
> SADBI_8x8_XMM
> SADBI_8x8_XMM
> SADBI_8x8_XMM
> SADBI_8x8_XMM
>
> paddusw mm6,mm5
> movd eax, mm6
> pop ebx
> ret
>
>
>
> _________________________________________________________________
> Chat with friends online, try MSN Messenger: http://messenger.msn.com
>
> _______________________________________________
> XviD-devel mailing list
> XviD-devel@xvid.org
> http://list.xvid.org/mailman/listinfo/xvid-devel
>