[XviD-devel] need sad8bi for direct mode

Michael Militzer xvid-devel@xvid.org
Mon, 22 Jul 2002 13:46:58 +0200


Hi,

Just a side note: I see that you use pavgb instructions within the MMX code.
AFAIK pavgb is an XMM instruction, so I suppose the MMX version won't run on
"mmx-only" machines...

Michael
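
PS: in case it's useful, here is a rough, untested sketch of how the
rounded average could be done with plain MMX only, using the per-byte
identity (a+b+1)>>1 == (a|b) - ((a^b)>>1). The register choice and the
mmx_7f constant (dq 0x7f7f7f7f7f7f7f7f, analogous to the existing
mmx_one) are only meant as illustration, so it would still need to be
fitted into the existing macros:

    movq    mm1, [edx]      ; a = ref1
    movq    mm4, mm1
    por     mm1, [ebx]      ; a | b              (b = ref2)
    pxor    mm4, [ebx]      ; a ^ b
    psrlq   mm4, 1          ; 64-bit shift: each byte's msb picks up a stray bit
    pand    mm4, [mmx_7f]   ; clear those bits -> per-byte (a^b)>>1
    psubb   mm1, mm4        ; (a|b) - ((a^b)>>1) == (a+b+1)>>1, like pavgb

That is six instructions instead of one pavgb, so noticeably slower, but
it should at least run on mmx-only CPUs.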


----- Original Message -----
From: "peter ross" <suxen_drol@hotmail.com>
To: <xvid-devel@xvid.org>
Sent: Monday, July 22, 2002 6:51 AM
Subject: Re: [XviD-devel] need sad8bi for direct mode


> >From: Christoph Lampert <chl@math.uni-bonn.de>
> >Reply-To: xvid-devel@xvid.org
> >To: xvid-devel@xvid.org
> >Subject: [XviD-devel] need  sad8bi  for direct mode
> >Date: Sun, 21 Jul 2002 11:56:23 +0200 (CEST)
> >
> >Hi,
> >
> >I notice that direct mode for B-frames needs an interpolated SAD for
> >8x8 blocks if it is to work together with inter4v mode.
> >I'll add a C version, but MMX/XMM would of course be better.
> >
> >Also, it definitely needs halfpel interpolation, so currently the
> >only way to really test direct-mode B-frames is "with halfpel, but
> >without inter4v".
> >
>
> Done! They work in my rand() test environment, but please do some tests.
>
> cglobal  sad16bi_mmx
>
> ;===========================================================================
> ;
> ; uint32_t sad16bi_mmx(const uint8_t * const cur,
> ;                      const uint8_t * const ref1,
> ;                      const uint8_t * const ref2,
> ;                      const uint32_t stride);
> ;
> ;===========================================================================
>
> %macro SADBI_16x16_MMX 0
>     movq mm0, [eax] ; src
>     movq mm2, [eax+8]
>
>     movq mm1, [edx] ; ref1
>     movq mm3, [edx+8]
>     pavgb mm1, [ebx] ; ref2
>     lea edx,[edx+ecx]
>     pavgb mm3, [ebx+8]
>     lea ebx,[ebx+ecx]
>
>     movq mm4, mm0
>     lea eax,[eax+ecx]
>     psubusb mm0, mm1
>     movq mm5, mm2
>     psubusb mm2, mm3
>
>     psubusb mm1, mm4
>     por mm0, mm1
>     psubusb mm3, mm5
>     por mm2, mm3
>
>     movq mm1,mm0
>     movq mm3,mm2
>
>     punpcklbw mm0,mm7
>     punpckhbw mm1,mm7
>     punpcklbw mm2,mm7
>     punpckhbw mm3,mm7
>
>     paddusw mm0,mm1
>     paddusw mm2,mm3
>     paddusw mm6,mm0
>     paddusw mm6,mm2
> %endmacro
>
> align 16
> sad16bi_mmx:
>     push ebx
>     mov eax, [esp+4+ 4] ; Src
>     mov edx, [esp+4+ 8] ; Ref1
>     mov ebx, [esp+4+12] ; Ref2
>     mov ecx, [esp+4+16] ; Stride
>
>     pxor mm6, mm6 ; accumulator
>     pxor mm7, mm7 ; zero, used to unpack bytes to words
> .Loop:
>     SADBI_16x16_MMX
>     SADBI_16x16_MMX
>     SADBI_16x16_MMX
>     SADBI_16x16_MMX
>     SADBI_16x16_MMX
>     SADBI_16x16_MMX
>     SADBI_16x16_MMX
>     SADBI_16x16_MMX
>
>     SADBI_16x16_MMX
>     SADBI_16x16_MMX
>     SADBI_16x16_MMX
>     SADBI_16x16_MMX
>     SADBI_16x16_MMX
>     SADBI_16x16_MMX
>     SADBI_16x16_MMX
>     SADBI_16x16_MMX
>
>     pmaddwd mm6, [mmx_one] ; collapse word sums to dword sums
>     movq mm7, mm6
>     psrlq mm7, 32
>     paddd mm6, mm7 ; add high and low dword
>
>     movd eax, mm6
>
>     pop ebx
>
>     ret
>
>
>
> cglobal  sad8bi_mmx
>
> ;===========================================================================
> ;
> ; uint32_t sad8bi_mmx(const uint8_t * const cur,
> ;                     const uint8_t * const ref1,
> ;                     const uint8_t * const ref2,
> ;                     const uint32_t stride);
> ;
> ;===========================================================================
>
> %macro SADBI_8x8_MMX 0
>     movq mm0, [eax] ; src
>     movq mm2, [eax+ecx]
>
>     movq mm1, [edx] ; ref1
>     movq mm3, [edx+ecx]
>     pavgb mm1, [ebx] ; ref2
>     lea edx,[edx+2*ecx]
>     pavgb mm3, [ebx+ecx]
>     lea ebx,[ebx+2*ecx]
>
>     movq mm4, mm0
>     lea eax,[eax+2*ecx]
>     psubusb mm0, mm1
>     movq mm5, mm2
>     psubusb mm2, mm3
>
>     psubusb mm1, mm4
>     por mm0, mm1
>     psubusb mm3, mm5
>     por mm2, mm3
>
>     movq mm1,mm0
>     movq mm3,mm2
>
>     punpcklbw mm0,mm7
>     punpckhbw mm1,mm7
>     punpcklbw mm2,mm7
>     punpckhbw mm3,mm7
>
>     paddusw mm0,mm1
>     paddusw mm2,mm3
>     paddusw mm6,mm0
>     paddusw mm6,mm2
> %endmacro
>
> align 16
> sad8bi_mmx:
>     push ebx
>     mov eax, [esp+4+ 4] ; Src
>     mov edx, [esp+4+ 8] ; Ref1
>     mov ebx, [esp+4+12] ; Ref2
>     mov ecx, [esp+4+16] ; Stride
>
>     pxor mm6, mm6 ; accumulator
>     pxor mm7, mm7 ; zero, used to unpack bytes to words
> .Loop:
>     SADBI_8x8_MMX
>     SADBI_8x8_MMX
>     SADBI_8x8_MMX
>     SADBI_8x8_MMX
>
>     pmaddwd mm6, [mmx_one] ; collapse word sums to dword sums
>     movq mm7, mm6
>     psrlq mm7, 32
>     paddd mm6, mm7 ; add high and low dword
>
>     movd eax, mm6
>
>     pop ebx
>
>     ret
>
>
>
> cglobal  sad8bi_xmm
>
> ;===========================================================================
> ;
> ; uint32_t sad8bi_xmm(const uint8_t * const cur,
> ;                     const uint8_t * const ref1,
> ;                     const uint8_t * const ref2,
> ;                     const uint32_t stride);
> ;
> ;===========================================================================
>
> %macro SADBI_8x8_XMM 0
>     movq mm0, [eax]
>     movq mm1, [eax+ecx]
>
>     movq mm2, [edx]
>     movq mm3, [edx+ecx]
>
>     pavgb mm2, [ebx]
>     lea edx, [edx+2*ecx]
>
>     pavgb mm3, [ebx+ecx]
>     lea ebx, [ebx+2*ecx]
>
>     psadbw mm0, mm2
>     lea eax, [eax+2*ecx]
>
>     psadbw mm1, mm3
>     paddusw mm5,mm0
>
>     paddusw mm6,mm1
> %endmacro
>
> align 16
> sad8bi_xmm:
>     push ebx
>     mov eax, [esp+4+ 4] ; Src
>     mov edx, [esp+4+ 8] ; Ref1
>     mov ebx, [esp+4+12] ; Ref2
>     mov ecx, [esp+4+16] ; Stride
>
>     pxor mm5, mm5 ; accum1
>     pxor mm6, mm6 ; accum2
> .Loop:
>     SADBI_8x8_XMM
>     SADBI_8x8_XMM
>     SADBI_8x8_XMM
>     SADBI_8x8_XMM
>
>     paddusw mm6,mm5
>     movd eax, mm6
>     pop ebx
>     ret
>
>
>
>
> _______________________________________________
> XviD-devel mailing list
> XviD-devel@xvid.org
> http://list.xvid.org/mailman/listinfo/xvid-devel
>