[XviD-devel] asm-ed sad32v : attached
Jim Hauxwell
xvid-devel@xvid.org
Mon, 16 Dec 2002 19:27:55 -0000
This is a multi-part message in MIME format.
------=_NextPart_000_0003_01C2A539.3BEEF470
Content-Type: text/plain;
charset="iso-8859-1"
Content-Transfer-Encoding: 7bit
Hi,
I've attached the xxm version of sad32v. I wrote it in intrinsics, to which
the compiler I has done a very good job, and pasted it into the .asm. The
original C is included for reference. If this is OK, then I will clean it
up and repost.
Jim
------=_NextPart_000_0003_01C2A539.3BEEF470
Content-Type: text/plain;
name="sad_xmm.asm"
Content-Transfer-Encoding: quoted-printable
Content-Disposition: attachment;
filename="sad_xmm.asm"
;/***********************************************************************=
***
; *
; * XVID MPEG-4 VIDEO CODEC
; * xmm sum of absolute difference
; *
; * This program is an implementation of a part of one or more MPEG-4
; * Video tools as specified in ISO/IEC 14496-2 standard. Those =
intending
; * to use this software module in hardware or software products are
; * advised that its use may infringe existing patents or copyrights, =
and
; * any such use would be at such party's own risk. The original
; * developer of this software module and his/her company, and =
subsequent
; * editors and their companies, will have no liability for use of this
; * software or modifications or derivatives thereof.
; *
; * This program is free software; you can redistribute it and/or modify
; * it under the terms of the GNU General Public License as published by
; * the Free Software Foundation; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program; if not, write to the Free Software
; * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; *
; =
*************************************************************************=
/
;/***********************************************************************=
***
; *
; * History:
; *
; * 23.07.2002 sad8bi_xmm; <pross@xvid.org>
; * 04.06.2002 rewrote some funcs (XMM mainly) -Skal-
; * 17.11.2001 bugfix and small improvement for dev16_xmm,
; * removed terminate early in sad16_xmm (Isibaar)
; * 12.11.2001 inital version; (c)2001 peter ross <pross@cs.rmit.edu.au>
; *
; =
*************************************************************************=
/
bits 32
%macro cglobal 1=20
%ifdef PREFIX
global _%1=20
%define %1 _%1
%else
global %1
%endif
%endmacro
section .data
align 16
mmx_one times 4 dw 1
section .text
cglobal sad16_xmm
cglobal sad8_xmm
cglobal sad16bi_xmm
cglobal sad8bi_xmm
cglobal dev16_xmm
;=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D
;
; uint32_t sad16_xmm(const uint8_t * const cur,
; const uint8_t * const ref,
; const uint32_t stride,
; const uint32_t best_sad);
;
;=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D
%macro SAD_16x16_SSE 0
movq mm0, [eax]
psadbw mm0, [edx]
movq mm1, [eax+8]
add eax, ecx
psadbw mm1, [edx+8]
paddusw mm5,mm0
add edx, ecx
paddusw mm6,mm1
%endmacro
align 16
sad16_xmm:
mov eax, [esp+ 4] ; Src1
mov edx, [esp+ 8] ; Src2
mov ecx, [esp+12] ; Stride
pxor mm5, mm5 ; accum1
pxor mm6, mm6 ; accum2
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
paddusw mm6,mm5
movd eax, mm6
ret
;=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D
;
; uint32_t sad8_xmm(const uint8_t * const cur,
; const uint8_t * const ref,
; const uint32_t stride);
;
;=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D
%macro SAD_8x8_SSE 0
movq mm0, [eax]
movq mm1, [eax+ecx]
psadbw mm0, [edx]
psadbw mm1, [edx+ecx]
add eax, ebx
add edx, ebx
paddusw mm5,mm0
paddusw mm6,mm1
%endmacro
align 16
sad8_xmm:
mov eax, [esp+ 4] ; Src1
mov edx, [esp+ 8] ; Src2
mov ecx, [esp+12] ; Stride
push ebx
lea ebx, [ecx+ecx]
=20
pxor mm5, mm5 ; accum1
pxor mm6, mm6 ; accum2
SAD_8x8_SSE
SAD_8x8_SSE
SAD_8x8_SSE
movq mm0, [eax]
movq mm1, [eax+ecx]
psadbw mm0, [edx]
psadbw mm1, [edx+ecx]
pop ebx
paddusw mm5,mm0
paddusw mm6,mm1
paddusw mm6,mm5
movd eax, mm6
ret
;=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D
;
; uint32_t sad16bi_xmm(const uint8_t * const cur,
; const uint8_t * const ref1,
; const uint8_t * const ref2,
; const uint32_t stride);
;
;=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D
%macro SADBI_16x16_SSE 0
movq mm0, [eax]
movq mm1, [eax+8]
movq mm2, [edx]
movq mm3, [edx+8]
pavgb mm2, [ebx]
add edx, ecx
pavgb mm3, [ebx+8]
add ebx, ecx
psadbw mm0, mm2
add eax, ecx
psadbw mm1, mm3
paddusw mm5,mm0
paddusw mm6,mm1 =20
%endmacro
align 16
sad16bi_xmm:
push ebx
mov eax, [esp+4+ 4] ; Src
mov edx, [esp+4+ 8] ; Ref1
mov ebx, [esp+4+12] ; Ref2
mov ecx, [esp+4+16] ; Stride
pxor mm5, mm5 ; accum1
pxor mm6, mm6 ; accum2
SADBI_16x16_SSE
SADBI_16x16_SSE
SADBI_16x16_SSE
SADBI_16x16_SSE
SADBI_16x16_SSE
SADBI_16x16_SSE
SADBI_16x16_SSE
SADBI_16x16_SSE
SADBI_16x16_SSE
SADBI_16x16_SSE
SADBI_16x16_SSE
SADBI_16x16_SSE
SADBI_16x16_SSE
SADBI_16x16_SSE
SADBI_16x16_SSE
SADBI_16x16_SSE
paddusw mm6,mm5
movd eax, mm6
pop ebx
ret
;=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=20
;=20
; uint32_t sad8bi_xmm(const uint8_t * const cur,=20
; const uint8_t * const ref1,=20
; const uint8_t * const ref2,=20
; const uint32_t stride);=20
;=20
;=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=20
%macro SADBI_8x8_XMM 0=20
movq mm0, [eax]=20
movq mm1, [eax+ecx]=20
movq mm2, [edx]=20
movq mm3, [edx+ecx]=20
pavgb mm2, [ebx]=20
lea edx, [edx+2*ecx]=20
pavgb mm3, [ebx+ecx]=20
lea ebx, [ebx+2*ecx]=20
psadbw mm0, mm2=20
lea eax, [eax+2*ecx]=20
psadbw mm1, mm3=20
paddusw mm5,mm0=20
paddusw mm6,mm1=20
%endmacro=20
align 16=20
sad8bi_xmm:=20
push ebx=20
mov eax, [esp+4+ 4] ; Src=20
mov edx, [esp+4+ 8] ; Ref1=20
mov ebx, [esp+4+12] ; Ref2=20
mov ecx, [esp+4+16] ; Stride=20
pxor mm5, mm5 ; accum1=20
pxor mm6, mm6 ; accum2=20
.Loop=20
SADBI_8x8_XMM=20
SADBI_8x8_XMM=20
SADBI_8x8_XMM=20
SADBI_8x8_XMM=20
paddusw mm6,mm5=20
movd eax, mm6=20
pop ebx=20
ret=20
;=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D
;
; uint32_t dev16_xmm(const uint8_t * const cur,
; const uint32_t stride);
;
;=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D
%macro MEAN_16x16_SSE 0
movq mm0, [eax]
movq mm1, [eax+8]
psadbw mm0, mm7
psadbw mm1, mm7
add eax, ecx
paddw mm5, mm0=20
paddw mm6, mm1
%endmacro
=20
%macro ABS_16x16_SSE 0
movq mm0, [eax]
movq mm1, [eax+8]
psadbw mm0, mm4
psadbw mm1, mm4
lea eax,[eax+ecx]
paddw mm5, mm0
paddw mm6, mm1
%endmacro
align 16
dev16_xmm:
mov eax, [esp+ 4] ; Src
mov ecx, [esp+ 8] ; Stride
=20
pxor mm7, mm7 ; zero
pxor mm5, mm5 ; mean accums
pxor mm6, mm6
MEAN_16x16_SSE
MEAN_16x16_SSE
MEAN_16x16_SSE
MEAN_16x16_SSE
MEAN_16x16_SSE
MEAN_16x16_SSE
MEAN_16x16_SSE
MEAN_16x16_SSE
MEAN_16x16_SSE
MEAN_16x16_SSE
MEAN_16x16_SSE
MEAN_16x16_SSE
MEAN_16x16_SSE
MEAN_16x16_SSE
MEAN_16x16_SSE
MEAN_16x16_SSE
paddusw mm6, mm5
movq mm4, mm6
psllq mm4, 32
paddd mm4, mm6
psrld mm4, 8 ; /=3D (16*16)
packssdw mm4, mm4
packuswb mm4, mm4
; mm4 contains the mean
mov eax, [esp+ 4] ; Src
pxor mm5, mm5 ; sums
pxor mm6, mm6
ABS_16x16_SSE
ABS_16x16_SSE
ABS_16x16_SSE
ABS_16x16_SSE
ABS_16x16_SSE
ABS_16x16_SSE
ABS_16x16_SSE
ABS_16x16_SSE
ABS_16x16_SSE
ABS_16x16_SSE
ABS_16x16_SSE
ABS_16x16_SSE
ABS_16x16_SSE
ABS_16x16_SSE
ABS_16x16_SSE
ABS_16x16_SSE
paddusw mm6, mm5
movq mm7, mm6
psllq mm7, 32=20
paddd mm6, mm7
movd eax, mm6
ret
cglobal sad16v_xmm
;=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D
;int sad16v_xmm(const uint8_t * const cur,
; const uint8_t * const ref,
; const uint32_t stride,
; int* sad8);
;=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D
align 16
sad16v_xmm:
push ebx
mov eax, [esp+4+ 4] ; Src1
mov edx, [esp+4+ 8] ; Src2
mov ecx, [esp+4+12] ; Stride
mov ebx, [esp+4+16] ; sad ptr
pxor mm5, mm5 ; accum1
pxor mm6, mm6 ; accum2
pxor mm7, mm7 ; total
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
paddusw mm7, mm5
paddusw mm7, mm6
movd [ebx], mm5
movd [ebx+4], mm6
pxor mm5, mm5 ; accum1
pxor mm6, mm6 ; accum2
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
SAD_16x16_SSE
paddusw mm7, mm5
paddusw mm7, mm6
movd [ebx+8], mm5
movd [ebx+12], mm6
movd eax, mm7
pop ebx
ret
;--------
;=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D
;uint32_t sad32v_xmm(const uint8_t * const cur,=20
; const uint8_t * const ref,=20
; const uint32_t stride,=20
; int32_t *sad)
;{
; int i;
; unsigned int sad0 =3D 0;
; unsigned int sad1 =3D 0;
; unsigned int sad2 =3D 0;
; unsigned int sad3 =3D 0;
; uint8_t const *ptr_cur =3D cur;
; uint8_t const *ptr_ref =3D ref;
;
;#pragma unroll(8)
; for (i =3D 0; i < 8; i++, ptr_cur +=3D stride, ptr_ref +=3D stride)
; {
; /* first 8 pixels */
; __m64 src_high, ref_high, src_rhigh, ref_rhigh;
; __m64 src_low =3D ((__m64 *)ptr_cur)[0];
; __m64 ref_low =3D ((__m64 *)ptr_ref)[0];
; __m64 sad_mmx =3D _mm_sad_pu8(src_low, ref_low);
; /* this is added to sad0 */
; sad0 +=3D _mm_cvtsi64_si32(sad_mmx);
; /* second 8 pixels */
; src_high =3D ((__m64 *)ptr_cur)[1];
; ref_high =3D ((__m64 *)ptr_ref)[1];
; sad_mmx =3D _mm_sad_pu8(src_high, ref_high);
; /* this is added to both sad0 & sad1 */
; sad0 +=3D _mm_cvtsi64_si32(sad_mmx);
; sad1 +=3D _mm_cvtsi64_si32(sad_mmx);
; /* last 8 pixels */
; src_rhigh =3D ((__m64 *)ptr_cur)[2];
; ref_rhigh =3D ((__m64 *)ptr_ref)[2];
; sad_mmx =3D _mm_sad_pu8(src_rhigh, ref_rhigh);
; /* this is added to sad1 */
; sad1 +=3D _mm_cvtsi64_si32(sad_mmx);
; }
;
;#pragma unroll(8)
; for (i =3D 0; i < 8; i++, ptr_cur +=3D stride, ptr_ref +=3D stride)
; {
; /* first 8 pixels */
; __m64 src_high, ref_high, src_rhigh, ref_rhigh;
; __m64 src_low =3D ((__m64 *)ptr_cur)[0];
; __m64 ref_low =3D ((__m64 *)ptr_ref)[0];
; __m64 sad_mmx =3D _mm_sad_pu8(src_low, ref_low);
; /* this is added to sad0 and sad2 */
; sad0 +=3D _mm_cvtsi64_si32(sad_mmx);
; sad2 +=3D _mm_cvtsi64_si32(sad_mmx);
; /* second 8 pixels */
; src_high =3D ((__m64 *)ptr_cur)[1];
; ref_high =3D ((__m64 *)ptr_ref)[1];
; sad_mmx =3D _mm_sad_pu8(src_high, ref_high);
; /* this is added to both sad0, sad1, sad2 & sad3 */
; sad0 +=3D _mm_cvtsi64_si32(sad_mmx);
; sad1 +=3D _mm_cvtsi64_si32(sad_mmx);
; sad2 +=3D _mm_cvtsi64_si32(sad_mmx);
; sad3 +=3D _mm_cvtsi64_si32(sad_mmx);
; /* last 8 pixels */
; src_rhigh =3D ((__m64 *)ptr_cur)[2];
; ref_rhigh =3D ((__m64 *)ptr_ref)[2];
; sad_mmx =3D _mm_sad_pu8(src_rhigh, ref_rhigh);
; /* this is added to sad3 */
; sad3 +=3D _mm_cvtsi64_si32(sad_mmx);
; }
;
;#pragma unroll(8)
; for (i =3D 0; i < 8; i++, ptr_cur +=3D stride, ptr_ref +=3D stride)
; {
; /* first 8 pixels */
; __m64 src_high, ref_high, src_rhigh, ref_rhigh;
; __m64 src_low =3D ((__m64 *)ptr_cur)[0];
; __m64 ref_low =3D ((__m64 *)ptr_ref)[0];
; __m64 sad_mmx =3D _mm_sad_pu8(src_low, ref_low);
; /* this is added to sad2 */
; sad2 +=3D _mm_cvtsi64_si32(sad_mmx);
; /* second 8 pixels */
; src_high =3D ((__m64 *)ptr_cur)[1];
; ref_high =3D ((__m64 *)ptr_ref)[1];
; sad_mmx =3D _mm_sad_pu8(src_high, ref_high);
; /* this is added to both sad2 & sad3 */
; sad2 +=3D _mm_cvtsi64_si32(sad_mmx);
; sad3 +=3D _mm_cvtsi64_si32(sad_mmx);
; /* last 8 pixels */
; src_rhigh =3D ((__m64 *)ptr_cur)[2];
; ref_rhigh =3D ((__m64 *)ptr_ref)[2];
; sad_mmx =3D _mm_sad_pu8(src_rhigh, ref_rhigh);
; /* this is added to sad1 */
; sad3 +=3D _mm_cvtsi64_si32(sad_mmx);
; }
;
; /* set the result value */
; sad[0] =3D sad0;
; sad[1] =3D sad1;
; sad[2] =3D sad2;
; sad[3] =3D sad3;
;
; /* return the result */
; return sad0 + sad1 + sad2 + sad3;
;};
;=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D
cglobal sad32v_xmm
align 16
sad32v_xmm:
$LN373:
push edi ;315.1
push esi ;315.1
push ebp ;315.1
push ebx ;315.1
sub esp, 20 ;315.1
$LN374:
mov ecx, [esp+40] ;311.10
mov ebp, [esp+44] ;311.10
$LN375:
movq mm0, [ecx] ;329.29
$LN376:
movq mm7, [ecx+8] ;335.24
$LN377:
movq mm1, [ecx+16] ;342.25
$LN378:
psadbw mm0, [ebp] ;331.40
$LN379:
psadbw mm7, [ebp+8] ;337.35
$LN380:
psadbw mm1, [ebp+16] ;344.36
$LN381:
mov edx, [esp+48] ;325.26
$LN382:
lea edi, [ebp+edx] ;325.45
$LN383:
movd eax, mm0 ;333.28
$LN384:
movd esi, mm7 ;339.28
$LN385:
add eax, esi ;339.3
$LN386:
movd ebx, mm1 ;346.28
$LN387:
add ebx, esi ;346.3
$LN388:
lea esi, [ecx+edx] ;325.26
$LN389:
movq mm5, [esi] ;329.29
$LN390:
movq mm6, [esi+8] ;335.24
$LN391:
movq mm2, [esi+16] ;342.25
$LN392:
psadbw mm5, [edi] ;331.40
$LN393:
psadbw mm6, [edi+8] ;337.35
$LN394:
psadbw mm2, [edi+16] ;344.36
$LN395:
movd ecx, mm5 ;333.28
$LN396:
add eax, ecx ;333.3
$LN397:
add esi, edx ;325.26
$LN398:
movq mm3, [esi] ;329.29
$LN399:
movq mm4, [esi+8] ;335.24
$LN400:
movq mm0, [esi+16] ;342.25
$LN401:
add edi, edx ;325.45
$LN402:
psadbw mm3, [edi] ;331.40
$LN403:
psadbw mm4, [edi+8] ;337.35
$LN404:
psadbw mm0, [edi+16] ;344.36
$LN405:
add esi, edx ;325.26
$LN406:
add edi, edx ;325.45
$LN407:
movd ecx, mm6 ;339.28
$LN408:
add eax, ecx ;339.3
$LN409:
add ebx, ecx ;340.3
$LN410:
movd ecx, mm2 ;346.28
$LN411:
add ecx, ebx ;346.3
$LN412:
movd ebx, mm3 ;333.28
$LN413:
add eax, ebx ;333.3
$LN414:
movd ebx, mm4 ;339.28
$LN415:
add eax, ebx ;339.3
$LN416:
add ecx, ebx ;340.3
$LN417:
movd ebx, mm0 ;346.28
$LN418:
movq mm0, [esi] ;329.29
$LN419:
psadbw mm0, [edi] ;331.40
$LN420:
add ebx, ecx ;346.3
$LN421:
movd ecx, mm0 ;333.28
$LN422:
movq mm0, [esi+8] ;335.24
$LN423:
psadbw mm0, [edi+8] ;337.35
$LN424:
add eax, ecx ;333.3
$LN425:
movd ecx, mm0 ;339.28
$LN426:
movq mm0, [esi+16] ;342.25
$LN427:
psadbw mm0, [edi+16] ;344.36
$LN428:
add eax, ecx ;339.3
$LN429:
add ebx, ecx ;340.3
$LN430:
add esi, edx ;325.26
$LN431:
add edi, edx ;325.45
$LN432:
movd ecx, mm0 ;346.28
$LN433:
movq mm0, [esi] ;329.29
$LN434:
psadbw mm0, [edi] ;331.40
$LN435:
add ecx, ebx ;346.3
$LN436:
movd ebx, mm0 ;333.28
$LN437:
movq mm0, [esi+8] ;335.24
$LN438:
psadbw mm0, [edi+8] ;337.35
$LN439:
add eax, ebx ;333.3
$LN440:
movd ebx, mm0 ;339.28
$LN441:
movq mm0, [esi+16] ;342.25
$LN442:
psadbw mm0, [edi+16] ;344.36
$LN443:
add eax, ebx ;339.3
$LN444:
add ecx, ebx ;340.3
$LN445:
add esi, edx ;325.26
$LN446:
add edi, edx ;325.45
$LN447:
movd ebp, mm0 ;346.28
$LN448:
movq mm0, [esi] ;329.29
$LN449:
psadbw mm0, [edi] ;331.40
$LN450:
add ebp, ecx ;346.3
$LN451:
movd ecx, mm0 ;333.28
$LN452:
movq mm0, [esi+8] ;335.24
$LN453:
psadbw mm0, [edi+8] ;337.35
$LN454:
add eax, ecx ;333.3
$LN455:
movd ecx, mm0 ;339.28
$LN456:
movq mm0, [esi+16] ;342.25
$LN457:
psadbw mm0, [edi+16] ;344.36
$LN458:
add eax, ecx ;339.3
$LN459:
add ebp, ecx ;340.3
$LN460:
add esi, edx ;325.26
$LN461:
add edi, edx ;325.45
$LN462:
movd ebx, mm0 ;346.28
$LN463:
movq mm0, [esi] ;329.29
$LN464:
psadbw mm0, [edi] ;331.40
$LN465:
add ebx, ebp ;346.3
$LN466:
movd ecx, mm0 ;333.28
$LN467:
movq mm0, [esi+8] ;335.24
$LN468:
psadbw mm0, [edi+8] ;337.35
$LN469:
add eax, ecx ;333.3
$LN470:
movd ecx, mm0 ;339.28
$LN471:
movq mm0, [esi+16] ;342.25
$LN472:
psadbw mm0, [edi+16] ;344.36
$LN473:
add eax, ecx ;339.3
$LN474:
add ebx, ecx ;340.3
$LN475:
add esi, edx ;325.26
$LN476:
add edi, edx ;325.45
$LN477:
movd ecx, mm0 ;346.28
$LN478:
movq mm0, [esi] ;329.29
$LN479:
psadbw mm0, [edi] ;331.40
$LN480:
add ecx, ebx ;346.3
$LN481:
movd ebx, mm0 ;333.28
$LN482:
movq mm0, [esi+8] ;335.24
$LN483:
psadbw mm0, [edi+8] ;337.35
$LN484:
add eax, ebx ;333.3
$LN485:
movd ebx, mm0 ;339.28
$LN486:
movq mm0, [esi+16] ;342.25
$LN487:
psadbw mm0, [edi+16] ;344.36
$LN488:
add eax, ebx ;339.3
$LN489:
add ecx, ebx ;340.3
$LN490:
movd ebx, mm0 ;346.28
$LN491:
add ebx, ecx ;346.3
mov [esp], ebx ;346.3
$LN492:
lea ecx, [edx+esi] ;325.26
$LN493:
lea edx, [edx+edi] ;325.45
; LOE eax edx ecx ebx esi edi bl bh
.B9.3: ; Preds .B9.2
$LN494:
mov ebp, ebx ;366.3
mov [esp+8], edx ;
$LN495:
mov edx, [esp+48] ;354.29
movq mm2, [esi+edx] ;354.29
$LN496:
movq mm3, [esi+edx+8] ;361.24
$LN497:
psadbw mm2, [edi+edx] ;356.40
$LN498:
psadbw mm3, [edi+edx+8] ;363.35
$LN499:
movd ebx, mm2 ;358.28
$LN500:
add eax, ebx ;358.3
$LN501:
movd edx, mm3 ;365.28
mov [esp+4], edx ;365.28
$LN502:
add eax, edx ;365.3
$LN503:
add ebp, edx ;366.3
$LN504:
mov [esp+12], eax ;365.3
$LN505:
mov eax, [esp+48] ;370.25
movq mm1, [esi+eax+16] ;370.25
$LN506:
psadbw mm1, [edi+eax+16] ;372.36
$LN507:
add ebx, edx ;367.3
$LN508:
movd edi, mm1 ;374.28
$LN509:
add edi, [esp+4] ;374.3
$LN510:
mov eax, [esp+48] ;350.26
add ecx, eax ;350.26
$LN511:
movq mm0, [ecx] ;354.29
$LN512:
movq mm7, [ecx+8] ;361.24
$LN513:
movq mm4, [ecx+16] ;370.25
$LN514:
mov edx, [esp+8] ;350.45
add edx, eax ;350.45
$LN515:
psadbw mm0, [edx] ;356.40
$LN516:
psadbw mm7, [edx+8] ;363.35
$LN517:
psadbw mm4, [edx+16] ;372.36
$LN518:
mov eax, [esp+12] ;358.3
$LN519:
movd esi, mm0 ;358.28
$LN520:
add eax, esi ;358.3
$LN521:
add ebx, esi ;359.3
$LN522:
movd esi, mm7 ;365.28
$LN523:
add eax, esi ;365.3
$LN524:
add ebp, esi ;366.3
$LN525:
add ebx, esi ;367.3
$LN526:
add edi, esi ;368.3
$LN527:
movd esi, mm4 ;374.28
$LN528:
add esi, edi ;374.3
$LN529:
mov edi, [esp+48] ;350.26
add ecx, edi ;350.26
$LN530:
movq mm6, [ecx] ;354.29
$LN531:
movq mm5, [ecx+8] ;361.24
$LN532:
movq mm0, [ecx+16] ;370.25
$LN533:
add edx, edi ;350.45
$LN534:
psadbw mm6, [edx] ;356.40
$LN535:
psadbw mm5, [edx+8] ;363.35
$LN536:
psadbw mm0, [edx+16] ;372.36
$LN537:
movd edi, mm6 ;358.28
$LN538:
add eax, edi ;358.3
$LN539:
add ebx, edi ;359.3
$LN540:
movd edi, mm5 ;365.28
$LN541:
add eax, edi ;365.3
$LN542:
add ebp, edi ;366.3
$LN543:
add ebx, edi ;367.3
$LN544:
add esi, edi ;368.3
$LN545:
movd edi, mm0 ;374.28
$LN546:
add edi, esi ;374.3
$LN547:
mov esi, [esp+48] ;350.26
add ecx, esi ;350.26
$LN548:
movq mm0, [ecx] ;354.29
$LN549:
add edx, esi ;350.45
$LN550:
psadbw mm0, [edx] ;356.40
$LN551:
movd esi, mm0 ;358.28
$LN552:
movq mm0, [ecx+8] ;361.24
$LN553:
psadbw mm0, [edx+8] ;363.35
$LN554:
add eax, esi ;358.3
$LN555:
add ebx, esi ;359.3
$LN556:
movd esi, mm0 ;365.28
$LN557:
movq mm0, [ecx+16] ;370.25
$LN558:
psadbw mm0, [edx+16] ;372.36
$LN559:
add eax, esi ;365.3
$LN560:
add ebp, esi ;366.3
$LN561:
add ebx, esi ;367.3
$LN562:
add edi, esi ;368.3
$LN563:
movd esi, mm0 ;374.28
$LN564:
add esi, edi ;374.3
$LN565:
mov edi, [esp+48] ;350.26
add ecx, edi ;350.26
$LN566:
movq mm0, [ecx] ;354.29
$LN567:
add edx, edi ;350.45
$LN568:
psadbw mm0, [edx] ;356.40
$LN569:
movd edi, mm0 ;358.28
$LN570:
movq mm0, [ecx+8] ;361.24
$LN571:
psadbw mm0, [edx+8] ;363.35
$LN572:
add eax, edi ;358.3
$LN573:
add ebx, edi ;359.3
$LN574:
movd edi, mm0 ;365.28
$LN575:
movq mm0, [ecx+16] ;370.25
$LN576:
psadbw mm0, [edx+16] ;372.36
$LN577:
add eax, edi ;365.3
$LN578:
add ebp, edi ;366.3
$LN579:
add ebx, edi ;367.3
$LN580:
add esi, edi ;368.3
$LN581:
movd edi, mm0 ;374.28
$LN582:
add edi, esi ;374.3
$LN583:
mov esi, [esp+48] ;350.26
add ecx, esi ;350.26
$LN584:
movq mm0, [ecx] ;354.29
$LN585:
add edx, esi ;350.45
$LN586:
psadbw mm0, [edx] ;356.40
$LN587:
movd esi, mm0 ;358.28
$LN588:
movq mm0, [ecx+8] ;361.24
$LN589:
psadbw mm0, [edx+8] ;363.35
$LN590:
add eax, esi ;358.3
$LN591:
add ebx, esi ;359.3
$LN592:
movd esi, mm0 ;365.28
$LN593:
movq mm0, [ecx+16] ;370.25
$LN594:
psadbw mm0, [edx+16] ;372.36
$LN595:
add eax, esi ;365.3
$LN596:
add ebp, esi ;366.3
$LN597:
add ebx, esi ;367.3
$LN598:
add edi, esi ;368.3
$LN599:
movd esi, mm0 ;374.28
$LN600:
add esi, edi ;374.3
$LN601:
mov edi, [esp+48] ;350.26
add ecx, edi ;350.26
$LN602:
movq mm0, [ecx] ;354.29
$LN603:
add edx, edi ;350.45
$LN604:
psadbw mm0, [edx] ;356.40
$LN605:
movd edi, mm0 ;358.28
$LN606:
movq mm0, [ecx+8] ;361.24
$LN607:
psadbw mm0, [edx+8] ;363.35
$LN608:
add eax, edi ;358.3
$LN609:
add ebx, edi ;359.3
$LN610:
movd edi, mm0 ;365.28
$LN611:
movq mm0, [ecx+16] ;370.25
$LN612:
psadbw mm0, [edx+16] ;372.36
$LN613:
add eax, edi ;365.3
$LN614:
add ebp, edi ;366.3
$LN615:
add ebx, edi ;367.3
$LN616:
add esi, edi ;368.3
$LN617:
movd edi, mm0 ;374.28
$LN618:
add edi, esi ;374.3
$LN619:
mov esi, [esp+48] ;350.26
add ecx, esi ;350.26
$LN620:
movq mm0, [ecx] ;354.29
$LN621:
add edx, esi ;350.45
$LN622:
psadbw mm0, [edx] ;356.40
$LN623:
movd esi, mm0 ;358.28
$LN624:
movq mm0, [ecx+8] ;361.24
$LN625:
psadbw mm0, [edx+8] ;363.35
$LN626:
add eax, esi ;358.3
$LN627:
add ebx, esi ;359.3
$LN628:
movd esi, mm0 ;365.28
$LN629:
movq mm0, [ecx+16] ;370.25
$LN630:
psadbw mm0, [edx+16] ;372.36
$LN631:
add eax, esi ;365.3
$LN632:
add ebp, esi ;366.3
mov [esp], ebp ;366.3
$LN633:
add ebx, esi ;367.3
$LN634:
add edi, esi ;368.3
$LN635:
movd ebp, mm0 ;374.28
$LN636:
add ebp, edi ;374.3
$LN637:
mov esi, [esp+48] ;350.26
$LN638:
mov [esp+4], ebp ;374.3
$LN639:
lea ebp, [esi+ecx] ;350.26
$LN640:
lea esi, [esi+edx] ;350.45
; LOE eax edx ecx ebx ebp esi
.B9.4: ; Preds .B9.3
$LN641:
mov edi, [esp+48] ;382.29
movq mm7, [ecx+edi] ;382.29
$LN642:
movq mm1, [ecx+edi+8] ;388.24
$LN643:
movq mm2, [ecx+edi+16] ;395.25
$LN644:
psadbw mm7, [edx+edi] ;384.40
$LN645:
psadbw mm1, [edx+edi+8] ;390.35
$LN646:
psadbw mm2, [edx+edi+16] ;397.36
mov [esp+12], eax ;
$LN647:
add ebp, edi ;378.26
$LN648:
movq mm0, [ebp] ;382.29
$LN649:
movq mm3, [ebp+8] ;388.24
$LN650:
movq mm4, [ebp+16] ;395.25
$LN651:
add esi, edi ;378.45
$LN652:
psadbw mm0, [esi] ;384.40
$LN653:
psadbw mm3, [esi+8] ;390.35
$LN654:
psadbw mm4, [esi+16] ;397.36
$LN655:
add ebp, edi ;378.26
$LN656:
movq mm5, [ebp] ;382.29
$LN657:
movq mm6, [ebp+8] ;388.24
$LN658:
add esi, edi ;378.45
$LN659:
psadbw mm5, [esi] ;384.40
$LN660:
psadbw mm6, [esi+8] ;390.35
$LN661:
movd eax, mm7 ;386.28
$LN662:
add ebx, eax ;386.3
$LN663:
movd eax, mm1 ;392.28
$LN664:
add ebx, eax ;392.3
mov [esp+16], ebx ;392.3
$LN665:
mov ebx, [esp+4] ;393.3
$LN666:
mov edx, [esp+16] ;386.3
$LN667:
add ebx, eax ;393.3
$LN668:
movd ecx, mm2 ;399.28
$LN669:
add ebx, ecx ;399.3
$LN670:
movd eax, mm0 ;386.28
$LN671:
movq mm0, [ebp+16] ;395.25
$LN672:
psadbw mm0, [esi+16] ;397.36
$LN673:
mov ecx, [esp+12] ;403.2
$LN674:
add edx, eax ;386.3
$LN675:
add ebp, edi ;378.26
$LN676:
add esi, edi ;378.45
$LN677:
movd eax, mm3 ;392.28
$LN678:
add edx, eax ;392.3
$LN679:
add ebx, eax ;393.3
$LN680:
movd eax, mm4 ;399.28
$LN681:
add ebx, eax ;399.3
$LN682:
movd eax, mm5 ;386.28
$LN683:
add edx, eax ;386.3
$LN684:
movd eax, mm6 ;392.28
$LN685:
add edx, eax ;392.3
$LN686:
add ebx, eax ;393.3
$LN687:
movd eax, mm0 ;399.28
$LN688:
movq mm0, [ebp] ;382.29
$LN689:
psadbw mm0, [esi] ;384.40
$LN690:
add ebx, eax ;399.3
$LN691:
movd eax, mm0 ;386.28
$LN692:
movq mm0, [ebp+8] ;388.24
$LN693:
psadbw mm0, [esi+8] ;390.35
$LN694:
add edx, eax ;386.3
$LN695:
movd eax, mm0 ;392.28
$LN696:
movq mm0, [ebp+16] ;395.25
$LN697:
psadbw mm0, [esi+16] ;397.36
$LN698:
add edx, eax ;392.3
$LN699:
add ebx, eax ;393.3
$LN700:
add ebp, edi ;378.26
$LN701:
add esi, edi ;378.45
$LN702:
movd eax, mm0 ;399.28
$LN703:
movq mm0, [ebp] ;382.29
$LN704:
psadbw mm0, [esi] ;384.40
$LN705:
add ebx, eax ;399.3
$LN706:
movd eax, mm0 ;386.28
$LN707:
movq mm0, [ebp+8] ;388.24
$LN708:
psadbw mm0, [esi+8] ;390.35
$LN709:
add edx, eax ;386.3
$LN710:
movd eax, mm0 ;392.28
$LN711:
movq mm0, [ebp+16] ;395.25
$LN712:
psadbw mm0, [esi+16] ;397.36
$LN713:
add edx, eax ;392.3
$LN714:
add ebx, eax ;393.3
$LN715:
add ebp, edi ;378.26
$LN716:
add esi, edi ;378.45
$LN717:
movd eax, mm0 ;399.28
$LN718:
movq mm0, [ebp] ;382.29
$LN719:
psadbw mm0, [esi] ;384.40
$LN720:
add ebx, eax ;399.3
$LN721:
movd eax, mm0 ;386.28
$LN722:
movq mm0, [ebp+8] ;388.24
$LN723:
psadbw mm0, [esi+8] ;390.35
$LN724:
add edx, eax ;386.3
$LN725:
movd eax, mm0 ;392.28
$LN726:
movq mm0, [ebp+16] ;395.25
$LN727:
psadbw mm0, [esi+16] ;397.36
$LN728:
add edx, eax ;392.3
$LN729:
add ebx, eax ;393.3
$LN730:
add ebp, edi ;378.26
$LN731:
add esi, edi ;378.45
$LN732:
movd eax, mm0 ;399.28
$LN733:
movq mm0, [ebp] ;382.29
$LN734:
psadbw mm0, [esi] ;384.40
$LN735:
add ebx, eax ;399.3
$LN736:
movd eax, mm0 ;386.28
$LN737:
movq mm0, [ebp+8] ;388.24
$LN738:
psadbw mm0, [esi+8] ;390.35
$LN739:
add edx, eax ;386.3
$LN740:
movd eax, mm0 ;392.28
$LN741:
movq mm0, [ebp+16] ;395.25
$LN742:
psadbw mm0, [esi+16] ;397.36
$LN743:
add edx, eax ;392.3
$LN744:
add ebx, eax ;393.3
$LN745:
movd eax, mm0 ;399.28
$LN746:
movq mm0, [ebp+edi] ;382.29
$LN747:
psadbw mm0, [esi+edi] ;384.40
$LN748:
add ebx, eax ;399.3
$LN749:
movd eax, mm0 ;386.28
$LN750:
movq mm0, [ebp+edi+8] ;388.24
$LN751:
psadbw mm0, [esi+edi+8] ;390.35
$LN752:
add edx, eax ;386.3
$LN753:
movd eax, mm0 ;392.28
$LN754:
movq mm0, [ebp+edi+16] ;395.25
$LN755:
psadbw mm0, [esi+edi+16] ;397.36
$LN756:
add edx, eax ;392.3
$LN757:
mov esi, [esp+52] ;403.2
$LN758:
add ebx, eax ;393.3
$LN759:
mov eax, [esp] ;404.2
$LN760:
mov [esi], ecx ;403.2
$LN761:
mov [esi+4], eax ;404.2
$LN762:
movd ebp, mm0 ;399.28
$LN763:
add ebx, ebp ;399.3
$LN764:
mov [esi+8], edx ;405.2
$LN765:
mov [esi+12], ebx ;406.2
$LN766:
add ecx, eax ;409.16
$LN767:
add ecx, edx ;409.23
$LN768:
add ecx, ebx ;409.30
mov eax, ecx ;409.30
add esp, 20 ;409.30
pop ebx ;409.30
pop ebp ;409.30
pop esi ;409.30
pop edi ;409.30
ret ;409.30
;--------
------=_NextPart_000_0003_01C2A539.3BEEF470--