[XviD-devel] dynamic hpel/qpel decision
skal
xvid-devel@xvid.org
04 Dec 2002 11:51:43 +0100
--=-WCokwskIzD+8NsdgrdDX
Content-Type: text/plain
Content-Transfer-Encoding: 7bit
Hi Radek and all,
On Tue, 2002-12-03 at 16:59, skal wrote:
> > On Mon, 2002-12-02 at 12:10, Radek Czyz wrote:
> >
> > > PS does someone feel like writing an asm-ed Sobel edge detection? It's
> > > not slow the way it is (copied from RefDivX's adaptive quantization
> > > code) but can always be faster.
> I've got some C-code here, for basic filtering.
> The corresponding MMX version has a little bug I
> need to find time to fix.
That was worth it! I found: some bugs (as usual),
some inconsistencies (rounding for gradx/grady),
and some decent speedups (mainly in C).
So here are some more reliable versions.
bye,
Skal
PS: I've also included the (commented-out) code for using a threshold
with the norm of gradient. Look around 'THRESH' and 'Thresh',
respectively...
--=-WCokwskIzD+8NsdgrdDX
Content-Disposition: attachment; filename=filter.c
Content-Transfer-Encoding: quoted-printable
Content-Type: text/x-c; name=filter.c; charset=ISO-8859-1
/*****************************************************************************
*
* XVID MPEG-4 VIDEO CODEC
* 4x4 filtering utilities
*
* Copyright(C) 2002 Pascal Massimino <skal@planet-d.net>
*
* This file is part of XviD, a free MPEG-4 video encoder/decoder
*
* XviD is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Under section 8 of the GNU General Public License, the copyright
* holders of XVID explicitly forbid distribution in the following
* countries:
*
* - Japan
* - United States of America
*
* Linking XviD statically or dynamically with other modules is making a
* combined work based on XviD. Thus, the terms and conditions of the
* GNU General Public License cover the whole combination.
*
* As a special exception, the copyright holders of XviD give you
* permission to link XviD with independent modules that communicate with
* XviD solely through the VFW1.1 and DShow interfaces, regardless of the
* license terms of these independent modules, and to copy and distribute
* the resulting combined work under terms of your choice, provided that
* every copy of the combined work is accompanied by a complete copy of
* the source code of XviD (the version of XviD used to produce the
* combined work), being distributed under the terms of the GNU General
* Public License plus this exception. An independent module is a module
* which is not derived from or based on XviD.
*
* Note that people who make modified versions of XviD are not obligated
* to grant this special exception for their modified versions; it is
* their choice whether to do so. The GNU General Public License gives
* permission to release a modified version without this exception; this
* exception also makes it possible to release a modified version which
* carries forward this exception.
*
* $Id: filter.c,v 1.11 2002/11/17 00:32:06 edgomez Exp $
*
 ****************************************************************************/
/* prototypes */

#include <stdint.h>

/*
 * 4x4 downsampling filters: each maps a 18x18 source block to an 8x8
 * result.  Dst_BpS / Src_BpS are the byte strides of the destination
 * and source buffers.  Gradx/Grady return signed gradients (int8_t),
 * Smooth and Grad2 (clamped gradient norm) return unsigned bytes.
 */

/* C version */
extern void xvid_Gradx_18x18_To_8x8_C(int8_t *Dst, int32_t Dst_BpS,
                                      const uint8_t *Src, int32_t Src_BpS);
extern void xvid_Grady_18x18_To_8x8_C(int8_t *Dst, int32_t Dst_BpS,
                                      const uint8_t *Src, int32_t Src_BpS);
extern void xvid_Grad2_18x18_To_8x8_C(uint8_t *Dst, int32_t Dst_BpS,
                                      const uint8_t *Src, int32_t Src_BpS);
extern void xvid_Smooth_18x18_To_8x8_C(uint8_t *Dst, int32_t Dst_BpS,
                                       const uint8_t *Src, int32_t Src_BpS);

/* MMX version */
extern void xvid_Gradx_18x18_To_8x8_mmx(int8_t *Dst, int32_t Dst_BpS,
                                        const uint8_t *Src, int32_t Src_BpS);
extern void xvid_Grady_18x18_To_8x8_mmx(int8_t *Dst, int32_t Dst_BpS,
                                        const uint8_t *Src, int32_t Src_BpS);
extern void xvid_Grad2_18x18_To_8x8_mmx(uint8_t *Dst, int32_t Dst_BpS,
                                        const uint8_t *Src, int32_t Src_BpS);
extern void xvid_Smooth_18x18_To_8x8_mmx(uint8_t *Dst, int32_t Dst_BpS,
                                         const uint8_t *Src, int32_t Src_BpS);
/*//////////////////////////////////////////////////////////
// Downsampling 4x4 filters:
//
// [1 3 3 1] [-1 -3 3 1] [-1 -3 -3 -1]
// Smooth:[3 9 9 3] Gx:[-3 -9 9 3] Gy:[-3 -9 -9 -3]
// [3 9 9 3] [-3 -9 9 3] [ 3 9 9 3]
// [1 3 3 1] [-1 -3 3 1] [ 1 3 3 1]
//
// Input:18x18 Output:8x8
//////////////////////////////////////////////////////////*/
/* push one output value and step to the next row of the current column */
#define STORE(x)  *d = (x); d += Dst_BpS

/*
 * Smoothing downsampler (kernel [1 3 3 1; 3 9 9 3; 3 9 9 3; 1 3 3 1]/64).
 * Reads rows -1..16 and columns -1..16 relative to Src (the 18x18 area);
 * writes the 8x8 result column by column.
 */
void xvid_Smooth_18x18_To_8x8_C(uint8_t *Dst, int32_t Dst_BpS,
                                const uint8_t *Src, int32_t Src_BpS)
{
  /* horizontal pass: weighted row sum 1,3,3,1 around even column 2*i */
#define LOAD(x)  (x) = 3*(s[1]+s[0]) + s[2]+s[-1]; s += Src_BpS
  int i;
  Src -= Src_BpS;                       /* filtering starts one row above Src */
  for(i=0; i<8; ++i)
  {
    int32_t mx0, mx1, tmp;              /* rolling vertical accumulators */
    int j;
    const uint8_t *s = Src + 2*i;
    uint8_t *d = Dst + i;
    LOAD(mx0);
    LOAD(tmp);
    mx0 += 3*tmp;
    for(j=4; j>0; --j) {                /* two output rows per iteration */
      LOAD(mx1); mx0 += 3*mx1;
      LOAD(tmp); mx0 += tmp; mx1 += 3*tmp;
      STORE( (32+mx0)>>6 );             /* round: total weight is 64 */
      LOAD(mx0); mx1 += 3*mx0;
      LOAD(tmp); mx1 += tmp; mx0 += 3*tmp;
      STORE( (32+mx1)>>6 );
    }
  }
#undef LOAD
}

/*
 * Horizontal gradient (signed), same 18x18 -> 8x8 downsampling scheme.
 * Output range after the >>7 fits in int8_t.
 */
void xvid_Gradx_18x18_To_8x8_C(int8_t *Dst, int32_t Dst_BpS,
                               const uint8_t *Src, int32_t Src_BpS)
{
  /* horizontal pass: antisymmetric row filter -1,-3,3,1 */
#define LOAD(x)  (x) = 3*(s[1]-s[0]) + s[2]-s[-1]; s += Src_BpS
  int i;
  Src -= Src_BpS;
  for(i=0; i<8; ++i)
  {
    int32_t mx0, mx1, tmp;
    int j;
    const uint8_t *s = Src + 2*i;
    int8_t *d = Dst + i;
    LOAD(mx0);
    LOAD(tmp);
    mx0 += 3*tmp;
    for(j=4; j>0; --j)
    {
      LOAD(mx1); mx0 += 3*mx1;
      LOAD(tmp); mx0 += tmp;
      mx1 += 3*tmp;
      STORE( (64+mx0)>>7 );             /* round; signed result */
      LOAD(mx0); mx1 += 3*mx0;
      LOAD(tmp); mx1 += tmp;
      mx0 += 3*tmp;
      STORE( (64+mx1)>>7 );
    }
  }
#undef LOAD
}

/*
 * Vertical gradient (signed).  Rows are smoothed horizontally (1,3,3,1)
 * and differenced vertically by mixing the accumulators with '-'.
 */
void xvid_Grady_18x18_To_8x8_C(int8_t *Dst, int32_t Dst_BpS,
                               const uint8_t *Src, int32_t Src_BpS)
{
#define LOAD(x)  (x) = 3*(s[1]+s[0]) + s[2]+s[-1]; s += Src_BpS
  int i;
  Src -= Src_BpS;
  for(i=0; i<8; ++i)
  {
    int32_t mx0, mx1, tmp;
    int j;
    const uint8_t *s = Src + 2*i;
    int8_t *d = Dst + i;
    LOAD(mx0);
    LOAD(tmp);
    mx0 += 3*tmp;
    for(j=4; j>0; --j)
    {
      LOAD(mx1); mx0 -= 3*mx1;
      LOAD(tmp); mx0 -= tmp;
      mx1 += 3*tmp;
      STORE( (64-mx0)>>7 );             /* sign flip folded into the rounding */
      LOAD(mx0); mx1 -= 3*mx0;
      LOAD(tmp); mx1 -= tmp;
      mx0 += 3*tmp;
      STORE( (64-mx1)>>7 );
    }
  }
#undef LOAD
}

/*
 * Squared gradient norm, clamped to 255.  mx* and my* hold the left/right
 * half row sums; their mix approximates Gx^2+Gy^2 (see comment below).
 * Uncomment the THRESH lines (and see 'Thresh' in the MMX file) to get a
 * binary edge map instead.
 */
void xvid_Grad2_18x18_To_8x8_C(uint8_t *Dst, int32_t Dst_BpS,
                               const uint8_t *Src, int32_t Src_BpS)
{
  /* split row filter: x = left half (1,3), y = right half (3,1) */
#define LOAD(x,y)  (x) = s[-1] + 3*s[0]; (y) = 3*s[1] + s[2]; s += Src_BpS
#define THRESH 24
  int i;
  Src -= Src_BpS;
  for(i=0; i<8; ++i)
  {
    int32_t mx0, mx1, my0, my1, tmpx, tmpy;
    int j;
    const uint8_t *s = Src + 2*i;
    uint8_t *d = Dst + i;
    LOAD(mx0,my0);
    LOAD(tmpx,tmpy);
    mx0 += 3*tmpx; my0 += 3*tmpy;

    for(j=4; j>0; --j)
    {
      LOAD(mx1,my1);   mx0 -= 3*my1; my0 -= 3*mx1;
      LOAD(tmpx,tmpy); mx0 -= tmpy;  my0 -= tmpx;
      mx1 += 3*tmpx; my1 += 3*tmpy;
      /* at this point:
           Gx = (64+mx0-my0)>>7
           Gy = (64-mx0-my0)>>7
         => Gx*Gx+Gy*Gy ~= 2*( mx0*mx0 + my0*my0 )
      */
      mx0 = (mx0+32)>>6; my0 = (my0+32)>>6;
      tmpx = mx0*mx0 + my0*my0;
      tmpx = (tmpx>255 ? 255 : tmpx);   /* clamp to byte range */
      /* tmpx = (tmpx>THRESH ? 255 : 0); */
      STORE( tmpx );
      LOAD(mx0,my0);   mx1 -= 3*my0; my1 -= 3*mx0;
      LOAD(tmpx,tmpy); mx1 -= tmpy;  my1 -= tmpx;
      mx0 += 3*tmpx; my0 += 3*tmpy;
      mx1 = (mx1+32)>>6; my1 = (my1+32)>>6;
      tmpx = mx1*mx1 + my1*my1;
      tmpx = (tmpx>255 ? 255 : tmpx);
      /* tmpx = (tmpx>THRESH ? 255 : 0); */
      STORE( tmpx );
    }
  }
#undef THRESH
#undef LOAD
}

#undef STORE
/*//////////////////////////////////////////////////////////*/
--=-WCokwskIzD+8NsdgrdDX
Content-Disposition: attachment; filename=filter_mmx.asm
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; name=filter_mmx.asm; charset=ISO-8859-1
;/*****************************************************************************
; *
; * XVID MPEG-4 VIDEO CODEC
; * 4x4 filtering utilities
; *
; * Copyright(C) 2002 Pascal Massimino <skal@planet-d.net>
; *
; * This file is part of XviD, a free MPEG-4 video encoder/decoder
; *
; * XviD is free software; you can redistribute it and/or modify it
; * under the terms of the GNU General Public License as published by
; * the Free Software Foundation; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * Under section 8 of the GNU General Public License, the copyright
; * holders of XVID explicitly forbid distribution in the following
; * countries:
; *
; * - Japan
; * - United States of America
; *
; * Linking XviD statically or dynamically with other modules is making a
; * combined work based on XviD. Thus, the terms and conditions of the
; * GNU General Public License cover the whole combination.
; *
; * As a special exception, the copyright holders of XviD give you
; * permission to link XviD with independent modules that communicate with
; * XviD solely through the VFW1.1 and DShow interfaces, regardless of the
; * license terms of these independent modules, and to copy and distribute
; * the resulting combined work under terms of your choice, provided that
; * every copy of the combined work is accompanied by a complete copy of
; * the source code of XviD (the version of XviD used to produce the
; * combined work), being distributed under the terms of the GNU General
; * Public License plus this exception. An independent module is a module
; * which is not derived from or based on XviD.
; *
; * Note that people who make modified versions of XviD are not obligated
; * to grant this special exception for their modified versions; it is
; * their choice whether to do so. The GNU General Public License gives
; * permission to release a modified version without this exception; this
; * exception also makes it possible to release a modified version which
; * carries forward this exception.
; *
; * $Id: filter_mmx.asm,v 1.5 2002/11/17 00:32:06 edgomez Exp $
; *
; ***************************************************************************/
bits 32

; export helper: prepends '_' when the platform's C symbols are underscored
%macro cglobal 1
  %ifdef PREFIX
    global _%1
    %define %1 _%1
  %else
    global %1
  %endif
%endmacro

cglobal xvid_Smooth_18x18_To_8x8_mmx
cglobal xvid_Gradx_18x18_To_8x8_mmx
cglobal xvid_Grady_18x18_To_8x8_mmx
cglobal xvid_Grad2_18x18_To_8x8_mmx

;===========================================================================

section .data
align 16

Cst3    dw  3, 3, 3, 3          ; filter weight, broadcast over 4 words
Cst32   dw 32,32,32,32          ; rounding bias for >>6
Cst64   dw 64,64,64,64          ; rounding bias for >>7
Mask_ff dw 0xff,0xff,0xff,0xff  ; keep even-position bytes as words
Thresh  dw 24,24,24,24          ; edge threshold (only used if enabled in MIX_G2)

;===========================================================================

section .text
;//////////////////////////////////////////////////////////////////////
;// 18x18 -> 8x8 block filtering
;//////////////////////////////////////////////////////////////////////

%macro ADD_TIMES 5  ; (%1) = (%1)+(%5)*(%3)   (%2) = (%2)+(%5)*(%4)
  pmullw %3, %5
  pmullw %4, %5
  paddsw %1, %3
  paddsw %2, %4
%endmacro

%macro SUB_TIMES 5  ; (%1) = (%1)-(%5)*(%3)   (%2) = (%2)-(%5)*(%4)
  pmullw %3, %5
  pmullw %4, %5
  psubsw %1, %3
  psubsw %2, %4
%endmacro

%macro LOAD_G2 3    ; %1-%2: Dst, %3: src.  mm6-mm7: trashed
  movq mm6, [Mask_ff]
  movq %1, [%3]     ; 01234567
  movq %2, [%3+1]   ; 12345678
  movq mm7, %2
  psrlq %2, 8
  pand %1, mm6      ;  0|2|4|6
  pand mm7, mm6     ;  1|3|5|7
  pand %2, mm6      ;  2|4|6|8
  pand mm6, [%3-1]  ; -1|1|3|5
  pmullw mm7, [Cst3]
  pmullw %1, [Cst3]
  paddsw %2, mm7    ; %2 = 3*odd + next-even   (right half of row filter)
  paddsw %1, mm6    ; %1 = 3*even + prev-odd   (left half of row filter)
%endmacro

%macro MIX_G2 2     ; %1-%2: regs. output: %1 = clamped (x>>6)^2+(y>>6)^2
  paddsw %1, [Cst32]
  paddsw %2, [Cst32]
  psraw %1, 6
  psraw %2, 6
  pmullw %1, %1
  pmullw %2, %2
  paddusw %1, %2    ; unsigned saturation = clamp of the squared norm
  ; uncomment the following instr for thresholding:
  ; pcmpgtw %1, [Thresh]
  ; and change 'packuswb' into 'packsswb' in STORE_G2 below
%endmacro

%macro STORE_G2 2   ; %1: op type (0: tmp word store, 1: final byte pack), %2: src offset
  LOAD_G2 mm2,mm3, edx+%2
  movq mm4,mm2
  movq mm5,mm3
  SUB_TIMES mm0,mm1, mm5,mm4, [Cst3]
  LOAD_G2 mm4,mm5, edx+eax+%2
  lea edx, [edx+2*eax]
  psubsw mm0, mm5
  psubsw mm1, mm4
  ADD_TIMES mm2,mm3, mm4,mm5, [Cst3]
  MIX_G2 mm0,mm1
%if (%1==1)
  packuswb mm0, [ecx]       ; low: this pass, high: words stored by pass 1
  ; packsswb mm0, [ecx]     ; <= use instead, for threshold
%endif
  movq [ecx], mm0
  LOAD_G2 mm0, mm1, edx+%2
  movq mm4,mm0
  movq mm5,mm1
  SUB_TIMES mm2,mm3, mm5,mm4, [Cst3]
  LOAD_G2 mm4,mm5, edx+eax+%2
  lea edx, [edx+2*eax]
  psubsw mm2, mm5
  psubsw mm3, mm4
  ADD_TIMES mm0,mm1, mm4,mm5, [Cst3]
  MIX_G2 mm2,mm3
%if (%1==1)
  packuswb mm2, [ecx+ebx]
  ; packsswb mm2, [ecx+ebx] ; <= use instead, for threshold
%endif
  movq [ecx+ebx], mm2
  lea ecx, [ecx+2*ebx]
%endmacro
align 16
;-----------------------------------------------------------------------
; void xvid_Grad2_18x18_To_8x8_mmx(uint8_t *Dst, int32_t Dst_BpS,
;                                  const uint8_t *Src, int32_t Src_BpS)
; cdecl, x86-32.  Clamped squared-gradient norm, 18x18 -> 8x8.
; Clobbers eax, ecx, edx and mm0-mm7 (no emms is emitted here).
;-----------------------------------------------------------------------
xvid_Grad2_18x18_To_8x8_mmx:
  push ebx
  mov ecx, [esp+4 +4]   ; Dst
  mov ebx, [esp+8 +4]   ; Dst_BpS
  mov edx, [esp+12 +4]  ; Src
  mov eax, [esp+16 +4]  ; Src_BpS
  sub edx, eax          ; filtering starts one row above Src

  ; pass 1: dst columns 4-7 (src offset +8), kept as words in Dst
  LOAD_G2 mm0,mm1, edx+8
  LOAD_G2 mm4,mm5, edx+eax+8
  lea edx, [edx+2*eax]
  ADD_TIMES mm0,mm1, mm4,mm5, [Cst3]
  STORE_G2 0, 8
  STORE_G2 0, 8
  STORE_G2 0, 8
  STORE_G2 0, 8

  ; pass 2: dst columns 0-3, packed with the stored words of pass 1
  mov ecx, [esp+4 +4]   ; Dst
  mov edx, [esp+12 +4]  ; Src
  sub edx, eax
  LOAD_G2 mm0,mm1, edx
  LOAD_G2 mm4,mm5, edx+eax
  lea edx, [edx+2*eax]
  ADD_TIMES mm0,mm1, mm4,mm5, [Cst3]
  STORE_G2 1, 0
  STORE_G2 1, 0
  STORE_G2 1, 0
  STORE_G2 1, 0
  pop ebx
  ret
;//////////////////////////////////////////////////////////////////////
;// for Gradx, Grady and Smooth, the scheme is different from
;// the C version (=> lower op count).

%macro LOAD_S 2     ; %1: Dst, %2: src.  mm4-mm6: trashed, mm7 must hold [Cst3]
  movq mm6, [Mask_ff]
  movq %1, [%2+1]   ; 12345678
  movq mm4, [%2]    ; 01234567
  movq mm5, %1
  pand mm4, mm6     ;  0|2|4|6
  pand mm5, mm6     ;  1|3|5|7
  psrlq %1, 8
  pand %1, mm6      ;  2|4|6|8
  pand mm6, [%2-1]  ; -1|1|3|5
  paddusw mm5, mm4
  paddusw %1, mm6
  pmullw mm5, mm7   ; x[Cst3]
  paddusw %1, mm5   ; row sum with weights 1,3,3,1
%endmacro

%macro MIX_S 4      ; %1-%4: regs. output: %1
  paddusw %1, [Cst32]
  paddusw %2, %3
  paddusw %1, %4
  pmullw %2, mm7    ; x[Cst3]
  paddusw %1, %2
  psraw %1, 6       ; total weight 64
%endmacro

%macro STORE_S 2    ; %1: op type (0: tmp word store, 1: final byte pack), %2: src offset
  LOAD_S mm2, edx+%2
  LOAD_S mm3, edx+eax+%2
  lea edx, [edx+2*eax]
  MIX_S mm0,mm1, mm2,mm3
%if (%1==1)
  packuswb mm0, [ecx]   ; low: this pass, high: words stored by pass 1
%endif
  movq [ecx], mm0
  LOAD_S mm0, edx+%2
  LOAD_S mm1, edx+eax+%2
  lea edx, [edx+2*eax]
  MIX_S mm2,mm3, mm0,mm1
%if (%1==1)
  packuswb mm2, [ecx+ebx]
%endif
  movq [ecx+ebx], mm2
  lea ecx, [ecx+2*ebx]
%endmacro
align 16
;-----------------------------------------------------------------------
; void xvid_Smooth_18x18_To_8x8_mmx(uint8_t *Dst, int32_t Dst_BpS,
;                                   const uint8_t *Src, int32_t Src_BpS)
; cdecl, x86-32.  4x4 smoothing downsampler, 18x18 -> 8x8.
; Clobbers eax, ecx, edx and mm0-mm7 (no emms is emitted here).
;-----------------------------------------------------------------------
xvid_Smooth_18x18_To_8x8_mmx:
  push ebx
  mov ecx, [esp+4 +4]   ; Dst
  mov ebx, [esp+8 +4]   ; Dst_BpS
  mov edx, [esp+12 +4]  ; Src
  mov eax, [esp+16 +4]  ; Src_BpS
  movq mm7, [Cst3]      ; kept live for LOAD_S / MIX_S
  sub edx, eax          ; filtering starts one row above Src

  ; pass 1: dst columns 4-7 (src offset +8), kept as words in Dst
  LOAD_S mm0, edx+8
  LOAD_S mm1, edx+eax+8
  lea edx, [edx+2*eax]
  STORE_S 0, 8
  STORE_S 0, 8
  STORE_S 0, 8
  STORE_S 0, 8

  ; pass 2: dst columns 0-3, packed with the stored words of pass 1
  mov ecx, [esp+4 +4]   ; Dst
  mov edx, [esp+12 +4]  ; Src
  sub edx, eax
  LOAD_S mm0, edx
  LOAD_S mm1, edx+eax
  lea edx, [edx+2*eax]
  STORE_S 1, 0
  STORE_S 1, 0
  STORE_S 1, 0
  STORE_S 1, 0
  pop ebx
  ret
;//////////////////////////////////////////////////////////////////////

%macro LOAD_GX 2    ; %1: Dst, %2: src.  mm4-mm6: trashed, mm7 must hold [Cst3]
  movq mm6, [Mask_ff]
  movq mm4, [%2]    ; 01234567
  movq %1, [%2+1]   ; 12345678
  movq mm5, %1
  psrlq %1, 8
  pand mm4, mm6     ;  0|2|4|6
  pand mm5, mm6     ;  1|3|5|7
  pand %1, mm6      ;  2|4|6|8
  pand mm6, [%2-1]  ; -1|1|3|5
  psubsw mm5, mm4
  psubsw %1, mm6
  pmullw mm5, mm7   ; x[Cst3]
  paddsw %1, mm5    ; row filter -1,-3,3,1
%endmacro

%macro MIX_GX 4     ; %1-%4: regs. output: %1
  paddsw %1, [Cst64]
  paddsw %2, %3
  paddsw %1, %4
  pmullw %2, mm7    ; x[Cst3]
  paddsw %1, %2
  psraw %1, 7
%endmacro

%macro STORE_GX 2   ; %1: op type (0: tmp word store, 1: final byte pack), %2: src offset
  LOAD_GX mm2, edx+%2
  LOAD_GX mm3, edx+eax+%2
  lea edx, [edx+2*eax]
  MIX_GX mm0,mm1, mm2,mm3
%if (%1==1)
  packsswb mm0, [ecx]   ; signed pack: low this pass, high words of pass 1
%endif
  movq [ecx], mm0
  LOAD_GX mm0, edx+%2
  LOAD_GX mm1, edx+eax+%2
  lea edx, [edx+2*eax]
  MIX_GX mm2,mm3, mm0,mm1
%if (%1==1)
  packsswb mm2, [ecx+ebx]
%endif
  movq [ecx+ebx], mm2
  lea ecx, [ecx+2*ebx]
%endmacro
align 16
;-----------------------------------------------------------------------
; void xvid_Gradx_18x18_To_8x8_mmx(int8_t *Dst, int32_t Dst_BpS,
;                                  const uint8_t *Src, int32_t Src_BpS)
; cdecl, x86-32.  Signed horizontal gradient, 18x18 -> 8x8.
; Clobbers eax, ecx, edx and mm0-mm7 (no emms is emitted here).
;-----------------------------------------------------------------------
xvid_Gradx_18x18_To_8x8_mmx:
  push ebx
  mov ecx, [esp+4 +4]   ; Dst
  mov ebx, [esp+8 +4]   ; Dst_BpS
  mov edx, [esp+12 +4]  ; Src
  mov eax, [esp+16 +4]  ; Src_BpS
  movq mm7, [Cst3]      ; kept live for LOAD_GX / MIX_GX
  sub edx, eax          ; filtering starts one row above Src

  ; pass 1: dst columns 4-7 (src offset +8), kept as words in Dst
  LOAD_GX mm0, edx+8
  LOAD_GX mm1, edx+eax+8
  lea edx, [edx+2*eax]
  STORE_GX 0, 8
  STORE_GX 0, 8
  STORE_GX 0, 8
  STORE_GX 0, 8

  ; pass 2: dst columns 0-3, packed with the stored words of pass 1
  mov ecx, [esp+4 +4]   ; Dst
  mov edx, [esp+12 +4]  ; Src
  sub edx, eax
  LOAD_GX mm0, edx
  LOAD_GX mm1, edx+eax
  lea edx, [edx+2*eax]
  STORE_GX 1, 0
  STORE_GX 1, 0
  STORE_GX 1, 0
  STORE_GX 1, 0
  pop ebx
  ret
;//////////////////////////////////////////////////////////////////////

%macro MIX_GY 4     ; %1-%4: regs. output: mm5 (vertical difference)
  movq mm5, [Cst64]
  psubsw %2, %3
  psubsw %1, %4
  pmullw %2, mm7    ; x[Cst3]
  psubsw mm5, %1
  psubsw mm5, %2
  psraw mm5, 7
%endmacro

%macro STORE_GY 2   ; %1: op type (0: tmp word store, 1: final byte pack), %2: src offset
  LOAD_S mm2, edx+%2
  LOAD_S mm3, edx+eax+%2
  lea edx, [edx+2*eax]
  MIX_GY mm0,mm1, mm2,mm3
%if (%1==1)
  packsswb mm5, [ecx]   ; signed pack: low this pass, high words of pass 1
%endif
  movq [ecx], mm5
  LOAD_S mm0, edx+%2
  LOAD_S mm1, edx+eax+%2
  lea edx, [edx+2*eax]
  MIX_GY mm2,mm3, mm0,mm1
%if (%1==1)
  packsswb mm5, [ecx+ebx]
%endif
  movq [ecx+ebx], mm5
  lea ecx, [ecx+2*ebx]
%endmacro
align 16
;-----------------------------------------------------------------------
; void xvid_Grady_18x18_To_8x8_mmx(int8_t *Dst, int32_t Dst_BpS,
;                                  const uint8_t *Src, int32_t Src_BpS)
; cdecl, x86-32.  Signed vertical gradient, 18x18 -> 8x8.
; Clobbers eax, ecx, edx and mm0-mm7 (no emms is emitted here).
;-----------------------------------------------------------------------
xvid_Grady_18x18_To_8x8_mmx:
  push ebx
  mov ecx, [esp+4 +4]   ; Dst
  mov ebx, [esp+8 +4]   ; Dst_BpS
  mov edx, [esp+12 +4]  ; Src
  mov eax, [esp+16 +4]  ; Src_BpS
  movq mm7, [Cst3]      ; kept live for LOAD_S / MIX_GY
  sub edx, eax          ; filtering starts one row above Src

  ; pass 1: dst columns 4-7 (src offset +8), kept as words in Dst
  LOAD_S mm0, edx+8
  LOAD_S mm1, edx+eax+8
  lea edx, [edx+2*eax]
  STORE_GY 0, 8
  STORE_GY 0, 8
  STORE_GY 0, 8
  STORE_GY 0, 8

  ; pass 2: dst columns 0-3, packed with the stored words of pass 1
  mov ecx, [esp+4 +4]   ; Dst
  mov edx, [esp+12 +4]  ; Src
  sub edx, eax
  LOAD_S mm0, edx
  LOAD_S mm1, edx+eax
  lea edx, [edx+2*eax]
  STORE_GY 1, 0
  STORE_GY 1, 0
  STORE_GY 1, 0
  STORE_GY 1, 0
  pop ebx
  ret
;//////////////////////////////////////////////////////////////////////
--=-WCokwskIzD+8NsdgrdDX--