[XviD-devel] dynamic hpel/qpel decision

skal xvid-devel@xvid.org
04 Dec 2002 11:51:43 +0100


--=-WCokwskIzD+8NsdgrdDX
Content-Type: text/plain
Content-Transfer-Encoding: 7bit


	Hi Radek and all,

On Tue, 2002-12-03 at 16:59, skal wrote:
> > On Mon, 2002-12-02 at 12:10, Radek Czyz wrote:
> > 
> > > PS does someone feel like writing an asm-ed Sobel edge detection? It's
> > > not slow the way it is (copied from RefDivX's adaptive quantization
> > > code) but can always be faster.

> 	I've got some C-code here, for basic filtering.
> 	The corresponding MMX version has a little bug I
> 	need to find time to fix. 

	That was worth it! I found: some bugs (as usual),
	some inconsistencies (rounding for gradx/grady),
	and some decent speedups (mainly in C).
	So here are some more reliable versions.
	bye,
		Skal

PS: I've also included the code (commented) for using a threshold
with the norm of gradient. Look around 'THRESH' and 'Thresh',
respectively...


--=-WCokwskIzD+8NsdgrdDX
Content-Disposition: attachment; filename=filter.c
Content-Transfer-Encoding: quoted-printable
Content-Type: text/x-c; name=filter.c; charset=ISO-8859-1

/**************************************************************************=
***
 *
 *  XVID MPEG-4 VIDEO CODEC
 *   4x4 filtering utilities
 *
 *  Copyright(C) 2002 Pascal Massimino <skal@planet-d.net>
 *
 *  This file is part of XviD, a free MPEG-4 video encoder/decoder
 *
 *  XviD is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 US=
A
 *
 *  Under section 8 of the GNU General Public License, the copyright
 *  holders of XVID explicitly forbid distribution in the following
 *  countries:
 *
 *    - Japan
 *    - United States of America
 *
 *  Linking XviD statically or dynamically with other modules is making a
 *  combined work based on XviD.  Thus, the terms and conditions of the
 *  GNU General Public License cover the whole combination.
 *
 *  As a special exception, the copyright holders of XviD give you
 *  permission to link XviD with independent modules that communicate with
 *  XviD solely through the VFW1.1 and DShow interfaces, regardless of the
 *  license terms of these independent modules, and to copy and distribute
 *  the resulting combined work under terms of your choice, provided that
 *  every copy of the combined work is accompanied by a complete copy of
 *  the source code of XviD (the version of XviD used to produce the
 *  combined work), being distributed under the terms of the GNU General
 *  Public License plus this exception.  An independent module is a module
 *  which is not derived from or based on XviD.
 *
 *  Note that people who make modified versions of XviD are not obligated
 *  to grant this special exception for their modified versions; it is
 *  their choice whether to do so.  The GNU General Public License gives
 *  permission to release a modified version without this exception; this
 *  exception also makes it possible to release a modified version which
 *  carries forward this exception.
 *
 * $Id: filter.c,v 1.11 2002/11/17 00:32:06 edgomez Exp $
 *
 **************************************************************************=
**/

  /* prototypes */

#include <stdint.h>

    /* C version */

extern void xvid_Gradx_18x18_To_8x8_C(int8_t *Dst, int32_t Dst_BpS,
                                      const uint8_t *Src, int32_t Src_BpS);
extern void xvid_Grady_18x18_To_8x8_C(int8_t *Dst, int32_t Dst_BpS,
                                      const uint8_t *Src, int32_t Src_BpS);
extern void xvid_Grad2_18x18_To_8x8_C(uint8_t *Dst, int32_t Dst_BpS,
                                      const uint8_t *Src, int32_t Src_BpS);
extern void xvid_Smooth_18x18_To_8x8_C(uint8_t *Dst, int32_t Dst_BpS,
                                       const uint8_t *Src, int32_t Src_BpS)=
;

    /* MMX version */

extern void xvid_Gradx_18x18_To_8x8_mmx(int8_t *Dst, int32_t Dst_BpS,
                                        const uint8_t *Src, int32_t Src_BpS=
);
extern void xvid_Grady_18x18_To_8x8_mmx(int8_t *Dst, int32_t Dst_BpS,
                                        const uint8_t *Src, int32_t Src_BpS=
);
extern void xvid_Grad2_18x18_To_8x8_mmx(uint8_t *Dst, int32_t Dst_BpS,
                                        const uint8_t *Src, int32_t Src_BpS=
);
extern void xvid_Smooth_18x18_To_8x8_mmx(uint8_t *Dst, int32_t Dst_BpS,
                                         const uint8_t *Src, int32_t Src_Bp=
S);


/*//////////////////////////////////////////////////////////
// Downsampling 4x4 filters:
//
//        [1 3 3 1]      [-1 -3  3  1]     [-1 -3 -3 -1]
// Smooth:[3 9 9 3]   Gx:[-3 -9  9  3]  Gy:[-3 -9 -9 -3]
//        [3 9 9 3]      [-3 -9  9  3]     [ 3  9  9  3]
//        [1 3 3 1]      [-1 -3  3  1]     [ 1  3  3  1]
//
//  Input:18x18   Output:8x8
//////////////////////////////////////////////////////////*/

/* STORE: write one filtered value to *d, then step d down one
 * destination row.  Used by all four filters below. */
#define STORE(x)   *d =3D (x); d +=3D Dst_BpS

/* Smooth: 2:1 downsample of an 18x18 source window to 8x8 using
 * the separable [1 3 3 1] kernel (2-D weight sum 64, hence the
 * (32+x)>>6 rounding).  Dst_BpS / Src_BpS are row strides in
 * bytes.  Output column i reads source columns 2*i-1 .. 2*i+2;
 * the 18x18 input window starts one pixel above and to the left
 * of Src (note the initial Src adjustment below).
 */
void xvid_Smooth_18x18_To_8x8_C(uint8_t *Dst, int32_t Dst_BpS,
                                const uint8_t *Src, int32_t Src_BpS)
{
  /* LOAD: horizontal [1 3 3 1] tap at *s, then step one row down */
#define LOAD(x) (x) =3D 3*(s[1]+s[0]) +s[2]+s[-1]; s +=3D Src_BpS

  int i;

  Src -=3D Src_BpS;
  for(i=3D0; i<8; ++i)
  {
    int32_t mx0, mx1, tmp;
    int j;
    const uint8_t *s =3D Src + 2*i;
    uint8_t *d =3D Dst + i;

    LOAD(mx0);
    LOAD(tmp);
    mx0 +=3D 3*tmp;

    /* mx0/mx1 carry rolling vertical partial sums; each pass of
       the loop consumes 4 source rows and emits 2 output rows */
    for(j=3D4; j>0; --j) {
      LOAD(mx1); mx0 +=3D 3*mx1;
      LOAD(tmp); mx0 +=3D tmp; mx1 +=3D 3*tmp;
      STORE( (32+mx0)>>6 );

      LOAD(mx0); mx1 +=3D 3*mx0;
      LOAD(tmp); mx1 +=3D tmp; mx0 +=3D 3*tmp;
      STORE( (32+mx1)>>6 );
    }
  }
#undef LOAD
}


/* Gradx: horizontal gradient of an 18x18 window, downsampled to
 * a signed 8x8 block.  Horizontal tap is [-1 -3 3 1], vertical
 * smoothing is [1 3 3 1]; rounding is (64+x)>>7.
 */
void xvid_Gradx_18x18_To_8x8_C(int8_t *Dst, int32_t Dst_BpS,
                               const uint8_t *Src, int32_t Src_BpS)
{
  /* LOAD: horizontal [-1 -3 3 1] tap at *s, then step one row down */
#define LOAD(x) (x) =3D   3*(s[1]-s[0]) +s[2]-s[-1]; s +=3D Src_BpS

  int i;

  Src -=3D Src_BpS;
  for(i=3D0; i<8; ++i)
  {
    int32_t mx0, mx1, tmp;
    int j;
    const uint8_t *s =3D Src + 2*i;
    int8_t *d =3D Dst + i;

    LOAD(mx0);
    LOAD(tmp);
    mx0 +=3D 3*tmp;

    /* rolling vertical sums: 4 source rows in, 2 output rows out */
    for(j=3D4; j>0; --j)
    {
      LOAD(mx1); mx0 +=3D 3*mx1;
      LOAD(tmp); mx0 +=3D tmp;
      mx1 +=3D 3*tmp;
      STORE( (64+mx0)>>7 );

      LOAD(mx0); mx1 +=3D 3*mx0;
      LOAD(tmp); mx1 +=3D tmp;
      mx0 +=3D 3*tmp;
      STORE( (64+mx1)>>7 );
    }
  }
#undef LOAD
}


/* Grady: vertical gradient of an 18x18 window, downsampled to a
 * signed 8x8 block.  Rows are smoothed horizontally with
 * [1 3 3 1]; the vertical differencing tap [-1 -3 3 1] is applied
 * through the sign flips in the loop and the final (64-x)>>7.
 */
void xvid_Grady_18x18_To_8x8_C(int8_t *Dst, int32_t Dst_BpS,
                               const uint8_t *Src, int32_t Src_BpS)
{
  /* LOAD: horizontal [1 3 3 1] smoothing tap, then step one row down */
#define LOAD(x) (x) =3D 3*(s[1]+s[0]) +s[2]+s[-1]; s +=3D Src_BpS

  int i;

  Src -=3D Src_BpS;
  for(i=3D0; i<8; ++i)
  {
    int32_t mx0, mx1, tmp;
    int j;
    const uint8_t *s =3D Src + 2*i;
    int8_t *d =3D Dst + i;

    LOAD(mx0);
    LOAD(tmp);
    mx0 +=3D 3*tmp;

    /* rolling vertical sums with alternating signs */
    for(j=3D4; j>0; --j)
    {
      LOAD(mx1); mx0 -=3D 3*mx1;
      LOAD(tmp); mx0 -=3D tmp;
      mx1 +=3D 3*tmp;
      STORE( (64-mx0)>>7 );

      LOAD(mx0); mx1 -=3D 3*mx0;
      LOAD(tmp); mx1 -=3D tmp;
      mx0 +=3D 3*tmp;
      STORE( (64-mx1)>>7 );
    }
  }
#undef LOAD
}


/* Grad2: squared gradient norm of an 18x18 window, downsampled
 * to an unsigned 8x8 block clamped to 255.  LOAD splits each
 * horizontal tap into left/right half-sums (x,y); the loop
 * combines them so that Gx and Gy fall out as noted in the
 * comment inside.  The commented THRESH lines turn the output
 * into a binary edge mask instead (see 'Thresh' in the MMX file).
 */
void xvid_Grad2_18x18_To_8x8_C(uint8_t *Dst, int32_t Dst_BpS,
                               const uint8_t *Src, int32_t Src_BpS)
{
  /* LOAD: half-taps (x) and (y) at *s, then step one row down */
#define LOAD(x,y) (x) =3D s[-1] + 3*s[0]; (y) =3D 3*s[1] + s[2]; s +=3D Src=
_BpS
#define THRESH 24

  int i;

  Src -=3D Src_BpS;
  for(i=3D0; i<8; ++i)
  {
    int32_t mx0, mx1, my0, my1, tmpx, tmpy;
    int j;
    const uint8_t *s =3D Src + 2*i;
    uint8_t *d =3D Dst + i;

    LOAD(mx0,my0);=20
    LOAD(tmpx,tmpy);
    mx0 +=3D 3*tmpx; my0 +=3D 3*tmpy;
   =20
    for(j=3D4; j>0; --j)
    {
      LOAD(mx1,my1);   mx0 -=3D 3*my1; my0 -=3D 3*mx1;
      LOAD(tmpx,tmpy); mx0 -=3D tmpy;  my0 -=3D tmpx;
      mx1 +=3D 3*tmpx; my1 +=3D 3*tmpy;

        /*  at this point:=20
             Gx  =3D (64+mx0-my0)>>7
             Gy  =3D (64-mx0-my0)>>7
             =3D> Gx*Gx+Gy*Gy ~=3D 2*( mx0*mx0 + my0*my0 )
        */

      mx0 =3D (mx0+32)>>6; my0 =3D (my0+32)>>6;
      tmpx =3D mx0*mx0 + my0*my0;

      tmpx =3D (tmpx>255 ? 255 : tmpx);
      /* tmpx =3D (tmpx>THRESH ? 255 : 0);  */

      STORE( tmpx );

      LOAD(mx0,my0);   mx1 -=3D 3*my0; my1 -=3D 3*mx0;
      LOAD(tmpx,tmpy); mx1 -=3D tmpy;  my1 -=3D tmpx;
      mx0 +=3D 3*tmpx; my0 +=3D 3*tmpy;
      mx1 =3D (mx1+32)>>6; my1 =3D (my1+32)>>6;
      tmpx =3D mx1*mx1 + my1*my1;

      tmpx =3D (tmpx>255 ? 255 : tmpx);
      /* tmpx =3D (tmpx>THRESH ? 255 : 0);  */

      STORE( tmpx );
    }
  }
#undef THRESH
#undef LOAD
}

#undef STORE

/*//////////////////////////////////////////////////////////*/

--=-WCokwskIzD+8NsdgrdDX
Content-Disposition: attachment; filename=filter_mmx.asm
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; name=filter_mmx.asm; charset=ISO-8859-1

;/*************************************************************************=
****
; *
; *  XVID MPEG-4 VIDEO CODEC
; *   4x4 filtering utilities
; *
; *  Copyright(C) 2002 Pascal Massimino <skal@planet-d.net>
; *
; *  This file is part of XviD, a free MPEG-4 video encoder/decoder
; *
; *  XviD is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 U=
SA
; *
; *  Under section 8 of the GNU General Public License, the copyright
; *  holders of XVID explicitly forbid distribution in the following
; *  countries:
; *
; *    - Japan
; *    - United States of America
; *
; *  Linking XviD statically or dynamically with other modules is making a
; *  combined work based on XviD.  Thus, the terms and conditions of the
; *  GNU General Public License cover the whole combination.
; *
; *  As a special exception, the copyright holders of XviD give you
; *  permission to link XviD with independent modules that communicate with
; *  XviD solely through the VFW1.1 and DShow interfaces, regardless of the
; *  license terms of these independent modules, and to copy and distribute
; *  the resulting combined work under terms of your choice, provided that
; *  every copy of the combined work is accompanied by a complete copy of
; *  the source code of XviD (the version of XviD used to produce the
; *  combined work), being distributed under the terms of the GNU General
; *  Public License plus this exception.  An independent module is a module
; *  which is not derived from or based on XviD.
; *
; *  Note that people who make modified versions of XviD are not obligated
; *  to grant this special exception for their modified versions; it is
; *  their choice whether to do so.  The GNU General Public License gives
; *  permission to release a modified version without this exception; this
; *  exception also makes it possible to release a modified version which
; *  carries forward this exception.
; *
; * $Id: filter_mmx.asm,v 1.5 2002/11/17 00:32:06 edgomez Exp $
; *
; *************************************************************************=
/

bits 32

; cglobal: export a symbol, prepending an underscore when the
; platform (PREFIX defined) uses C name mangling.
%macro cglobal 1=20
	%ifdef PREFIX
		global _%1=20
		%define %1 _%1
	%else
		global %1
	%endif
%endmacro

cglobal xvid_Smooth_18x18_To_8x8_mmx
cglobal xvid_Gradx_18x18_To_8x8_mmx
cglobal xvid_Grady_18x18_To_8x8_mmx
cglobal xvid_Grad2_18x18_To_8x8_mmx

;=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D

section .data

align 16
Cst3 dw  3, 3, 3, 3             ; filter tap weight (4 word lanes)
Cst32 dw 32,32,32,32            ; rounding bias for the >>6 shifts
Cst64 dw 64,64,64,64            ; rounding bias for the >>7 shifts
Mask_ff dw 0xff,0xff,0xff,0xff  ; keep low byte of each word lane

Thresh dw 24,24,24,24           ; optional edge threshold (see MIX_G2)

;=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D

section .text

;//////////////////////////////////////////////////////////////////////
;// 18x18 -> 8x8 block filtering
;//////////////////////////////////////////////////////////////////////

; ADD_TIMES: word-lane multiply-accumulate.  %3 and %4 are
; destroyed (scaled in place by %5).
%macro ADD_TIMES 5   ; (%1) =3D (%1)+(%5)*(%3)   (%2) =3D (%2)+(%5)*(%4)
  pmullw %3, %5
  pmullw %4, %5
  paddsw %1, %3
  paddsw %2, %4
%endmacro

; SUB_TIMES: word-lane multiply-subtract.  %3 and %4 are
; destroyed (scaled in place by %5).
%macro SUB_TIMES 5   ; (%1) =3D (%1)-(%5)*(%3)   (%2) =3D (%2)-(%5)*(%4)
  pmullw %3, %5
  pmullw %4, %5
  psubsw %1, %3
  psubsw %2, %4
%endmacro


; LOAD_G2: Grad2 half-taps for 4 output columns (word lanes).
; On exit: %1 holds s[-1]+3*s[0], %2 holds 3*s[1]+s[2], matching
; the C LOAD(x,y) of xvid_Grad2_18x18_To_8x8_C.
%macro LOAD_G2 3  ;%1-%2: Dst, %3:src    mm6-mm7: trashed
  movq mm6, [Mask_ff]
  movq %1,  [%3]      ; 01234567
  movq %2,  [%3+1]    ; 12345678
  movq mm7, %2
  psrlq %2, 8
  pand  %1, mm6       ; 0|2|4|6
  pand mm7, mm6       ; 1|3|5|7
  pand  %2, mm6       ; 2|4|6|8
  pand mm6, [%3-1]    ;-1|1|3|5
  pmullw mm7,  [Cst3]
  pmullw %1,  [Cst3]
  paddsw %2, mm7
  paddsw %1, mm6
%endmacro

; MIX_G2: round both half-sums ((x+32)>>6), square them, and add
; with unsigned saturation; the result (approx. Gx^2+Gy^2, before
; the byte clamp done by packuswb in STORE_G2) lands in %1.
%macro MIX_G2 2    ; %1-2: regs.  output:%1
  paddsw %1, [Cst32]
  paddsw %2, [Cst32]
  psraw %1, 6
  psraw %2, 6
  pmullw %1, %1
  pmullw %2, %2
  paddusw %1, %2

; uncomment the following instr for thresholding:
;  pcmpgtw %1, [Thresh]
; and change 'packuswb' into 'packsswb' in STORE_G2 below

%endmacro

; STORE_G2: consume 4 source rows and emit 2 destination rows.
; With %1 set to 0 the 4 word results per row are parked in Dst
; as scratch; with %1 set to 1 they are packed to bytes together
; with the scratch words already at [ecx] / [ecx+ebx].
; Register use: ecx Dst, ebx Dst_BpS, edx Src, eax Src_BpS.
%macro STORE_G2 2  ; %1:op type (0:tmp store, 1:final pack), %2:src offset
  LOAD_G2 mm2,mm3, edx+%2

  movq mm4,mm2
  movq mm5,mm3
  SUB_TIMES mm0,mm1, mm5,mm4, [Cst3]

  LOAD_G2 mm4,mm5, edx+eax+%2
  lea edx, [edx+2*eax]

  psubsw mm0, mm5
  psubsw mm1, mm4
  ADD_TIMES mm2,mm3, mm4,mm5, [Cst3]

  MIX_G2 mm0,mm1
%if (%1=3D=3D1)
  packuswb mm0, [ecx]
  ; packsswb mm0, [ecx]     ; <=3D use instead, for threshold
%endif
  movq [ecx], mm0

  LOAD_G2 mm0, mm1, edx+%2

  movq mm4,mm0
  movq mm5,mm1
  SUB_TIMES mm2,mm3, mm5,mm4, [Cst3]

  LOAD_G2 mm4,mm5, edx+eax+%2
  lea edx, [edx+2*eax]

  psubsw mm2, mm5
  psubsw mm3, mm4
  ADD_TIMES mm0,mm1, mm4,mm5, [Cst3]

  MIX_G2 mm2,mm3
%if (%1=3D=3D1)
  packuswb mm2, [ecx+ebx]
  ; packsswb mm2, [ecx+ebx]     ; <=3D use instead, for threshold
%endif
  movq [ecx+ebx], mm2
  lea ecx, [ecx+2*ebx]
%endmacro


;-----------------------------------------------------------------
; void xvid_Grad2_18x18_To_8x8_mmx(uint8_t *Dst, int32_t Dst_BpS,
;                                  const uint8_t *Src, int32_t Src_BpS)
; Registers: ecx Dst, ebx Dst_BpS, edx Src, eax Src_BpS.
; First pass (src offset 8) computes output columns 4-7 and parks
; them as words in Dst; second pass (offset 0) computes columns
; 0-3 and packs both halves down to bytes.
;-----------------------------------------------------------------
align 16
xvid_Grad2_18x18_To_8x8_mmx:
  push ebx

  mov ecx, [esp+4  +4] ; Dst
  mov ebx, [esp+8  +4] ; Dst_BpS
  mov edx, [esp+12 +4] ; Src
  mov eax, [esp+16 +4] ; Src_BpS
  sub edx, eax

      ; first pass: output columns 4-7 (stored as word scratch)

  LOAD_G2 mm0,mm1, edx+8
  LOAD_G2 mm4,mm5, edx+eax+8
  lea edx, [edx+2*eax]
  ADD_TIMES mm0,mm1, mm4,mm5, [Cst3]

  STORE_G2 0, 8
  STORE_G2 0, 8
  STORE_G2 0, 8
  STORE_G2 0, 8

      ; second pass: output columns 0-3, pack both halves to bytes

  mov ecx, [esp+4  +4] ; Dst
  mov edx, [esp+12 +4] ; Src
  sub edx, eax

  LOAD_G2 mm0,mm1, edx
  LOAD_G2 mm4,mm5, edx+eax
  lea edx, [edx+2*eax]
  ADD_TIMES mm0,mm1, mm4,mm5, [Cst3]

  STORE_G2 1, 0
  STORE_G2 1, 0
  STORE_G2 1, 0
  STORE_G2 1, 0

  pop ebx
  ret

;//////////////////////////////////////////////////////////////////////
;// for Gradx,Grady,Smooth, the scheme is different than=20
;// the C-version (=3D>lower op count).

; LOAD_S: full horizontal smoothing tap for 4 output columns.
; Word lanes on exit: %1 holds 3*(s[0]+s[1]) + s[-1] + s[2]
; (the C LOAD of the Smooth/Grady versions).  Requires mm7
; preloaded with Cst3; trashes mm4-mm6.
%macro LOAD_S 2  ;%1: Dst, %2:src
  movq mm6, [Mask_ff]
  movq %1,  [%2+1]    ; 12345678 =20
  movq mm4, [%2]      ; 01234567
  movq mm5, %1
  pand mm4, mm6       ; 0|2|4|6 =20
  pand mm5, mm6       ; 1|3|5|7
  psrlq %1, 8
  pand  %1, mm6       ; 2|4|6|8
  pand mm6, [%2-1]    ;-1|1|3|5
  paddusw mm5, mm4
  paddusw %1, mm6
  pmullw mm5,  mm7  ; x[Cst3]
  paddusw %1, mm5
%endmacro

; MIX_S: vertical [1 3 3 1] combine with rounding.
; %1 becomes (%1 + 3*(%2+%3) + %4 + 32) >> 6.  %2 is trashed;
; mm7 must hold Cst3.
%macro MIX_S 4    ; %1-%4: regs.  output:%1
  paddusw %1, [Cst32]
  paddusw %2, %3
  paddusw %1, %4
  pmullw %2, mm7  ; x[Cst3]
  paddusw %1, %2
  psraw %1, 6
%endmacro

; STORE_S: consume 4 source rows, emit 2 destination rows (word
; scratch when %1 is 0, packed bytes when %1 is 1).
; Register use: ecx Dst, ebx Dst_BpS, edx Src, eax Src_BpS.
%macro STORE_S 2   ; %1:op type (0:tmp store, 1:final pack), %2:src offset
  LOAD_S mm2, edx+%2
  LOAD_S mm3, edx+eax+%2
  lea edx, [edx+2*eax]

  MIX_S mm0,mm1,mm2,mm3
%if (%1=3D=3D1)
  packuswb mm0, [ecx]
%endif
  movq [ecx], mm0

  LOAD_S mm0, edx+%2
  LOAD_S mm1, edx+eax+%2
  lea edx, [edx+2*eax]
  MIX_S mm2,mm3, mm0,mm1
%if (%1=3D=3D1)
  packuswb mm2, [ecx+ebx]
%endif
  movq [ecx+ebx], mm2
  lea ecx, [ecx+2*ebx]
%endmacro


;-----------------------------------------------------------------
; void xvid_Smooth_18x18_To_8x8_mmx(uint8_t *Dst, int32_t Dst_BpS,
;                                   const uint8_t *Src, int32_t Src_BpS)
; Registers: ecx Dst, ebx Dst_BpS, edx Src, eax Src_BpS, mm7 Cst3.
; First pass (src offset 8) yields output columns 4-7 parked as
; words; second pass yields columns 0-3 and packs both halves.
;-----------------------------------------------------------------
align 16
xvid_Smooth_18x18_To_8x8_mmx:
  push ebx

  mov ecx, [esp+4  +4] ; Dst
  mov ebx, [esp+8  +4] ; Dst_BpS
  mov edx, [esp+12 +4] ; Src
  mov eax, [esp+16 +4] ; Src_BpS

  movq mm7, [Cst3]
  sub edx, eax

      ; first pass: output columns 4-7 (stored as word scratch)

  LOAD_S mm0, edx+8
  LOAD_S mm1, edx+eax+8
  lea edx, [edx+2*eax]

  STORE_S 0, 8
  STORE_S 0, 8
  STORE_S 0, 8
  STORE_S 0, 8

      ; second pass: output columns 0-3, pack both halves to bytes

  mov ecx, [esp+4  +4] ; Dst
  mov edx, [esp+12 +4] ; Src
  sub edx, eax

  LOAD_S mm0, edx
  LOAD_S mm1, edx+eax
  lea edx, [edx+2*eax]

  STORE_S 1, 0
  STORE_S 1, 0
  STORE_S 1, 0
  STORE_S 1, 0

  pop ebx
  ret

;//////////////////////////////////////////////////////////////////////

; LOAD_GX: horizontal gradient tap for 4 output columns.
; Word lanes on exit: %1 holds 3*(s[1]-s[0]) + s[2]-s[-1]
; (the C LOAD of the Gradx version).  Requires mm7 preloaded
; with Cst3; trashes mm4-mm6.
%macro LOAD_GX 2  ;%1: Dst, %2:src
  movq mm6, [Mask_ff]
  movq mm4,  [%2]      ; 01234567
  movq %1,   [%2+1]    ; 12345678
  movq mm5, %1
  psrlq %1, 8 =20
  pand mm4, mm6       ; 0|2|4|6 =20
  pand mm5, mm6       ; 1|3|5|7
  pand  %1, mm6       ; 2|4|6|8 =20
  pand mm6, [%2-1]    ;-1|1|3|5
  psubsw mm5, mm4 =20
  psubsw %1, mm6
  pmullw mm5,  mm7  ; x[Cst3]
  paddsw %1, mm5 =20
%endmacro

; MIX_GX: vertical [1 3 3 1] combine with rounding.
; %1 becomes (%1 + 3*(%2+%3) + %4 + 64) >> 7.  %2 is trashed;
; mm7 must hold Cst3.
%macro MIX_GX 4    ; %1-%4: regs.  output:%1
  paddsw %1, [Cst64]
  paddsw %2, %3
  paddsw %1, %4
  pmullw %2, mm7  ; x[Cst3]
  paddsw %1, %2
  psraw %1, 7
%endmacro

; STORE_GX: consume 4 source rows, emit 2 rows of signed Gx
; (word scratch when %1 is 0, signed-packed bytes when %1 is 1).
%macro STORE_GX 2  ; %1:op type (0:tmp store, 1:final pack), %2:src offset
  LOAD_GX mm2, edx+%2
  LOAD_GX mm3, edx+eax+%2
  lea edx, [edx+2*eax]

  MIX_GX mm0,mm1, mm2,mm3
%if (%1=3D=3D1)
  packsswb mm0, [ecx]
%endif
  movq [ecx], mm0

  LOAD_GX mm0, edx+%2
  LOAD_GX mm1, edx+eax+%2
  lea edx, [edx+2*eax]

  MIX_GX mm2,mm3, mm0,mm1
%if (%1=3D=3D1)
  packsswb mm2, [ecx+ebx]
%endif
  movq [ecx+ebx], mm2
  lea ecx, [ecx+2*ebx]
%endmacro


;-----------------------------------------------------------------
; void xvid_Gradx_18x18_To_8x8_mmx(int8_t *Dst, int32_t Dst_BpS,
;                                  const uint8_t *Src, int32_t Src_BpS)
; Registers: ecx Dst, ebx Dst_BpS, edx Src, eax Src_BpS, mm7 Cst3.
; First pass (src offset 8) yields output columns 4-7 parked as
; words; second pass yields columns 0-3 and signed-packs both.
;-----------------------------------------------------------------
align 16
xvid_Gradx_18x18_To_8x8_mmx:
  push ebx

  mov ecx, [esp+4  +4] ; Dst
  mov ebx, [esp+8  +4] ; Dst_BpS
  mov edx, [esp+12 +4] ; Src
  mov eax, [esp+16 +4] ; Src_BpS

  movq mm7, [Cst3]
  sub edx, eax

      ; first pass: output columns 4-7 (stored as word scratch)
  LOAD_GX mm0, edx+8
  LOAD_GX mm1, edx+eax+8
  lea edx, [edx+2*eax]

  STORE_GX 0, 8
  STORE_GX 0, 8
  STORE_GX 0, 8
  STORE_GX 0, 8

      ; second pass: output columns 0-3, pack both halves to bytes

  mov ecx, [esp+4  +4] ; Dst
  mov edx, [esp+12 +4] ; Src
  sub edx, eax

  LOAD_GX mm0, edx
  LOAD_GX mm1, edx+eax
  lea edx, [edx+2*eax]

  STORE_GX 1, 0
  STORE_GX 1, 0
  STORE_GX 1, 0
  STORE_GX 1, 0

  pop ebx
  ret

;//////////////////////////////////////////////////////////////////////

; MIX_GY: vertical differencing combine with rounding.
; mm5 becomes (64 - (%1-%4) - 3*(%2-%3)) >> 7; %1 and %2 are
; trashed; mm7 must hold Cst3.
%macro MIX_GY 4    ; %1-%4: regs.  output:mm5
  movq mm5, [Cst64] =20
  psubsw %2, %3
  psubsw %1, %4
  pmullw %2, mm7  ; x[Cst3]
  psubsw mm5, %1  =20
  psubsw mm5, %2
  psraw mm5, 7
%endmacro

; STORE_GY: consume 4 source rows, emit 2 rows of signed Gy
; (word scratch when %1 is 0, signed-packed bytes when %1 is 1).
; Row sums are built with LOAD_S; MIX_GY applies the signs.
%macro STORE_GY 2  ; %1:op type (0:tmp store, 1:final pack), %2:src offset
  LOAD_S mm2, edx+%2
  LOAD_S mm3, edx+eax+%2
  lea edx, [edx+2*eax]

  MIX_GY mm0,mm1, mm2,mm3
%if (%1=3D=3D1)
  packsswb mm5, [ecx]
%endif
  movq [ecx], mm5

  LOAD_S mm0, edx+%2
  LOAD_S mm1, edx+eax+%2
  lea edx, [edx+2*eax]

  MIX_GY mm2,mm3, mm0,mm1
%if (%1=3D=3D1)
  packsswb mm5, [ecx+ebx]
%endif
  movq [ecx+ebx], mm5
  lea ecx, [ecx+2*ebx]
%endmacro

;-----------------------------------------------------------------
; void xvid_Grady_18x18_To_8x8_mmx(int8_t *Dst, int32_t Dst_BpS,
;                                  const uint8_t *Src, int32_t Src_BpS)
; Registers: ecx Dst, ebx Dst_BpS, edx Src, eax Src_BpS, mm7 Cst3.
; First pass (src offset 8) yields output columns 4-7 parked as
; words; second pass yields columns 0-3 and signed-packs both.
;-----------------------------------------------------------------
align 16
xvid_Grady_18x18_To_8x8_mmx:
  push ebx

  mov ecx, [esp+4  +4] ; Dst
  mov ebx, [esp+8  +4] ; Dst_BpS
  mov edx, [esp+12 +4] ; Src
  mov eax, [esp+16 +4] ; Src_BpS

  movq mm7, [Cst3]
  sub edx, eax

      ; first pass: output columns 4-7 (stored as word scratch)

  LOAD_S mm0, edx+8
  LOAD_S mm1, edx+eax+8
  lea edx, [edx+2*eax]

  STORE_GY 0, 8
  STORE_GY 0, 8
  STORE_GY 0, 8
  STORE_GY 0, 8

      ; second pass: output columns 0-3, pack both halves to bytes

  mov ecx, [esp+4  +4] ; Dst
  mov edx, [esp+12 +4] ; Src
  sub edx, eax

  LOAD_S mm0, edx
  LOAD_S mm1, edx+eax
  lea edx, [edx+2*eax]

  STORE_GY 1, 0
  STORE_GY 1, 0
  STORE_GY 1, 0
  STORE_GY 1, 0

  pop ebx
  ret

;//////////////////////////////////////////////////////////////////////

--=-WCokwskIzD+8NsdgrdDX--