[XviD-devel] new .asm file (dev-api-3 only)

skal xvid-devel@xvid.org
26 Sep 2002 11:22:08 +0200


--=-T3ZzSrv5ga3wxewjct2o
Content-Type: text/plain
Content-Transfer-Encoding: 7bit


	Hi Michael and all

On Wed, 2002-09-25 at 19:56, Michael Militzer wrote:
> Hi,
> 
> > oh, talking about ASM, I forgot to mention I'm currently
> > coding an ASM version of image_setedges(), like Gruel
> > suggested... is someone also working on it?
> 
> hm, maybe better don't do it (yet). I wrote a faster setedges() function
> months ago by simply replacing the memcpys and memsets with faster mmx
> equivalents. However I never committed this modification a) out of laziness
> and b) because pete planned to do the edge mirroring block based (as part of
> interpolate8x8_switch() before the interpolation/copy step...).
> 
	hmm, if I guess right, these fast memcpy funcs are likely to:
	a) use some non-temporal prefetch
	b) have a lower limit on the size of the block they handle
	with MMX/SSE (64 bytes or something)

	My feeling is that: point b) will prevent speed-up on
	the left/right edges replication. And point a) will have
	erratic result on top edge, since this part of the picture
	is re-used almost at once when jumping on the ME (not quite
	sure 'bout it, well...). I've played around with the
	prefetch/movntq stuff, and am not really convinced.

	Anyway, here's the function, just test it. It's just a
	straightforward impl. of the loops required, no nifty
	killah optims...
	It gives the same encoded file size on my bitstream test 
	set, but:
	- I don't have weird video sources with dimensions not
	multiple of 8 (/16).
	- I just happened to notice that EDGE_SIZE equals 32 (in image.h),
	instead of the 16 I would have expected. Is there any reason
	for this? Large search window in motion estimation?
	Anyhow, my func only replicates an EDGE_SIZE of 16, not 32,
	and hence gives different results than the C-one... I can
	fix it, though, if EDGE_SIZE really should be 32. It'll be
	slower, of course...
	I put the corresponding bench entry for 'xvid_bench.c', too.


> So an image_setedges() replacement might not be a good idea because it might
> not be needed anymore sooner or later. But if you like, you could do the
> block based implementation instead ;-)

	I've read the doc about it, and jumped on my Prozac pills
	shortly after :))

	bye,
		Skal



Entry for xvid_bench.c:

/*********************************************************************
 * test image_setedges_XXX()
 *********************************************************************/

  // should be moved to image/image.h...
/* Prototype of the MMX edge-replication routine exported by edge_mmx.asm.
 * The benchmark below calls it with the same argument list it passes to
 * image_setedges(). */
extern void image_setedges_mmx(IMAGE *image,
                               uint32_t edged_width, uint32_t edged_height,
                               uint32_t width, uint32_t height,
                               uint32_t interlacing);

/* TEST_EDGE(FUNC): time FUNC and checksum the buffer it worked on.
 *
 * The macro captures names from the expansion site instead of taking them
 * as arguments: i, tst, crc (int), t (double), nb_tests, the byte array YU,
 * the IMAGE YUV, and the sizes EW, EH, W, H.
 *
 * Steps:
 *   1. fill YU with the repeatable pattern (i*54+23)&0xff;
 *   2. call FUNC(&YUV, EW, EH, W, H, 0) nb_tests times between two
 *      gettime_usec() readings, with emms() after the loop;
 *   3. leave the time per call in t and a 16-bit checksum of all of YU
 *      in crc.
 *
 * The body is wrapped in do { } while (0) so that the expansion is a single
 * statement and stays correct under an unbraced if/else.
 */
#define TEST_EDGE(FUNC)                                   \
    do {                                                  \
        for (i = 0; i < (int)sizeof(YU); ++i)             \
            YU[i] = (i*54+23)&0xff;                       \
        t = gettime_usec();                               \
        for (tst = 0; tst < nb_tests; ++tst) {            \
            (FUNC)(&YUV, EW, EH, W, H, 0);                \
        }                                                 \
        emms();                                           \
        t = (gettime_usec()-t ) / nb_tests;               \
        for (crc = 0, i = 0; i < (int)sizeof(YU); ++i)    \
            crc = ((crc+(YU[i]^i))^(crc>>8))&0xffff;      \
    } while (0)

/* Benchmark image_setedges() and, when ARCH_X86 is defined, also
 * image_setedges_mmx() on a 320x240 picture stored inside a buffer that
 * has an EDGE_SIZE border on every side.  For each routine the time per
 * call and a checksum of the whole buffer are printed.
 *
 * The names i, tst, crc, t, nb_tests, YU, YUV, EW, EH, W and H are used by
 * the TEST_EDGE macro and must keep these exact spellings. */
void test_edges()
{
  const int W = 320, H = 240;
  const int nb_tests = 10*speed_ref;

  /* picture size rounded up to a multiple of 16, plus both borders */
  const int EW = 2*EDGE_SIZE + ((W+15)&~15);
  const int EH = 2*EDGE_SIZE + ((H+15)&~15);

  /* plane sizes in bytes; the chroma planes are half size in each axis */
  const int luma_bytes   = EW*EH;
  const int chroma_bytes = (EW/2)*(EH/2);

  /* offset of the first picture sample inside a bordered plane */
  const int luma_origin   = EDGE_SIZE*EW + EDGE_SIZE;
  const int chroma_origin = (EDGE_SIZE/2)*(EW/2) + EDGE_SIZE/2;

  unsigned char YU[luma_bytes + 2*chroma_bytes];  /* Y, then U, then V */
  IMAGE YUV;

  double t;
  int i, tst, crc;

  YUV.y = YU + luma_origin;
  YUV.u = YU + luma_bytes + chroma_origin;
  YUV.v = YUV.u + chroma_bytes;

  printf( "\n =====  test edges =====\n" );

  /* reference run: plain C implementation */
  init_cpu(&cpu_list[0]);
  TEST_EDGE(image_setedges);
  printf( "image_setedges	%.3f usec       crc=0x%x\n", t, crc );
  /* Disabled check:
   *   if (crc!=0x52a7) printf( "*** CRC ERROR! ***\n" );
   * The crc is meaningless for now, since EDGE_SIZE=32 in the C version
   * and 16 in the MMX one.  TODO: sort out this issue... */

#ifdef ARCH_X86
  /* same measurement for the MMX implementation */
  init_cpu(&cpu_list[1]);
  TEST_EDGE(image_setedges_mmx);
  printf( "image_setedges_mmx %.3f usec       crc=0x%x\n", t, crc );
  /* Disabled check: if (crc!=0x52a7) printf( "*** CRC ERROR! ***\n" ); */
#endif
}




--=-T3ZzSrv5ga3wxewjct2o
Content-Disposition: attachment; filename=edge_mmx.asm
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; name=edge_mmx.asm; charset=ISO-8859-1

;/*****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *	  edges replication
; *
; *  Copyright(C) 2002 Pascal Massimino (skal@planet-d.net)
; *
; *  This program is an implementation of a part of one or more MPEG-4
; *  Video tools as specified in ISO/IEC 14496-2 standard.  Those intending
; *  to use this software module in hardware or software products are
; *  advised that its use may infringe existing patents or copyrights, and
; *  any such use would be at such party's own risk.  The original
; *  developer of this software module and his/her company, and subsequent
; *  editors and their companies, will have no liability for use of this
; *  software or modifications or derivatives thereof.
; *
; *  This program is free software; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; ****************************************************************************/

bits 32

; cglobal <symbol>
;   Exports <symbol> from this object file.
;   When PREFIX is defined the exported name carries a leading underscore,
;   and <symbol> is redefined so that every later use of the plain name in
;   this file resolves to the underscored one.
%macro cglobal 1
	%ifdef PREFIX
		global _%1
		%define %1 _%1
	%else
		global %1
	%endif
%endmacro

section .text

cglobal image_setedges_mmx

;//////////////////////////////////////////////////////////////////////
;// void
;// image_setedges_mmx(IMAGE * image,
;//			   uint32_t edged_width,
;//			   uint32_t edged_height,
;//			   uint32_t width,
;//			   uint32_t height,
;//			   uint32_t interlacing)
;//////////////////////////////////////////////////////////////////////

; B_TO_Q SrcLeft, Width, mmL, mmR
;   %1: register holding the address of the first byte of a row
;   %2: register holding the row width in bytes
;   %3: MMX register, receives the first byte of the row in all 8 lanes
;   %4: MMX register, receives the last byte of the row in all 8 lanes
;
;   Reads 8 bytes at [%1] and 8 bytes at [%1+%2-8]; writes only %3 and %4.
;   EFLAGS is left untouched (MMX instructions only), which the callers
;   rely on when they branch on a dec placed before the macro.
;
;   Only base MMX instructions are used: the byte is widened with three
;   unpack steps (byte, word, dword) instead of pshufw, which needs SSE or
;   the extended-MMX instruction set and would fault on a plain MMX CPU.
%macro B_TO_Q 4   ; %1:SrcLeft, %2:Width, %3:mm#0, %4:mm#1
    ; replicate left and right bytes into quadwords
  movq %4, [%1+%2-8]    ; note the -8 offset: the wanted right-hand byte
                        ; ends up in the top lane of this 8-byte read
  movq %3, [%1]
  punpckhbw %4,%4       ; b4 b4 b5 b5 b6 b6 b7 b7
  punpcklbw %3,%3       ; b0 b0 b1 b1 b2 b2 b3 b3
  punpckhwd %4,%4       ; b6 b6 b6 b6 b7 b7 b7 b7
  punpcklwd %3,%3       ; b0 b0 b0 b0 b1 b1 b1 b1
  punpckhdq %4,%4       ; b7 in all eight lanes
  punpckldq %3,%3       ; b0 in all eight lanes
%endmacro

; Stack positions of the arguments as seen on entry, before the function
; pushes anything; users add the size of whatever has been pushed since.
; The third and sixth arguments (edged_height, interlacing) get no name.
%define IMAGE   esp+4       ; IMAGE *image
%define EWIDTH  esp+8       ; edged_width
%define WIDTH   esp+16      ; width
%define HEIGHT  esp+20      ; height

; Byte offsets of the plane pointers inside the IMAGE struct.
; Warning! These offsets must reflect the IMAGE struct fields.
; NOTE(review): assumes y, u, v are the first three 4-byte members;
; confirm against the struct declaration.
%define IMAGE_Y 0
%define IMAGE_U 4
%define IMAGE_V 8

align 16
; ----------------------------------------------------------------------
; void image_setedges_mmx(IMAGE *image, uint32_t edged_width,
;                         uint32_t edged_height, uint32_t width,
;                         uint32_t height, uint32_t interlacing)
;
; Replicates the border samples of each plane into the surrounding edge:
;   Y    : 16 bytes left and right of every row, then 16 rows below the
;          last row and 16 rows above the first row;
;   U, V : 8 bytes left and right of every row, then 8 rows below and
;          8 rows above (width, height and stride halved).
; The bottom and top copies start 16 (Y) or 8 (UV) bytes left of the row
; and run past its right end, so the corners receive the values written
; by the left/right pass.
;
; In    : 32-bit stack arguments, see IMAGE / EWIDTH / WIDTH / HEIGHT.
;         edged_height and interlacing are never read.
; Out   : nothing.
; Saved : ebx, esi, edi, ebp (pushed on entry, popped on exit).
; Clobb : eax, ecx, edx, mm0-mm3, flags.  emms is executed before ret.
; Stack : 4 saved registers, plus one temporary dword during the UV
;         bottom/top passes.
;
; NOTE(review): the row copies advance in fixed 16-byte (Y) and 8-byte
; (UV) steps; this looks like it expects width to be a multiple of 16 for
; the right edge to be covered exactly - confirm with the callers.
;
; Author's timings, approx.:
;   ~20200c@512x384, ~21500c@640x352, ~13460c@352x240
; ----------------------------------------------------------------------
image_setedges_mmx:

  push ebx
  push esi
  push edi
  push ebp
  mov esi, [IMAGE +16]      ; +16 skips the four registers pushed above
  mov ebp, [EWIDTH +16]     ; BpS

    ; //// Y border loops ////

      ; 1) Left/Right

  mov edx, [WIDTH +16]      ; Width
  mov edi, [HEIGHT +16]     ; Height, used as the row counter
  mov ebx, [esi + IMAGE_Y]  ; Image->y, start of the first row
  jmp .Loop_LR_Y            ; hop over the alignment padding

align 16
.Loop_LR_Y:
  dec edi                   ; sets the flags tested by the jle below; the
                            ; MMX code in between leaves EFLAGS untouched
  B_TO_Q ebx, edx, mm0, mm1 ; mm0: leftmost byte x8, mm1: rightmost x8
  movq [ebx-16], mm0
  movq [ebx+edx], mm1
  movq [ebx- 8], mm0
  movq [ebx+edx+8], mm1
  jle .Out_LR_Y             ; that was the last row
  lea  ebx, [ebx+ebp]       ; next row (lea keeps the flags intact)
  jmp .Loop_LR_Y

.Out_LR_Y:

      ; 2) Bottom Y

  ; ebx is correctly positioned: it still points at the last row

  lea ebx, [ebx+edx]        ; end of the last row (row start + Width)
  mov eax, 16               ; number of rows to write
  mov ecx, ebx              ; ecx: source, stays on the last row
  mov edi, -16
  sub edi, edx              ; edi: -16-Width, initial column offset

  mov edx, edi
  lea ebx, [ebx+ebp]        ; ebx: destination, first row below
  jmp .Loop_B_Y

align 16
.Loop_B_Y:
  add edx, 16               ; flags are tested after the four movq
  movq mm0, [ecx+edx-16  ]
  movq mm1, [ecx+edx-16+8]
  movq [ebx+edx-16  ], mm0
  movq [ebx+edx-16+8], mm1
  jle .Loop_B_Y             ; repeat until the offset becomes positive
  dec eax
  mov edx, edi              ; rewind the column offset; mov keeps flags
  jle .Out_B_Y
  lea ebx, [ebx+ebp]        ; next destination row
  jmp .Loop_B_Y

.Out_B_Y:

    ; 3) Top Y

  mov eax, [WIDTH +16]
  xor ebp, -1
  mov ebx, [esi+ IMAGE_Y]
  inc ebp                   ; ebp: -BpS (not + inc negates the stride)

  add ebx, eax              ; Y+Width: source, end of the first row
  mov eax, 16               ; number of rows to write
  lea ecx, [ebx+ebp]        ; destination, first row above
  jmp .Loop_T_Y             ; edx still holds -16-Width from step 2

align 16
.Loop_T_Y:
  add edx, 16
  movq mm0, [ebx+edx-16  ]
  movq mm1, [ebx+edx-16+8]
  movq [ecx+edx-16  ], mm0
  movq [ecx+edx-16+8], mm1
  jle .Loop_T_Y
  dec eax
  jle .Out_T_Y
  mov edx, edi              ; rewind the column offset
  lea ecx, [ecx+ebp]        ; one row further up
  jmp .Loop_T_Y

.Out_T_Y:


    ; //// UV border loops ////

      ; 1) left/right

  mov esi, [IMAGE +16]   ; image
  mov edx, [WIDTH +16]   ; Width
  mov edi, [HEIGHT +16]  ; Height
  mov ebp, [EWIDTH +16]  ; BpS
  shr edx, 1             ; Width/2
  shr edi, 1             ; Height/2, used as the row counter
  shr ebp, 1             ; BpS/2
  mov ebx, [esi +IMAGE_U]; U
  mov ecx, [esi +IMAGE_V]; V
  jmp .Skip_LR_UV        ; first row: do not advance the pointers

align 16
.Loop_LR_UV:
  lea  ebx, [ebx+ebp]
  lea  ecx, [ecx+ebp]
.Skip_LR_UV:
  B_TO_Q ebx, edx, mm0, mm1
  B_TO_Q ecx, edx, mm2, mm3
  movq [ebx-8], mm0
  movq [ebx+edx+0], mm1
  movq [ecx-8], mm2
  movq [ecx+edx+0], mm3
  dec edi
  jg .Loop_LR_UV


    ; 2) Bottom UV

  ; ebx/ecx are correctly positioned: both point at their last row

  lea ebx, [ebx+edx]     ; U: end of the last row
  lea ecx, [ecx+edx]     ; V: end of the last row
  mov edi, ecx           ; edi: V source
  mov esi, ebx           ; esi: U source
  mov eax,-16
  sub eax, edx           ; eax: -16-Width/2, initial column offset
  mov edx, 8             ; number of rows to write
  push eax               ; kept at [esp] until the end of the top pass
.Loop_B_UV_y:
  mov eax, [esp]         ; reload the initial column offset
  lea ebx, [ebx+ebp]     ; next destination row, U
  lea ecx, [ecx+ebp]     ; next destination row, V
  jmp .Loop_B_UV

align 16
.Loop_B_UV:
  add eax, 8             ; flags are tested after the four movq
  movq mm0, [esi+eax]
  movq mm1, [edi+eax]
  movq [ebx+eax], mm0
  movq [ecx+eax], mm1
  jl .Loop_B_UV          ; repeat while the offset is still negative
  dec edx
  jle .Out_B_UV
  jmp .Loop_B_UV_y
.Out_B_UV:

    ; 3) Top UV

  mov esi, [IMAGE +16+4]  ; image (+4: the offset pushed in step 2)
  mov edx, [WIDTH +16+4]  ; Width
  mov edi, [esi +IMAGE_U] ; U
  mov esi, [esi +IMAGE_V] ; V
  shr edx, 1              ; Width/2
  lea edi, [edi+edx]      ; edi: U source, end of the first row
  lea esi, [esi+edx]      ; esi: V source, end of the first row
  xor ebp, -1
  mov ebx, esi            ; ebx: V destination
  inc ebp                 ; ebp: -BpS/2
  mov ecx, edi            ; ecx: U destination
  mov edx, 8              ; number of rows to write

.Loop_T_UV_y:
  mov eax, [esp]          ; reload the initial column offset
  lea ebx, [ebx+ebp]      ; one row further up, V
  lea ecx, [ecx+ebp]      ; one row further up, U
  jmp .Loop_T_UV

align 16
.Loop_T_UV:
  add eax, 8
  movq mm0, [esi+eax]
  movq mm1, [edi+eax]
  movq [ebx+eax], mm0
  movq [ecx+eax], mm1
  jl .Loop_T_UV
  dec edx
  jg .Loop_T_UV_y

  pop eax                 ; drop the saved column offset


  pop ebp
  pop edi
  pop esi
  pop ebx

  emms      ; TODO: really needed?
  ret

--=-T3ZzSrv5ga3wxewjct2o--