[XviD-devel] new .asm file (dev-api-3 only)
skal
xvid-devel@xvid.org
26 Sep 2002 11:22:08 +0200
--=-T3ZzSrv5ga3wxewjct2o
Content-Type: text/plain
Content-Transfer-Encoding: 7bit
Hi Michael and all
On Wed, 2002-09-25 at 19:56, Michael Militzer wrote:
> Hi,
>
> > oh, talking about ASM, I forgot to mention I'm currently
> > coding an ASM version of image_setedges(), like Gruel
> > suggested... is someone also working on it?
>
> hm, maybe better don't do it (yet). I wrote a faster setedges() function
> months ago by simply replacing the memcpys and memsets with faster mmx
> equivalents. However I never committed this modification a) out of laziness
> and b) because pete planned to do the edge mirroring block based (as part of
> interpolate8x8_switch() before the interpolation/copy step...).
>
hmm, if I guess right, these fast memcpy funcs are likely to:
a) use some non-temporal prefetch
b) have a lower limit on the size of the block they handle
with MMX/SSE (64bytes or something)
My feeling is that: point b) will prevent speed-up on
the left/right edges replication. And point a) will have
erratic result on top edge, since this part of the picture
is re-used almost at once when jumping on the ME (not quite
sure 'bout it, well...). I've played around with the
prefetch/movntq stuff, and am not really convinced.
Anyway, here's the function, just test it. It's just a
straightforward impl. of the loops required, no nifty
killah optims...
It gives the same encoded file size on my bitstream test
set, but:
- I don't have weird video sources with dimensions not
multiple of 8 (/16).
- I just happen to notice the EDGE_SIZE equals 32 (in image.h),
instead of the 16 I would have expected. Is there any reason
for this? Large search window in motion estimation?
Anyhow, my func only replicates an EDGE_SIZE of 16, not 32,
and hence gives different results than the C-one... I can
fix it, though, if EDGE_SIZE really should be 32. It'll be
slower, of course...
I put the corresponding bench entry for 'xvid_bench.c', too.
> So an image_setedges() replacement might not be a good idea because it might
> not be needed anymore sooner or later. But if you like, you could do the
> block based implementation instead ;-)
I've read the doc about it, and jumped on my Prozac pills
shortly after :))
bye,
Skal
Entry for xvid_bench.c:
/*********************************************************************
* test image_setedges_XXX()
*********************************************************************/
// Prototype of the edge-replication routine exported by edge_mmx.asm.
// It is declared here only so the benchmark can call it;
// should be moved to image/image.h...
extern void image_setedges_mmx(IMAGE * image,
uint32_t edged_width,
uint32_t edged_height,
uint32_t width,
uint32_t height,
uint32_t interlacing);
/*
 * TEST_EDGE(FUNC): time one edge-replication routine and checksum its output.
 *
 * Expects these names in the caller's scope:
 *   YU (byte buffer), YUV (IMAGE pointing into YU), EW, EH, W, H,
 *   nb_tests, and the scratch variables i, tst, crc (int) and t (double).
 *
 * Steps:
 *   1. fill YU with a fixed pseudo-pattern, so every run starts identically;
 *   2. call FUNC nb_tests times (interlacing argument = 0) between two
 *      gettime_usec() samples;
 *   3. leave the average time per call in t and a 16-bit checksum of the
 *      whole buffer in crc.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement
 * (safe after an unbraced if/else); invoke it with a trailing semicolon.
 */
#define TEST_EDGE(FUNC) \
do { \
    for(i=0; i<(int)sizeof(YU); ++i) \
        YU[i] = (i*54+23)&0xff; \
    t = gettime_usec(); \
    for(tst=0; tst<nb_tests; ++tst) { \
        (FUNC)(&YUV, EW, EH, W, H, 0); \
    } \
    emms(); \
    t = (gettime_usec()-t ) / nb_tests; \
    for(crc=0,i=0; i<(int)sizeof(YU); ++i) \
        crc = ((crc+(YU[i]^i))^(crc>>8))&0xffff; \
} while (0)
/*
 * Benchmark the edge-replication routines on a 320x240 frame.
 *
 * A single buffer holds the edged Y plane followed by the edged U and V
 * planes; the IMAGE pointers are set to the first visible pixel of each
 * plane (i.e. EDGE_SIZE rows/columns into Y, EDGE_SIZE/2 into U and V).
 * For each implementation the average time per call and a checksum of the
 * whole buffer are printed.
 */
void test_edges(void)
{
    /* Frame geometry as integer constant expressions, so that YU below is
     * an ordinary fixed-size array: with 'const int' bounds it would be a
     * variable-length array, which C89 compilers reject. */
    enum {
        W       = 320,
        H       = 240,
        EW      = ((W+15)&~15) + 2*EDGE_SIZE,   /* edged width  (Y stride) */
        EH      = ((H+15)&~15) + 2*EDGE_SIZE,   /* edged height            */
        YSize   = EW*EH,
        UVSize  = (EW/2)*(EH/2),
        offset  = EDGE_SIZE + EDGE_SIZE*EW,             /* Y: skip top/left edge   */
        offset2 = EDGE_SIZE/2+(EDGE_SIZE/2)*(EW/2)      /* U/V: skip top/left edge */
    };
    const int nb_tests = 10*speed_ref;
    unsigned char YU[YSize+2*UVSize];
    IMAGE YUV;
    int i, crc, tst;
    double t;

    YUV.y = YU +offset;
    YUV.u = YU+YSize +offset2;
    YUV.v = YUV.u + UVSize;

    printf( "\n ===== test edges =====\n" );

    init_cpu(&cpu_list[0]); // PLAINC
    TEST_EDGE(image_setedges);
    printf( "image_setedges %.3f usec crc=0x%x\n", t, crc );
    // if (crc!=0x52a7) printf( "*** CRC ERROR! ***\n" );
    // the crc is meaningless for now, since EDGE_SIZE=32 in the C-version,
    // and 16 in the MMX one. TODO: sort out this issue...

#if defined(ARCH_X86)
    init_cpu(&cpu_list[1]); // MMX
    TEST_EDGE(image_setedges_mmx);
    printf( "image_setedges_mmx %.3f usec crc=0x%x\n", t, crc );
    // if (crc!=0x52a7) printf( "*** CRC ERROR! ***\n" );
#endif
}
--=-T3ZzSrv5ga3wxewjct2o
Content-Disposition: attachment; filename=edge_mmx.asm
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; name=edge_mmx.asm; charset=ISO-8859-1
;/*****************************************************************************
; *
; * XVID MPEG-4 VIDEO CODEC
; * edges replication
; *
; * Copyright(C) 2002 Pascal Massimino (skal@planet-d.net)
; *
; * This program is an implementation of a part of one or more MPEG-4
; * Video tools as specified in ISO/IEC 14496-2 standard. Those intending
; * to use this software module in hardware or software products are
; * advised that its use may infringe existing patents or copyrights, and
; * any such use would be at such party's own risk. The original
; * developer of this software module and his/her company, and subsequent
; * editors and their companies, will have no liability for use of this
; * software or modifications or derivatives thereof.
; *
; * This program is free software; you can redistribute it and/or modify
; * it under the terms of the GNU General Public License as published by
; * the Free Software Foundation; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; ****************************************************************************/
bits 32

;-----------------------------------------------------------------------
; cglobal <symbol>
;   Exports <symbol> as a global label.
;   When PREFIX is defined, the symbol is exported with a leading
;   underscore and later uses of the plain name in this file are
;   redirected to the underscored one.
;-----------------------------------------------------------------------
%macro cglobal 1
    %ifdef PREFIX
        global _%1
        %define %1 _%1
    %else
        global %1
    %endif
%endmacro

section .text

cglobal image_setedges_mmx
;//////////////////////////////////////////////////////////////////////
;// void
;// image_setedges_mmx(IMAGE * image,
;// uint32_t edged_width,
;// uint32_t edged_height,
;// uint32_t width,
;// uint32_t height,
;// uint32_t interlacing)
;//////////////////////////////////////////////////////////////////////
;-----------------------------------------------------------------------
; B_TO_Q SrcLeft, Width, mmLeft, mmRight
;   %1  register pointing at the first byte of a row
;   %2  register holding the row width in bytes
;   %3  MMX register, receives the leftmost byte  [%1]      replicated 8x
;   %4  MMX register, receives the rightmost byte [%1+%2-1] replicated 8x
;
; Replicates the left and right bytes of a row into quadwords.
; Reads 8 bytes at [%1] and 8 bytes at [%1+%2-8]; the -8 offset keeps the
; right-hand read aligned in the most common case.
;
; Only baseline MMX unpack instructions are used: the previous pshufw
; form needs SSE / extended MMX, which a plain-MMX CPU does not have.
; None of these instructions modify EFLAGS, so the macro may sit between
; a 'dec' and the conditional jump that tests its result.
;-----------------------------------------------------------------------
%macro B_TO_Q 4
    movq      %4, [%1+%2-8]
    movq      %3, [%1]
    punpckhbw %4, %4            ; top word     = (last, last)
    punpcklbw %3, %3            ; bottom word  = (first, first)
    punpckhwd %4, %4            ; top dword    = last x4
    punpcklwd %3, %3            ; bottom dword = first x4
    punpckhdq %4, %4            ; whole qword  = last x8
    punpckldq %3, %3            ; whole qword  = first x8
%endmacro
; Stack addresses of the arguments as seen at function entry (only the
; return address on the stack). Code that has pushed registers must add
; the number of bytes pushed, e.g. [IMAGE +16] after four pushes.
; The slots at esp+12 (edged_height) and esp+24 (interlacing) are not used.
%define IMAGE   esp+4
%define EWIDTH  esp+8
%define WIDTH   esp+16
%define HEIGHT  esp+20

; Warning! these offsets reflect the IMAGE struct fields.
%define IMAGE_Y 0
%define IMAGE_U 4
%define IMAGE_V 8
;-----------------------------------------------------------------------
; image_setedges_mmx
;
; Replicates the border pixels of the three planes outwards:
;   Y   : 16 bytes left/right of every row, 16 rows above and below
;   U, V:  8 bytes left/right of every row,  8 rows above and below
; The top/bottom rows are copied including their left/right edges, so the
; corners are filled as well.
;
; In:    stack arguments image, edged_width (Y row stride), width, height.
;        edged_height and interlacing are not read.
; Regs:  ebx, esi, edi, ebp are saved and restored;
;        eax, ecx, edx, mm0-mm3 and the flags are clobbered.
; Stack: 4 saved registers + 1 temporary dword.
; Executes emms before returning.
;
; Limitations:
;   - The edge size is fixed at 16 (Y) / 8 (UV) pixels.
;   - The top/bottom copies advance 16 bytes (Y) / 8 bytes (UV) per step,
;     so they cover the right edge exactly only when width is a multiple
;     of 16. NOTE(review): other widths have not been tested.
;-----------------------------------------------------------------------
align 16
image_setedges_mmx:     ; approx.: ~20200c@512x384, ~21500c@640x352, ~13460c@352x240

    push ebx
    push esi
    push edi
    push ebp

    mov esi, [IMAGE  +16]       ; image   (+16: four registers pushed above)
    mov ebp, [EWIDTH +16]       ; BpS

    ; //// Y border loops ////

    ; 1) Left/Right

    mov edx, [WIDTH  +16]       ; Width
    mov edi, [HEIGHT +16]       ; Height (row counter)
    mov ebx, [esi + IMAGE_Y]    ; Image->y
    jmp .Loop_LR_Y              ; jump over the alignment padding

align 16
.Loop_LR_Y:
    dec edi                     ; flags are tested by the jle below; nothing in between changes them
    B_TO_Q ebx, edx, mm0, mm1   ; mm0 = left byte x8, mm1 = right byte x8
    movq [ebx-16], mm0
    movq [ebx+edx], mm1
    movq [ebx- 8], mm0
    movq [ebx+edx+8], mm1
    jle .Out_LR_Y
    lea ebx, [ebx+ebp]          ; next row
    jmp .Loop_LR_Y
.Out_LR_Y:

    ; 2) Bottom Y
    ; ebx is correctly positioned (last image row)

    lea ebx, [ebx+edx]          ; end of the last image row
    mov eax, 16                 ; rows to fill
    mov ecx, ebx                ; ecx = source: last image row + Width
    mov edi, -16
    sub edi, edx                ; edi = -16-Width
    mov edx, edi                ; edx = column counter
    lea ebx, [ebx+ebp]          ; ebx = destination: first row below the image
    jmp .Loop_B_Y

align 16
.Loop_B_Y:
    add edx, 16
    movq mm0, [ecx+edx-16  ]
    movq mm1, [ecx+edx-16+8]
    movq [ebx+edx-16  ], mm0
    movq [ebx+edx-16+8], mm1
    jle .Loop_B_Y               ; tests the result of 'add edx, 16'
    dec eax
    mov edx, edi                ; rewind the column counter (mov keeps the flags of dec)
    jle .Out_B_Y
    lea ebx, [ebx+ebp]          ; next destination row
    jmp .Loop_B_Y
.Out_B_Y:

    ; 3) Top Y
    ; edx and edi both hold -16-Width here (left by the bottom loop)

    mov eax, [WIDTH +16]
    xor ebp, -1
    mov ebx, [esi+ IMAGE_Y]
    inc ebp                     ; ebp = -BpS
    add ebx, eax                ; ebx = source: Y+Width (first image row)
    mov eax, 16                 ; rows to fill
    lea ecx, [ebx+ebp]          ; ecx = destination: first row above the image
    jmp .Loop_T_Y

align 16
.Loop_T_Y:
    add edx, 16
    movq mm0, [ebx+edx-16  ]
    movq mm1, [ebx+edx-16+8]
    movq [ecx+edx-16  ], mm0
    movq [ecx+edx-16+8], mm1
    jle .Loop_T_Y               ; tests the result of 'add edx, 16'
    dec eax
    jle .Out_T_Y
    mov edx, edi                ; rewind the column counter
    lea ecx, [ecx+ebp]          ; one row further up
    jmp .Loop_T_Y
.Out_T_Y:

    ; //// UV border loops ////

    ; 1) left/right

    mov esi, [IMAGE  +16]       ; image
    mov edx, [WIDTH  +16]       ; Width
    mov edi, [HEIGHT +16]       ; Height
    mov ebp, [EWIDTH +16]       ; BpS
    shr edx, 1                  ; Width/2
    shr edi, 1                  ; Height/2
    shr ebp, 1                  ; BpS/2
    mov ebx, [esi +IMAGE_U]     ; U
    mov ecx, [esi +IMAGE_V]     ; V
    jmp .Skip_LR_UV             ; first row: no pointer advance

align 16
.Loop_LR_UV:
    lea ebx, [ebx+ebp]          ; next U row
    lea ecx, [ecx+ebp]          ; next V row
.Skip_LR_UV:
    B_TO_Q ebx, edx, mm0, mm1
    B_TO_Q ecx, edx, mm2, mm3
    movq [ebx-8], mm0
    movq [ebx+edx+0], mm1
    movq [ecx-8], mm2
    movq [ecx+edx+0], mm3
    dec edi
    jg .Loop_LR_UV

    ; 2) Bottom UV
    ; ebx/ecx are correctly positioned (last U / V rows)

    lea ebx, [ebx+edx]          ; end of the last U row
    lea ecx, [ecx+edx]          ; end of the last V row
    mov edi, ecx                ; edi = V source
    mov esi, ebx                ; esi = U source
    mov eax, -16
    sub eax, edx                ; eax = -16-Width/2
    mov edx, 8                  ; rows to fill
    push eax                    ; keep the column start value at [esp]

.Loop_B_UV_y:
    mov eax, [esp]              ; rewind the column counter
    lea ebx, [ebx+ebp]          ; next U destination row
    lea ecx, [ecx+ebp]          ; next V destination row
    jmp .Loop_B_UV

align 16
.Loop_B_UV:
    add eax, 8
    movq mm0, [esi+eax]
    movq mm1, [edi+eax]
    movq [ebx+eax], mm0
    movq [ecx+eax], mm1
    jl .Loop_B_UV               ; tests the result of 'add eax, 8'
    dec edx
    jle .Out_B_UV
    jmp .Loop_B_UV_y
.Out_B_UV:

    ; 3) Top UV
    ; (+4 on the argument offsets: one extra dword is on the stack)

    mov esi, [IMAGE +16+4]      ; image
    mov edx, [WIDTH +16+4]      ; Width
    mov edi, [esi +IMAGE_U]     ; U
    mov esi, [esi +IMAGE_V]     ; V
    shr edx, 1                  ; Width/2
    lea edi, [edi+edx]          ; edi = U source: end of the first U row
    lea esi, [esi+edx]          ; esi = V source: end of the first V row
    xor ebp, -1
    mov ebx, esi                ; ebx = V destination
    inc ebp                     ; ebp = -BpS/2
    mov ecx, edi                ; ecx = U destination
    mov edx, 8                  ; rows to fill

.Loop_T_UV_y:
    mov eax, [esp]              ; rewind the column counter
    lea ebx, [ebx+ebp]          ; one V row further up
    lea ecx, [ecx+ebp]          ; one U row further up
    jmp .Loop_T_UV

align 16
.Loop_T_UV:
    add eax, 8
    movq mm0, [esi+eax]
    movq mm1, [edi+eax]
    movq [ebx+eax], mm0
    movq [ecx+eax], mm1
    jl .Loop_T_UV               ; tests the result of 'add eax, 8'
    dec edx
    jg .Loop_T_UV_y

    pop eax                     ; drop the saved column start value

    pop ebp
    pop edi
    pop esi
    pop ebx

    emms                        ; TODO: really needed?

    ret
--=-T3ZzSrv5ga3wxewjct2o--