[XviD-devel] mmx transfer8x8/16x16 optimization, setedges, etc.
peter ross
xvid-devel@xvid.org
Tue, 16 Jul 2002 10:02:38 +1000
at the request of alban, ive assembled two new functions. i say assembled
because it basically involved copying and pasting that "skals" chap's code.
xvid currently has a transfer8x8_copy_[c,mmx] function, which copies an 8x8
block from one buffer to another. this function is limited by the assumption
that both buffers have the same stride. the new functions,
transfer8x8copy_[c,mmx] and transfer16x16copy_[c,mmx] provide stride arguments
for both destination and source buffers.
NOW, on this machine (p3,800mhz) the new 8x8copy function is *1* cycle
slower than current 8x8_copy function. so i reckon we can replace it with
the new one (rather than maintaining two pieces of near identical code). any
opinions?
speedwise, the mmx8x8 copy is ~25% faster than the c/memcpy version. the
mmx16x16 copy is ~75% faster than the c/memcpy version.
alban this should speed up your ouput_mb function a little. AND, the
16x16copy can now replace the four 8x8copy's used to copy not-coded mbs (in
decoder_pframe function). please perform some tests alban and post the
results. Note: *I* will commit this code in a day-or-so. btw, the 16x16
could also be sse2'd.
alban and i were discussing the possibility of "on-the-fly" set_edges for decoding. why
bother? well, there's two reasons.
1. potential for added decoding speed. i suspect that the edges are rarely
used, so by not calling image_setedges we might save some time.
2. support for mode-1 direct rendering. in this mode, xvid uses the VIDEO-RAM
as its internal reference-frame buffer. the problem with doing so is that
VIDEO-RAM has no room for edges (hence the need for on-the-fly).
also, i've been sitting on ircnet#xvid a little bit lately. syskin & mf
asked that you, the dev team, drop by some time for a chin wag (that's slang
for CHAT).
-- pete
;----
/*
 * Copy an 8x8 block of bytes from src to dst.
 *
 * dst_stride and src_stride are the byte distances between the start of
 * consecutive rows in the destination and source buffers respectively;
 * the two buffers do not need to share a stride.
 */
void transfer8x8copy_c(uint8_t * dst,
const uint8_t * src,
const uint32_t dst_stride,
const uint32_t src_stride)
{
	int rows_left = 8;

	while (rows_left-- > 0) {
		/* one row = 8 bytes */
		memcpy(dst, src, 8);

		/* step each pointer to the next row of its own buffer */
		dst += dst_stride;
		src += src_stride;
	}
}
/*
 * Copy a 16x16 block of bytes from src to dst.
 *
 * Each buffer is walked with its own stride (bytes from the start of one
 * row to the start of the next): dst_stride for dst, src_stride for src.
 */
void transfer16x16copy_c(uint8_t * dst,
const uint8_t * src,
const uint32_t dst_stride,
const uint32_t src_stride)
{
	enum { BLOCK_ROWS = 16, ROW_BYTES = 16 };
	int remaining;

	for (remaining = BLOCK_ROWS; remaining != 0; remaining--) {
		memcpy(dst, src, ROW_BYTES);
		src += src_stride;
		dst += dst_stride;
	}
}
cglobal transfer8x8copy_mmx

;---------------------------------------------------------------------------
; void transfer8x8copy_mmx(uint8_t * const dst,
;                          const uint8_t * const src,
;                          const uint32_t dst_stride,
;                          const uint32_t src_stride);
;
; Copies an 8x8 block of bytes from src to dst; each buffer is stepped
; with its own stride.
;
; Syntax: NASM, 32-bit x86. All four arguments are read from the stack.
;
; Register roles inside the routine:
;   eax = current source row          ebx = source stride (saved/restored)
;   ecx = current destination row     edx = destination stride
;   mm0, mm1 = the two rows in flight
;
; Writes eax, ecx, edx, mm0, mm1. No emms is executed here.
;---------------------------------------------------------------------------

; One expansion moves TWO 8-byte rows (not a whole 8x8 block, despite the
; name) and advances both row pointers by two strides.
%macro COPY_8x8_MMX 0
        movq    mm0, [eax]                  ; row n
        movq    mm1, [eax + ebx]            ; row n+1
        movq    [ecx], mm0
        lea     eax, [eax + 2*ebx]          ; src += 2 * src_stride
        movq    [ecx + edx], mm1
        lea     ecx, [ecx + 2*edx]          ; dst += 2 * dst_stride
%endmacro

align 16
transfer8x8copy_mmx:
        push    ebx                         ; ebx is restored before return

        ; the extra +4 accounts for the ebx just pushed
        mov     ecx, [esp + 4 +  4]         ; dst
        mov     eax, [esp + 4 +  8]         ; src
        mov     edx, [esp + 4 + 12]         ; dst_stride
        mov     ebx, [esp + 4 + 16]         ; src_stride

        ; 4 expansions x 2 rows = 8 rows
%rep 4
        COPY_8x8_MMX
%endrep

        pop     ebx
        ret
cglobal transfer16x16copy_mmx

;---------------------------------------------------------------------------
; void transfer16x16copy_mmx(uint8_t * const dst,
;                            const uint8_t * const src,
;                            const uint32_t dst_stride,
;                            const uint32_t src_stride);
;
; Copies a 16x16 block of bytes from src to dst; each buffer is stepped
; with its own stride.
;
; Syntax: NASM, 32-bit x86. All four arguments are read from the stack.
;
; Register roles inside the routine:
;   eax = current source row          ebx = source stride (saved/restored)
;   ecx = current destination row     edx = destination stride
;   mm0..mm3 = the four 8-byte halves of the two rows in flight
;
; Writes eax, ecx, edx, mm0-mm3. No emms is executed here.
;---------------------------------------------------------------------------

; One expansion moves TWO 16-byte rows (not a whole 16x16 block, despite
; the name) and advances both row pointers by two strides.
%macro COPY_16x16_MMX 0
        movq    mm0, [eax]                  ; row n,   bytes 0..7
        movq    mm1, [eax + 8]              ; row n,   bytes 8..15
        movq    mm2, [eax + ebx]            ; row n+1, bytes 0..7
        movq    mm3, [eax + ebx + 8]        ; row n+1, bytes 8..15
        movq    [ecx], mm0
        lea     eax, [eax + 2*ebx]          ; src += 2 * src_stride
        movq    [ecx + 8], mm1
        movq    [ecx + edx], mm2
        movq    [ecx + edx + 8], mm3
        lea     ecx, [ecx + 2*edx]          ; dst += 2 * dst_stride
%endmacro

align 16
transfer16x16copy_mmx:
        push    ebx                         ; ebx is restored before return

        ; the extra +4 accounts for the ebx just pushed
        mov     ecx, [esp + 4 +  4]         ; dst
        mov     eax, [esp + 4 +  8]         ; src
        mov     edx, [esp + 4 + 12]         ; dst_stride
        mov     ebx, [esp + 4 + 16]         ; src_stride

        ; 8 expansions x 2 rows = 16 rows
%rep 8
        COPY_16x16_MMX
%endrep

        pop     ebx
        ret
;---
_________________________________________________________________
Chat with friends online, try MSN Messenger: http://messenger.msn.com