[XviD-devel] Re: mrSAD
peter ross
xvid-devel@xvid.org
Mon, 08 Jul 2002 16:29:47 +1000
syskin,
I've reordered the C loops to extract more performance (+~25%). The MMX
version is ~3.5 times faster :-)
The functions "should" perform identically to your original ones; however,
please compare the output (e.g. encode a sample clip using the plain C and
MMX mrSAD, and compare the bitstreams).
let me know how it goes.
-- pete
// plain c code
/*
 * Sum the pixel values of a 16x8 region starting at `reference`.
 *
 *   reference : top-left pixel of the region
 *   stride    : distance in bytes between the starts of consecutive rows
 *   total     : out; total[0] = sum of columns 0..7 over the 8 rows,
 *                    total[1] = sum of columns 8..15 over the 8 rows
 */
static void __inline
sad16x8total_c(const uint8_t * reference, const int stride, int total[2])
{
	const uint8_t *row = reference;
	int left_sum = 0;
	int right_sum = 0;
	int y, x;

	for (y = 0; y < 8; y++, row += stride) {
		/* one pass per row covers both 8-pixel halves */
		for (x = 0; x < 8; x++) {
			left_sum += (int) row[x];
			right_sum += (int) row[x + 8];
		}
	}

	total[0] = left_sum;
	total[1] = right_sum;
}
/*
 * Sum of |current - reference + mean| over an 8x8 block.
 *
 * Both pointers are stepped back by `stride` BEFORE each row is read, so
 * the 8 rows processed are the ones immediately preceding the addresses
 * passed in (callers pass pointers one row past the end of the block).
 *
 *   current, reference : one row past the last row of each 8x8 block
 *   stride             : distance in bytes between consecutive rows
 *   mean               : offset added to every pixel difference
 *
 * Returns the accumulated sum.
 */
static uint32_t __inline
sad8x8mean_c(const uint8_t * current, const uint8_t * reference,
			 const uint32_t stride, const int mean)
{
	const uint8_t *cur_row = current;
	const uint8_t *ref_row = reference;
	int acc = 0;
	int rows_left = 8;

	while (rows_left-- > 0) {
		int x;

		/* walk upwards: move to the previous row before reading it */
		ref_row -= stride;
		cur_row -= stride;

		for (x = 0; x < 8; x++) {
			const int diff = (int) cur_row[x] - (int) ref_row[x] + mean;

			acc += ABS(diff);
		}
	}

	return acc;
}
/*
 * MMX implementation, defined in the assembly listing further down.
 * Writes two sums for the 16x8 region at `reference` into total[0] and
 * total[1] (left and right 8-column halves).
 */
void
sad16x8total_mmx(const uint8_t * reference, const uint32_t stride, int
total[2]);
/*
 * MMX implementation, defined in the assembly listing further down.
 * Like the assembly, callers pass `current` and `reference` pointing one
 * row past the end of the 8x8 block: each row step subtracts `stride`
 * before loading.
 */
uint32_t
sad8x8mean_mmx(const uint8_t * current, const uint8_t * reference,
const uint32_t stride, const int mean);
/*
 * Plain C mean-removed SAD for a 16x8 region, computed separately for the
 * left and right 8x8 halves.
 *
 *   cur, ref : top-left pixels of the current and reference regions
 *   stride   : distance in bytes between consecutive rows
 *   sad      : out; sad[0] = left half, sad[1] = right half
 *   means    : per-half pixel sums that are subtracted from the reference
 *              sums before dividing by 64 (pixels per 8x8 half)
 *
 * Each result is MEAN_IMPORTANCE * |mean offset| plus the SAD taken with
 * that offset added to every pixel difference.
 */
void sad16x8_c(uint8_t * cur, uint8_t * ref, const int stride,
			   int sad[2], const int means[2])
{
	/* sad8x8mean_c walks upwards, so hand it the row just past the block */
	const uint8_t *const cur_end = cur + 8*stride;
	const uint8_t *const ref_end = ref + stride * 8;
	int offset[2];
	int half;

	sad16x8total_c(ref, stride, offset);

	for (half = 0; half < 2; half++)
		offset[half] = (offset[half] - means[half])/64;

	for (half = 0; half < 2; half++)
		sad[half] = MEAN_IMPORTANCE * ABS(offset[half]);

	for (half = 0; half < 2; half++)
		sad[half] += sad8x8mean_c(cur_end + 8*half, ref_end + 8*half,
								  stride, offset[half]);
}
/*
 * MMX-backed mean-removed SAD for a 16x8 region; same contract as
 * sad16x8_c but the pixel work is done by sad16x8total_mmx and
 * sad8x8mean_mmx.
 *
 *   cur, ref : top-left pixels of the current and reference regions
 *   stride   : distance in bytes between consecutive rows
 *   sad      : out; sad[0] = left 8x8 half, sad[1] = right 8x8 half
 *   means    : per-half pixel sums subtracted from the reference sums
 *              before dividing by 64
 */
void sad16x8_mmx(uint8_t * cur, uint8_t * ref, const int stride,
				 int sad[2], const int means[2])
{
	/* sad8x8mean_mmx pre-decrements its pointers: pass one row past the end */
	const uint8_t *const cur_bottom = cur + 8*stride;
	const uint8_t *const ref_bottom = ref + stride * 8;
	int shift[2];
	int side;

	sad16x8total_mmx(ref, stride, shift);

	for (side = 0; side < 2; side++)
		shift[side] = (shift[side] - means[side])/64;

	for (side = 0; side < 2; side++)
		sad[side] = MEAN_IMPORTANCE * ABS(shift[side]);

	for (side = 0; side < 2; side++)
		sad[side] += sad8x8mean_mmx(cur_bottom + 8*side, ref_bottom + 8*side,
									stride, shift[side]);
}
// mmx code (simply paste at the eof of /src/motion/x86_asm/sad_mmx.asm)
cglobal sad16x8total_mmx
;===========================================================================
;
; void
; sad16x8total_mmx(const uint8_t * reference, const uint32_t stride,
;                  int total[2]);
;
; Sums the pixels of a 16x8 region: total[0] receives the sum of bytes
; 0..7 of each of the 8 rows, total[1] the sum of bytes 8..15.
;
; Syntax: NASM, 32-bit x86; arguments are read from the stack.
; In:     [esp+ 4] = reference, [esp+ 8] = stride, [esp+12] = total
; Out:    dwords stored at [total] and [total+4]
; Clobb:  eax, ecx, edx, mm0-mm7 (no emms is issued by this routine)
;
;===========================================================================

; Accumulate one 16-pixel row located at [eax].
;   mm4 += four word-sized partial sums of bytes 0..7
;   mm6 += four word-sized partial sums of bytes 8..15
; %1 == 1 : advance eax by ecx (stride) to the next row
; %1 == 0 : leave eax alone (used for the final row)
; Requires mm7 == 0. Clobbers mm0-mm3.
; Each word lane gains at most 2*255 per row, i.e. 4080 over 8 rows, so the
; saturating adds never actually saturate.
%macro SAD_16x8_TOTAL_MMX 1
        movq        mm0, [eax]              ; bytes 0..7
        movq        mm2, [eax+8]            ; bytes 8..15
        movq        mm1, mm0
        movq        mm3, mm2
%if %1 == 1
        lea         eax, [eax+ecx]          ; next row
%endif
        punpcklbw   mm0, mm7                ; zero-extend bytes to words
        punpckhbw   mm1, mm7
        punpcklbw   mm2, mm7
        punpckhbw   mm3, mm7
        paddusw     mm0, mm1
        paddusw     mm4, mm0                ; total0 lanes
        paddusw     mm2, mm3
        paddusw     mm6, mm2                ; total1 lanes
%endmacro

align 16
sad16x8total_mmx:
        mov         eax, [esp+ 4]           ; ref ptr
        mov         ecx, [esp+ 8]           ; stride
        mov         edx, [esp+12]           ; total ptr

        pxor        mm4, mm4                ; total0
        pxor        mm6, mm6                ; total1
        pxor        mm7, mm7                ; zero

        SAD_16x8_TOTAL_MMX 1
        SAD_16x8_TOTAL_MMX 1
        SAD_16x8_TOTAL_MMX 1
        SAD_16x8_TOTAL_MMX 1
        SAD_16x8_TOTAL_MMX 1
        SAD_16x8_TOTAL_MMX 1
        SAD_16x8_TOTAL_MMX 1
        SAD_16x8_TOTAL_MMX 0                ; last row: no pointer advance

        ; mmx_one is defined elsewhere in sad_mmx.asm; presumably four
        ; words of 1, so pmaddwd adds adjacent word lanes into two dwords.
        pmaddwd     mm4, [mmx_one]          ; collapse total0
        pmaddwd     mm6, [mmx_one]          ; collapse total1

        movq        mm5, mm4
        movq        mm7, mm6
        psrlq       mm5, 32                 ; bring high dword down
        psrlq       mm7, 32
        paddd       mm4, mm5                ; low dword = full sum
        paddd       mm6, mm7

        movd        [edx], mm4              ; total[0] = total0
        movd        [edx + 4], mm6          ; total[1] = total1

        ret
;===========================================================================
cglobal sad8x8mean_mmx
;===========================================================================
; uint32_t sad8x8mean_mmx(const uint8_t * current,
;                         const uint8_t * reference,
;                         const uint32_t stride,
;                         const int mean);
;
; Returns the sum over an 8x8 block of |cur - ref + mean|.
;
; Both pointers are decremented by stride BEFORE each row is loaded, so
; the 8 rows read are the ones immediately preceding the addresses passed
; in; callers pass pointers one row past the end of the block.
;
; Syntax: NASM, 32-bit x86; arguments are read from the stack.
; In:     [esp+ 4] = current, [esp+ 8] = reference,
;         [esp+12] = stride,  [esp+16] = mean
; Out:    eax = sum
; Clobb:  eax, ecx, edx, mm0-mm3, mm5-mm7 (no emms is issued here)
; Note:   only the low 16 bits of mean are used (word-lane arithmetic).
;===========================================================================

; Process one 8-pixel row and add its |cur - ref + mean| words into mm5.
; Pre-decrements eax (cur) and edx (ref) by ecx (stride).
; Requires mm6 = mean in all four word lanes, mm7 == 0.
; Clobbers mm0-mm3.
%macro SAD_8x8_MEAN_MMX 0
        sub         eax, ecx                ; step to previous row
        sub         edx, ecx

        movq        mm0, [eax]              ; cur
        movq        mm2, [edx]              ; ref
        movq        mm1, mm0
        movq        mm3, mm2

        punpcklbw   mm0, mm7                ; zero-extend bytes to words
        punpckhbw   mm1, mm7
        punpcklbw   mm2, mm7
        punpckhbw   mm3, mm7

        psubw       mm0, mm2                ; cur - ref
        psubw       mm1, mm3
        paddw       mm0, mm6                ; += mean
        paddw       mm1, mm6

        ; abs(x) = (x ^ m) - m, where m = 0xFFFF if x < 0 else 0
        pxor        mm2, mm2                ; mm2 = 0
        pxor        mm3, mm3
        pcmpgtw     mm2, mm0                ; mm2 = (0 > mm0)
        pcmpgtw     mm3, mm1
        pxor        mm0, mm2                ; mm0 = |mm0|
        pxor        mm1, mm3
        psubw       mm0, mm2                ; displace
        psubw       mm1, mm3

        paddw       mm0, mm1                ; sad += abs(cur - ref + mean)
        paddw       mm5, mm0
%endmacro

align 16
sad8x8mean_mmx:
        mov         eax, [esp+ 4]           ; cur ptr
        mov         edx, [esp+ 8]           ; ref ptr
        mov         ecx, [esp+12]           ; stride

        ; Broadcast the low word of mean into all four word lanes.
        ; The unpacks replicate only the low word; OR-ing shifted copies of
        ; the full 32-bit value would smear the sign-extension word
        ; (0xFFFF) of a negative mean over the upper three lanes.
        movd        mm6, [esp+16]           ; mm6 = [ 0 | 0 | hi | mean ]
        punpcklwd   mm6, mm6                ; mm6 = [ hi | hi | mean | mean ]
        punpckldq   mm6, mm6                ; mm6 = [mean|mean|mean|mean]

        pxor        mm5, mm5                ; sad
        pxor        mm7, mm7                ; zero

        SAD_8x8_MEAN_MMX
        SAD_8x8_MEAN_MMX
        SAD_8x8_MEAN_MMX
        SAD_8x8_MEAN_MMX
        SAD_8x8_MEAN_MMX
        SAD_8x8_MEAN_MMX
        SAD_8x8_MEAN_MMX
        SAD_8x8_MEAN_MMX

        ; mmx_one is defined elsewhere in sad_mmx.asm; presumably four
        ; words of 1, so pmaddwd adds adjacent word lanes into two dwords.
        pmaddwd     mm5, [mmx_one]          ; collapse sad
        movq        mm6, mm5
        psrlq       mm6, 32                 ; bring high dword down
        paddd       mm5, mm6                ; low dword = full sum

        movd        eax, mm5                ; return value

        ret
;===========================================================================
_________________________________________________________________
Chat with friends online, try MSN Messenger: http://messenger.msn.com