[XviD-devel] Re: mrSAD

peter ross xvid-devel@xvid.org
Mon, 08 Jul 2002 16:29:47 +1000


syskin,

I've reordered the C loops to extract more performance (+~25%). The MMX 
version is ~3.5 times faster :-)

The functions "should" perform identically to your original ones; however, 
please compare the output (e.g. encode a sample clip using the plain C and 
MMX mrSAD, and compare the bitstreams).

let me know how it goes.

-- pete


// plain c code



/*
 * Sum the pixel values of two side-by-side 8x8 blocks.
 *
 * reference : top-left pixel of a 16-wide, 8-high area
 * stride    : distance in bytes between the starts of consecutive rows
 * total     : out; total[0] receives the sum of columns 0..7,
 *             total[1] receives the sum of columns 8..15
 */
static void __inline
sad16x8total_c(const uint8_t * reference, const int stride, int total[2])
{
	const uint8_t *row = reference;
	int left_sum = 0;
	int right_sum = 0;
	int rows_left;

	for (rows_left = 8; rows_left > 0; rows_left--) {
		int x;

		/* walk both halves of the row in the same pass */
		for (x = 0; x < 8; x++) {
			left_sum += row[x];
			right_sum += row[x + 8];
		}

		row += stride;
	}

	total[0] = left_sum;
	total[1] = right_sum;
}



/*
 * Mean-compensated SAD of one 8x8 block.
 *
 * Both pointers are stepped back by one stride BEFORE each row is read, so
 * the rows visited are current - stride .. current - 8*stride (and likewise
 * for reference): callers pass pointers to the row just below the block.
 *
 * current   : one row past the last row of the current block
 * reference : one row past the last row of the reference block
 * stride    : distance in bytes between the starts of consecutive rows
 * mean      : offset added to every (current - reference) difference
 *
 * Returns the sum over the 64 pixels of ABS(current - reference + mean).
 */
static uint32_t __inline
sad8x8mean_c(const uint8_t * current, const uint8_t * reference,
              const uint32_t stride, const int mean)
{
	int acc = 0;
	int rows_left = 8;

	while (rows_left-- > 0) {
		int col;

		/* move up one row, then process it */
		current -= stride;
		reference -= stride;

		for (col = 0; col < 8; col++) {
			const int diff = (int)current[col] - (int)reference[col] + mean;
			acc += ABS(diff);
		}
	}

	return acc;
}


/*
 * Prototypes for the MMX routines whose bodies are written in assembly
 * (labels sad16x8total_mmx and sad8x8mean_mmx). They take the same
 * arguments as the plain C sad16x8total_c / sad8x8mean_c.
 */
void
sad16x8total_mmx(const uint8_t * reference,
                 const uint32_t stride,
                 int total[2]);

uint32_t
sad8x8mean_mmx(const uint8_t * current,
               const uint8_t * reference,
               const uint32_t stride,
               const int mean);


/*
 * Mean-removed SAD for two horizontally adjacent 8x8 blocks (plain C).
 *
 * cur    : top-left pixel of the 16x8 area in the current picture
 * ref    : top-left pixel of the 16x8 area in the reference picture
 * stride : distance in bytes between the starts of consecutive rows
 * sad    : out; sad[0] for columns 0..7, sad[1] for columns 8..15
 * means  : per-block value subtracted from the reference pixel sum
 *          (presumably the pixel sums of the current blocks -- confirm
 *          against the caller)
 *
 * For each block k:
 *   offset = (sum of reference pixels - means[k]) / 64
 *   sad[k] = MEAN_IMPORTANCE * ABS(offset)
 *          + sum of ABS(cur - ref + offset) over the block
 */
void sad16x8_c(uint8_t * cur, uint8_t * ref, const int stride,
                int sad[2], const int means[2])
{
	/* sad8x8mean_c walks upwards, so hand it the row just below the block */
	const uint8_t *cur_below = cur + 8*stride;
	const uint8_t *ref_below = ref + stride * 8;
	int offset[2];
	int k;

	sad16x8total_c(ref, stride, offset);

	/* turn both pixel sums into per-pixel mean offsets before touching sad[] */
	for (k = 0; k < 2; k++)
		offset[k] = (offset[k] - means[k])/64;

	for (k = 0; k < 2; k++) {
		sad[k] = MEAN_IMPORTANCE * ABS(offset[k]);
		sad[k] += sad8x8mean_c(cur_below + 8*k, ref_below + 8*k, stride, offset[k]);
	}
}



/*
 * Mean-removed SAD for two horizontally adjacent 8x8 blocks, using the MMX
 * helpers sad16x8total_mmx and sad8x8mean_mmx. Same arguments and the same
 * sequence of steps as sad16x8_c.
 *
 * cur    : top-left pixel of the 16x8 area in the current picture
 * ref    : top-left pixel of the 16x8 area in the reference picture
 * stride : distance in bytes between the starts of consecutive rows
 * sad    : out; sad[0] for columns 0..7, sad[1] for columns 8..15
 * means  : per-block value subtracted from the reference pixel sum
 */
void sad16x8_mmx(uint8_t * cur, uint8_t * ref, const int stride,
                int sad[2], const int means[2])
{
	int offset[2];
	/* the 8x8 helper steps back one row before reading, so start below the block */
	const uint8_t *const cur_bottom = cur + 8*stride;
	const uint8_t *const ref_bottom = ref + stride * 8;

	/* raw pixel sums of the left and right reference blocks */
	sad16x8total_mmx(ref, stride, offset);

	offset[0] = (offset[0] - means[0])/64;
	offset[1] = (offset[1] - means[1])/64;

	/* left block */
	sad[0] = MEAN_IMPORTANCE * ABS(offset[0]);
	sad[0] += sad8x8mean_mmx(cur_bottom, ref_bottom, stride, offset[0]);

	/* right block */
	sad[1] = MEAN_IMPORTANCE * ABS(offset[1]);
	sad[1] += sad8x8mean_mmx(cur_bottom + 8, ref_bottom + 8, stride, offset[1]);
}



// mmx code (simply paste at the eof of /src/motion/x86_asm/sad_mmx.asm)


cglobal  sad16x8total_mmx

;===========================================================================
;
; void
; sad16x8total_mmx(const uint8_t * reference, const uint32_t stride,
;                  int total[2]);
;
; Sums the pixels of a 16-wide, 8-high area:
;   total[0] = sum of columns 0..7, total[1] = sum of columns 8..15.
;
; Arguments are read from the stack at [esp+4], [esp+8], [esp+12].
; Uses eax, ecx, edx and mm0-mm7. No emms is executed here.
;
;===========================================================================

; Process one 16-pixel row at [eax].
;   mm7 must be zero (used to widen bytes to words)
;   mm4 accumulates the left 8 pixels, mm6 the right 8 pixels,
;       as four 16-bit partial sums each
;   %1 == 1 : advance eax to the next row (eax += ecx) after loading
;   %1 == 0 : leave eax untouched (used for the last row)
%macro SAD_16x8_TOTAL_MMX 1
    movq mm0, [eax]         ; pixels 0..7
    movq mm2, [eax+8]       ; pixels 8..15
    movq mm1,mm0
    movq mm3,mm2

%if %1 == 1
    lea eax,[eax+ecx]
%endif

    punpcklbw mm0,mm7       ; bytes -> words, low halves
    punpckhbw mm1,mm7       ; bytes -> words, high halves
    punpcklbw mm2,mm7
    punpckhbw mm3,mm7

    paddusw mm0,mm1         ; fold the 8 left pixels into 4 words
    paddusw mm4,mm0         ; total0 += left
    paddusw mm2,mm3         ; fold the 8 right pixels into 4 words
    paddusw mm6,mm2         ; total1 += right
%endmacro

align 16
sad16x8total_mmx:

    mov eax, [esp+ 4] ; ref ptr
    mov ecx, [esp+ 8] ; stride
    mov edx, [esp+12] ; total ptr

    pxor mm4, mm4 ; total0
    pxor mm6, mm6 ; total1
    pxor mm7, mm7 ; zero

    SAD_16x8_TOTAL_MMX 1
    SAD_16x8_TOTAL_MMX 1
    SAD_16x8_TOTAL_MMX 1
    SAD_16x8_TOTAL_MMX 1
    SAD_16x8_TOTAL_MMX 1
    SAD_16x8_TOTAL_MMX 1
    SAD_16x8_TOTAL_MMX 1
    SAD_16x8_TOTAL_MMX 0

    ; mmx_one is defined elsewhere in the asm file (presumably four words
    ; of 1 -- confirm); pmaddwd then adds adjacent word pairs into dwords.
    pmaddwd mm4, [mmx_one]  ; collapse total0
    pmaddwd mm6, [mmx_one]  ; collapse total1
    movq mm5, mm4
    movq mm7, mm6
    psrlq mm5, 32           ; bring the high dword down
    psrlq mm7, 32
    paddd mm4, mm5          ; low dword = full sum
    paddd mm6, mm7
    movd [edx], mm4         ; total[0] = total0
    movd [edx + 4], mm6     ; total[1] = total1

    ret
;===========================================================================





cglobal  sad8x8mean_mmx

;===========================================================================
; uint32_t sad8x8mean_mmx(const uint8_t * current,
;                         const uint8_t * reference,
;                         const uint32_t stride,
;                         const int mean);
;
; Returns (in eax) the sum over an 8x8 block of |cur - ref + mean|.
;
; Both pointers are decremented by stride BEFORE each row is read, so the
; caller passes pointers to the row just below the block.
;
; Arguments are read from the stack at [esp+4] .. [esp+16].
; Uses eax, ecx, edx and mm0-mm3, mm5-mm7. No emms is executed here.
;===========================================================================

; Process one 8-pixel row.
;   eax = cur ptr, edx = ref ptr, ecx = stride
;   mm6 = mean replicated into four 16-bit lanes
;   mm7 = zero, mm5 = running sum (four 16-bit partial sums)
%macro SAD_8x8_MEAN_MMX 0
    sub eax, ecx            ; step both pointers up one row first
    sub edx, ecx

    movq mm0, [eax]         ; cur
    movq mm2, [edx]         ; ref
    movq mm1, mm0
    movq mm3, mm2
    punpcklbw mm0,mm7       ; widen bytes to words
    punpckhbw mm1,mm7
    punpcklbw mm2,mm7
    punpckhbw mm3,mm7

    psubw   mm0, mm2        ; cur - ref
    psubw   mm1, mm3
    paddw   mm0, mm6        ; += mean
    paddw   mm1, mm6

    pxor    mm2, mm2        ; mm2 = 0
    pxor    mm3, mm3        ;
    pcmpgtw mm2, mm0        ; mm2 = (0 > mm0) ? 0xFFFF : 0
    pcmpgtw mm3, mm1        ;
    pxor    mm0, mm2        ; (x ^ mask) - mask == |x|
    pxor    mm1, mm3        ;
    psubw   mm0, mm2        ; displace
    psubw   mm1, mm3        ;

    paddw mm0, mm1          ; sad += abs(cur - ref + mean)
    paddw mm5, mm0

%endmacro


align 16
sad8x8mean_mmx:
    mov eax, [esp+ 4] ; cur ptr
    mov edx, [esp+ 8] ; ref ptr
    mov ecx, [esp+12] ; stride

    ; Broadcast the low 16 bits of mean into all four word lanes.
    ; Shifting and OR-ing the full 32-bit value is only correct when its
    ; upper 16 bits are zero; for a negative mean the sign-extension bits
    ; (0xFFFF) get OR-ed into lanes 1..3 and turn them into -1.
    movd mm6, [esp+16]      ; mm6 = [ 0 | 0 | hi16 | lo16 ]
    punpcklwd mm6, mm6      ; mm6 = [ hi16 | hi16 | lo16 | lo16 ]
    punpckldq mm6, mm6      ; mm6 = [ mean | mean | mean | mean ]

    pxor mm5, mm5 ; sad
    pxor mm7, mm7 ; zero

    SAD_8x8_MEAN_MMX
    SAD_8x8_MEAN_MMX
    SAD_8x8_MEAN_MMX
    SAD_8x8_MEAN_MMX
    SAD_8x8_MEAN_MMX
    SAD_8x8_MEAN_MMX
    SAD_8x8_MEAN_MMX
    SAD_8x8_MEAN_MMX

    ; mmx_one is defined elsewhere in the asm file (presumably four words
    ; of 1 -- confirm); pmaddwd then adds adjacent word pairs into dwords.
    pmaddwd mm5, [mmx_one] ; collapse sad
    movq mm6, mm5
    psrlq mm6, 32           ; bring the high dword down
    paddd mm5, mm6          ; low dword = full sum
    movd eax, mm5           ; return value

    ret
;===========================================================================


_________________________________________________________________
Chat with friends online, try MSN Messenger: http://messenger.msn.com