Re[2]: [XviD-devel] 15% faster search16, if anyone is interested

Radoslaw Czyz xvid-devel@xvid.org
Fri, 12 Jul 2002 19:14:19 +0930


> I just tried this:
> ---------------------------------------
> int32_t
> sad8x4_c(const uint8_t * const cur,
>                   const uint8_t * const ref,
>                   const uint32_t stride,
>                   const uint32_t best_sad)
> {
>         int sad = sad8(cur,ref,stride);
>         if (sad<best_sad)
>                 sad += sad8(cur+8,ref+8,stride);
>         if (sad<best_sad)
>                 sad += sad8(cur+8*stride,ref+8*stride,stride);
>         if (sad<best_sad)
>                 sad += sad8(cur+8*stride+8,ref+8*stride+8,stride);

>         return sad;

> }

CL> (where sad8 = sad8_xmm) and it was 4-5% slower than sad16_xmm.

I already explained that I was wrong about this particular thing, but
I'll paste my code if you want to see it:

/*
 * CHECK_CANDIDATE(X, Y, D) -- used instead of the previous macro.
 *
 * Calls through the CheckCandidate function pointer with candidate vector
 * (X, Y) and direction D.  All remaining arguments (medianMV, currentMV,
 * maximums, iMinSAD, cur, Ref, iDirection, iQuant, iEdgedWidth, iFcode,
 * inter4v) are picked up by name from the scope in which the macro is
 * expanded, so they must be visible at every call site.
 *
 * NOTE: the expansion is a plain brace-enclosed block, not a
 * do { ... } while (0) statement, so "if (c) CHECK_CANDIDATE(...); else ..."
 * will not parse as intended.
 */
#define CHECK_CANDIDATE(X,Y,D) { \
(*CheckCandidate)((X), (Y), medianMV, currentMV, maximums, iMinSAD, \
                cur, Ref, &iDirection, (D), iQuant, iEdgedWidth, iFcode, inter4v); }

/* and a function */
/*
 * Test the candidate motion vector (x, y) against the best matches so far.
 *
 * x, y       candidate vector; the parity of each component selects one of
 *            the four reference pointers (presumably half-pel units with
 *            ref[1..3] being interpolated planes -- confirm against caller).
 * medianMV   vector whose difference from (x, y) is passed to
 *            calc_delta_16() / calc_delta_8().
 * currentMV  [0] receives (x, y) when the whole-block total improves;
 *            [1..4] receive it per quadrant when inter4v is non-zero.
 * maximums   {min x, max x, min y, max y}; candidates outside are ignored.
 * iMinSAD    best values so far, indexed like currentMV; updated in place.
 * Cur        top-left of the current block.
 * ref        four reference pointers: [0] x,y even; [1] x odd; [2] y odd;
 *            [3] both odd.
 * dir        set to Direction when the whole-block total improves.
 * stride     distance between consecutive rows, in bytes.
 * inter4v    zero: abort as soon as the running total reaches iMinSAD[0].
 *            non-zero: always evaluate all four quadrants and also track
 *            the per-quadrant minima.
 */
void CheckCandidate16(int x, int y, const VECTOR medianMV, VECTOR currentMV[],
                                        const int maximums[], int32_t iMinSAD[], const uint8_t * Cur,
                                        const uint8_t * ref[],  int *dir, const int Direction,
                                        const uint32_t iQuant, const uint32_t stride,
                                        const uint32_t iFcode, const int inter4v)
{
        /*
         * Signed copy of the stride.  Multiplying a negative row offset by
         * the unsigned stride would wrap to a huge positive value, which
         * only yields the intended address when pointers are 32 bits wide.
         */
        const int32_t istride = (int32_t)stride;
        const uint8_t *Reference;
        const uint8_t *Current = Cur;
        int32_t sad[5];
        int i;

        if ((x > maximums[1]) || (x < maximums[0])
                || (y > maximums[3]) || (y < maximums[2])) {
                return;
        }

        /*
         * For an odd component, (v-1)/2 rounds toward negative infinity,
         * so the integer part is correct for negative vectors as well.
         */
        switch (((x & 1) << 1) + (y & 1)) {
        case 0:
                Reference = ref[0] + x / 2 + (y / 2) * istride;
                break;
        case 1:
                Reference = ref[2] + x / 2 + ((y - 1) / 2) * istride;
                break;
        case 2:
                Reference = ref[1] + (x - 1) / 2 + (y / 2) * istride;
                break;
        default:
                Reference = ref[3] + (x - 1) / 2 + ((y - 1) / 2) * istride;
                break;
        }

        /* sad[0] is the whole-block total, sad[1..4] the four quadrants. */
        sad[0] = calc_delta_16(x - medianMV.x, y - medianMV.y, iFcode, iQuant);

        /* top-left quadrant */
        sad[1] = sad8(Current, Reference, stride);
        sad[0] += sad[1];
        if (!inter4v) {
                if (sad[0] >= iMinSAD[0]) {
                        return;
                }
        } else {
                /* Added after sad[0] was updated, so it is not part of sad[0]. */
                sad[1] += calc_delta_8(x - medianMV.x, y - medianMV.y, iFcode, iQuant);
        }

        /* top-right quadrant */
        sad[2] = sad8(Current + 8, Reference + 8, stride);
        sad[0] += sad[2];
        if (!inter4v && sad[0] >= iMinSAD[0]) {
                return;
        }

        Reference += 8 * istride;
        Current += 8 * istride;

        /* bottom-left quadrant */
        sad[3] = sad8(Current, Reference, stride);
        sad[0] += sad[3];
        if (!inter4v && sad[0] >= iMinSAD[0]) {
                return;
        }

        /* bottom-right quadrant */
        sad[4] = sad8(Current + 8, Reference + 8, stride);
        sad[0] += sad[4];

        if (sad[0] < iMinSAD[0]) {
                iMinSAD[0] = sad[0];
                currentMV[0].x = x;
                currentMV[0].y = y;
                *dir = Direction;
        }

        if (inter4v) {
                for (i = 1; i < 5; i++) {
                        if (sad[i] < iMinSAD[i]) {
                                iMinSAD[i] = sad[i];
                                currentMV[i].x = x;
                                currentMV[i].y = y;
                        }
                }
        }
}