Re[2]: [XviD-devel] 15% faster search16, if anyone is interested
Radoslaw Czyz
xvid-devel@xvid.org
Fri, 12 Jul 2002 19:14:19 +0930
> I just tried this:
> ---------------------------------------
> int32_t
> sad8x4_c(const uint8_t * const cur,
> const uint8_t * const ref,
> const uint32_t stride,
> const uint32_t best_sad)
> {
> int sad = sad8(cur,ref,stride);
> if (sad<best_sad)
> sad += sad8(cur+8,ref+8,stride);
> if (sad<best_sad)
> sad += sad8(cur+8*stride,ref+8*stride,stride);
> if (sad<best_sad)
> sad += sad8(cur+8*stride+8,ref+8*stride+8,stride);
> return sad;
> }
CL> (where sad8 = sad8_xmm) and it was 4-5% slower than sad16_xmm.
I already explained that I was wrong about this particular thing, but
I'll paste my code if you want to see it:
/* instead of macro */
/*
 * Hands the candidate vector (X, Y), tagged with direction D, to whatever
 * routine CheckCandidate currently designates.
 *
 * Every other argument (medianMV, currentMV, maximums, iMinSAD, cur, Ref,
 * iDirection, iQuant, iEdgedWidth, iFcode, inter4v) is picked up by name
 * from the scope in which the macro is expanded, so all of them must be
 * visible at each use site.
 *
 * The expansion is a brace-enclosed block, not an expression.
 */
#define CHECK_CANDIDATE(X,Y,D) { \
	(*CheckCandidate)( \
		(X), (Y), \
		medianMV, currentMV, maximums, iMinSAD, \
		cur, Ref, &iDirection, \
		(D), \
		iQuant, iEdgedWidth, iFcode, inter4v); \
}
/* and a function */
/*
 * Score the candidate vector (x, y) for a 16x16 block and record it wherever
 * it beats the best result stored so far.
 *
 * The block is measured as four 8x8 quadrants with sad8(). sad[0] accumulates
 * calc_delta_16() plus the four quadrant values; sad[1..4] hold the individual
 * quadrant values (top-left, top-right, bottom-left, bottom-right).
 *
 * Parameters:
 *   x, y       candidate vector; the low bit of each coordinate selects the
 *              reference plane (presumably half-pel units -- TODO confirm)
 *   medianMV   subtracted from (x, y) before the calc_delta_* cost is computed
 *   currentMV  [0] receives (x, y) when the 16x16 score improves; [1..4]
 *              receive it per quadrant, only when inter4v is non-zero
 *   maximums   inclusive limits in the order {min x, max x, min y, max y}
 *   iMinSAD    best scores so far, indexed like currentMV; updated in place
 *   Cur        top-left sample of the current 16x16 block
 *   ref        four reference planes: [0] x and y even, [1] x odd only,
 *              [2] y odd only, [3] both odd
 *   dir        set to Direction when the 16x16 score improves
 *   Direction  value stored through dir on improvement
 *   iQuant     forwarded to calc_delta_16() / calc_delta_8()
 *   stride     distance in bytes between consecutive rows of Cur and ref[]
 *   iFcode     forwarded to calc_delta_16() / calc_delta_8()
 *   inter4v    non-zero: track the four quadrants individually and never
 *              leave early; zero: give up as soon as the running 16x16 total
 *              can no longer beat iMinSAD[0]
 *
 * Candidates outside the limits in maximums are ignored.
 */
void CheckCandidate16(int x, int y, const VECTOR medianMV, VECTOR currentMV[],
		const int maximums[], int32_t iMinSAD[], const uint8_t * Cur,
		const uint8_t * ref[], int *dir, const int Direction,
		const uint32_t iQuant, const uint32_t stride,
		const uint32_t iFcode, const int inter4v)
{
	/*
	 * Row offsets can be negative. Multiplying them by the unsigned stride
	 * would wrap to a huge unsigned value, which only lands on the right
	 * address when pointers are 32 bits wide. Do the arithmetic signed.
	 */
	const int32_t pitch = (int32_t)stride;
	/* Both pointers only read pixels; keep the callers' const qualifier. */
	const uint8_t *Reference;
	const uint8_t *Current = Cur;
	int32_t sad[5];
	int i;

	if (x < maximums[0] || x > maximums[1] ||
	    y < maximums[2] || y > maximums[3]) {
		return;
	}

	/* Odd coordinates are reduced by one before halving. */
	switch (((x & 1) << 1) + (y & 1)) {
	case 0:
		Reference = ref[0] + (x / 2 + (y / 2) * pitch);
		break;
	case 1:
		Reference = ref[2] + (x / 2 + ((y - 1) / 2) * pitch);
		break;
	case 2:
		Reference = ref[1] + ((x - 1) / 2 + (y / 2) * pitch);
		break;
	default:
	case 3:
		Reference = ref[3] + ((x - 1) / 2 + ((y - 1) / 2) * pitch);
		break;
	}

	sad[0] = calc_delta_16(x - medianMV.x, y - medianMV.y, iFcode, iQuant);

	/* Top-left quadrant. */
	sad[1] = sad8(Current, Reference, stride);
	sad[0] += sad[1];
	if (!inter4v) {
		if (sad[0] >= iMinSAD[0]) {
			return;
		}
	} else {
		/* Only the first quadrant's score gets the calc_delta_8 term. */
		sad[1] += calc_delta_8(x - medianMV.x, y - medianMV.y, iFcode, iQuant);
	}

	/* Top-right quadrant. */
	sad[2] = sad8(Current + 8, Reference + 8, stride);
	sad[0] += sad[2];
	if (!inter4v && sad[0] >= iMinSAD[0]) {
		return;
	}

	/* Step both pointers down eight rows to the lower half of the block. */
	Reference += 8 * stride;
	Current += 8 * stride;

	/* Bottom-left quadrant. */
	sad[3] = sad8(Current, Reference, stride);
	sad[0] += sad[3];
	if (!inter4v && sad[0] >= iMinSAD[0]) {
		return;
	}

	/* Bottom-right quadrant. */
	sad[4] = sad8(Current + 8, Reference + 8, stride);
	sad[0] += sad[4];

	if (sad[0] < iMinSAD[0]) {
		iMinSAD[0] = sad[0];
		currentMV[0].x = x;
		currentMV[0].y = y;
		*dir = Direction;
	}

	if (inter4v) {
		for (i = 1; i < 5; i++) {
			if (sad[i] < iMinSAD[i]) {
				iMinSAD[i] = sad[i];
				currentMV[i].x = x;
				currentMV[i].y = y;
			}
		}
	}
}