[XviD-devel] Quality optimization

Thu Feb 27 16:30:45 CET 2003

	Hi,

On Wed, 2003-02-26 at 19:24, Christoph Lampert wrote:

> IDCT is 
> 
> PLAINC -  1.395 usec   (<- slower than fDCT?)

	most of the time, yes, because iDCT needs a final [-256,255] clipping...

> MMX    -  0.219 usec
> MMXEXT -  0.199 usec
> SSE2   -  0.219 usec
> 3DNOW  -  1.247 usec
> 3DNOWE -  0.184 usec
> 
> whereas Hadamard is 
> 
> PLAINC  - 0.549 usec
> MMXEXT  - 0.089 usec
> 
> 0.089 is about the time of sad16() with MMXEXT needs, too,
> so a search routine based on hadamard+sad should not slow things down too
> much.

	That's not that easy :)
	In fact, having the Hadamard transform done doesn't mean
	you're done with the hard work. Taking the abs values with
	MMX is painful. For SSE, we have the mighty 'psadbw' instr.,
	but it works on 8-bit data, whereas the output of Hadamard
	is 11 bits (yes, it's scaled by 8, not 64! :). So? Should we
	descale 11->8 bits? Another idea would be to multiply the
	Hadamard output by a pseudo-quant matrix that mimics the real
	quantizers (and the missing cosines, maybe)... Dunno.
	Anyway, here's a Hadamard_SAD for 8x8 or 16x16 byte input,
	as a replacement for SAD. I'm not quite satisfied with it,
	mainly because of the above (and not just because it's
	8 times slower than pure SAD :)

	bye!

			Skal

I
> 
> 
> 
> 
> Btw. what we would need in the end is a SATD function (SAD of 
> transformed), so either, we would have to do 
> 
> SAD (  Hadamard(Cur) , Hadamard(Ref) )   (*)
> 
> with usual sad-routine or 
> 
> sum(abs(Hadamard(Cur - Ref)))            (**)
> 
> In theory these should be identical (Hadamard is linear), but maybe they
> are not...  
> 
> Anyway, would it be faster to combine these steps into a larger routine,
> or rather not? Again, I would believe it would, because for (**) the
> result of Hadamard doesn't have to be saved, only summed up, but of course
> I'm no expert... 

-------------- next part --------------

; int xvid_Hadamard_SAD_8x8_MMX(const uint8_t *Src1, const uint8_t *Src2, int BpS)
; int xvid_Hadamard_SAD_16x16_MMX(const uint8_t *Src1, const uint8_t *Src2, int BpS)

; Exported entry points of this file.
; NOTE(review): 'cglobal' is not defined in this excerpt; presumably it is a
; macro supplied elsewhere that declares the symbol global (and may apply
; platform name decoration) -- confirm against the including file.
cglobal xvid_Hadamard_SAD_8x8_MMX
cglobal xvid_Hadamard_SAD_16x16_MMX

;//////////////////////////////////////////////////////////////////////

section .data

; Eight 16-bit words, each equal to 1 (16 bytes in total).
; Meant as a pmaddwd multiplier: multiplying every word by 1 and adding
; adjacent pairs folds four word lanes into two dword lanes. pmaddwd treats
; the words as signed. An MMX (64-bit) operand only reads the first 4 words.
One:     times 8  dw 1     ; for summing 4 words

section .text

;//////////////////////////////////////////////////////////////////////
;// Hadamard SAD

%macro BUTF2 4   ; a, b, c, d  (four distinct MMX registers)
  ; Two independent in-place butterflies on packed 16-bit words:
  ;   (a, b) -> (a-b, a+b)    and    (c, d) -> (c-d, c+d)
  ; The difference is formed as 2x-(x+y), so no scratch register is needed.

  ; butterfly on the first pair
  paddw %2, %1   ; b <- a+b
  paddw %1, %1   ; a <- 2a
  psubw %1, %2   ; a <- 2a-(a+b) = a-b

  ; butterfly on the second pair
  paddw %4, %3   ; d <- c+d
  paddw %3, %3   ; c <- 2c
  psubw %3, %4   ; c <- 2c-(c+d) = c-d
%endmacro

%macro ADD_ABS 2   ; two input regs (must not be mm5, mm6 or mm7)
  ; Adds the per-lane absolute values of both inputs (packed signed words)
  ; to the running accumulator in mm6. Uses |x| = (x ^ m) - m, where m is
  ; the lane's sign mask (0xFFFF if x < 0, else 0).
  ; Clobbers mm5 and mm7; the inputs themselves are left untouched.

  ; first operand, via mm7
  pxor    mm7, mm7
  pcmpgtw mm7, %1    ; mm7 = sign mask (0 > x)
  psubw   mm6, mm7   ; acc -= m   (i.e. +1 for each negative lane)
  pxor    mm7, %1    ; mm7 = x ^ m
  paddw   mm6, mm7   ; acc += x ^ m

  ; second operand, via mm5
  pxor    mm5, mm5
  pcmpgtw mm5, %2    ; mm5 = sign mask (0 > y)
  psubw   mm6, mm5   ; acc -= m
  pxor    mm5, %2    ; mm5 = y ^ m
  paddw   mm6, mm5   ; acc += y ^ m
%endmacro

;//////////////////////////////////////////////////////////////////////

%macro HADAMARD_SAD_HPASS 3   ; %1:dst offset1 %2:dst offset2  %3:src offset [eax=cur,ecx=ref]
    ; Horizontal pass for ONE row of 8 pixels:
    ;   - loads 8 bytes from [eax+arg3] and 8 bytes from [ecx+arg3],
    ;   - forms the 8 word differences cur[i]-ref[i],
    ;   - runs three add/sub stages (an 8-point Hadamard butterfly network),
    ;   - stores the 4 'sum' outputs  (8 bytes) at [esp+arg1]
    ;     and    the 4 'diff' outputs (8 bytes) at [esp+arg2].
    ; Requires: mm6 == 0 on entry (used to zero-extend bytes to words);
    ;           mm6 is not modified here.
    ; Clobbers: mm0, mm1, mm2, mm3, mm7.

    ; first, upload 8b->16b the diff Src1[i]-Src2[i]

  movd   mm0, [eax+%3]    ; cur pixels [0123]
  punpcklbw mm0, mm6      ; bytes -> words (mm6 is zero)
  movd   mm2, [ecx+%3]    ; ref pixels [0123]
  punpcklbw mm2, mm6
  movd   mm1, [eax+%3+4]  ; cur pixels [4567]
  movd   mm3, [ecx+%3+4]  ; ref pixels [4567]

  punpcklbw mm1, mm6
  punpcklbw mm3, mm6

  psubw mm0, mm2          ; d[0..3] = cur - ref
  psubw mm1, mm3          ; d[4..7] = cur - ref

    ; now, go with the transform

    ; stage 1: combine d[i] with d[i+4]
  movq  mm7, mm0
  paddw mm0, mm1      ; [abcd] = d[i] + d[i+4]
  psubw mm7, mm1      ; [efgh] = d[i] - d[i+4]

    ; interleave sums and diffs so the next stage pairs the right lanes
  movq      mm1,mm0
  punpcklwd mm0,mm7   ; [aebf]
  punpckhwd mm1,mm7   ; [cgdh]

    ; stage 2
  movq  mm7,mm0
  paddw mm0,mm1       ; [ABCD] = [a+c, e+g, b+d, f+h]
  psubw mm7,mm1       ; [EFGH] = [a-c, e-g, b-d, f-h]

    ; interleave again (word interleave of mm0 and mm7)
  movq      mm1,mm0
  punpcklwd mm0,mm7   ; [AEBF]
  punpckhwd mm1,mm7   ; [CGDH]

    ; stage 3: lane 0 of mm0 ends up as the sum of all 8 differences
  movq  mm7,mm0
  paddw mm0,mm1       ; [0312]  (author's coefficient labels)
  psubw mm7,mm1       ; [7465]  (author's coefficient labels)

    ; only the low 8 bytes of each 16-byte slot are written
  movq [esp+%1], mm0
  movq [esp+%2], mm7
%endmacro

%macro HADAMARD_SAD_VPASS 2   ; %1:src/dst,  %2:SAD
    ; Vertical pass over 4 columns at once (one column per word lane):
    ;   - arg1 is an address expression; 8 quadwords are read from it with
    ;     a 16-byte stride (slots arg1+0*16 .. arg1+7*16),
    ;   - three butterfly stages are applied across the 8 rows,
    ;   - the absolute values of all 8 result registers are added, lane by
    ;     lane, to the running word accumulator loaded from arg2
    ;     (arg2 is a memory operand, e.g. [SAD]).
    ; Result  : mm6 = updated accumulator (4 x 16-bit lanes, wrapping adds).
    ;           It is NOT written back to arg2; the caller does that.
    ; Clobbers: mm0..mm7.
    ; WARNING : the spill area is always [esp+0*16], [esp+1*16], [esp+2*16],
    ;           whatever arg1 is. Those three slots are overwritten, so the
    ;           data stored at esp+0..2*16 must already have been consumed
    ;           (the callers in this file run the pass on 'esp' first).
  movq  mm0, [%1+0*16]
  movq  mm1, [%1+1*16]
  movq  mm2, [%1+2*16]
  movq  mm3, [%1+3*16]
  movq  mm4, [%1+4*16]
  movq  mm5, [%1+5*16]
  movq  mm6, [%1+6*16]
  movq  mm7, [%1+7*16]

    ; rows 0..3: stage 1 on (0,1),(2,3), then stage 2 on the sums and on the diffs
  BUTF2  mm0, mm1,  mm2, mm3
  BUTF2  mm1, mm3,  mm0, mm2

    ; rows 4..7: same two stages
  BUTF2  mm4, mm5,  mm6, mm7
  BUTF2  mm4, mm6,  mm5, mm7

    ; stage 3: pair each result of rows 0..3 with its counterpart of rows 4..7
  BUTF2  mm3, mm7,  mm0, mm4
  BUTF2  mm2, mm6,  mm1, mm5

    ; time to sum up the abs val of mm0..mm7
    ; -> make room for 3 regs
    ; (ADD_ABS needs mm5/mm7 as scratch and mm6 as the accumulator)

  movq  [esp+0*16], mm7   ; Spill
  movq  [esp+1*16], mm6   ; ...
  movq  [esp+2*16], mm5   ; ...

  movq mm6, %2            ; load the running accumulator
  ADD_ABS mm0,mm1
  ADD_ABS mm2,mm3
  movq  mm0, [esp+0*16]   ; reload spilled mm7
  movq  mm1, [esp+1*16]   ; reload spilled mm6
  movq  mm2, [esp+2*16]   ; reload spilled mm5
  ADD_ABS mm0, mm1
  ADD_ABS mm2, mm4        ; mm4 was never spilled
%endmacro

;//////////////////////////////////////////////////////////////////////

%define LOCAL_TMP_SIZE    16*16
%define SAD esp+LOCAL_TMP_SIZE

;----------------------------------------------------------------------
; int xvid_Hadamard_SAD_8x8_MMX(const uint8_t *Src1, const uint8_t *Src2, int BpS)
;
; Sum of absolute values of the 8x8 Hadamard transform of (Src1 - Src2).
; ABI  : 32-bit, arguments on the stack ([esp+4], [esp+8], [esp+12]).
; In   : Src1/Src2 = top-left pixel of each 8x8 block, BpS = bytes per row.
; Out  : eax = SAD of the transformed difference (0 .. 130560).
; Uses : eax, ecx, edx, mm0-mm7; ebp is saved and restored.
;        No 'emms' is executed here; that is left to the caller.
; Stack: LOCAL_TMP_SIZE bytes of 16-byte slots (16-byte aligned) plus one
;        8-byte accumulator at [SAD].
;
; Fix vs. the original: the four 16-bit lane accumulators each sum 16
; coefficients and can reach 65280, which still fits an unsigned word but
; not a signed one. The original collapsed them with pmaddwd (signed), so
; large sums came out negative (e.g. rows 0-6 differing by +255 and row 7
; by -255 gives 40800 in one lane -> returned -24736). The lanes are now
; zero-extended to dwords before being added.
;----------------------------------------------------------------------
align 16
xvid_Hadamard_SAD_8x8_MMX:  ; author measured 226c (before the collapse fix)
  mov eax,[esp+ 4] ; Src1
  mov ecx,[esp+ 8] ; Src2
  mov edx,[esp+12] ; BpS
  push ebp
  mov ebp, esp                      ; ebp = frame anchor used to restore esp
  lea esp, [esp-LOCAL_TMP_SIZE-16]  ; slots + room for the accumulator
  and esp, ~0xf    ; align to 16b

  pxor mm6, mm6    ; mm6 = 0: required by HADAMARD_SAD_HPASS
  movq [SAD], mm6  ; running word accumulator = 0

    ; horizontal pass, two rows per step:
    ; row k -> slot k (sum half) and slot 8+k (diff half)
  HADAMARD_SAD_HPASS 0*16, 8*16, 0
  HADAMARD_SAD_HPASS 1*16, 9*16, edx
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  HADAMARD_SAD_HPASS 2*16, 10*16, 0
  HADAMARD_SAD_HPASS 3*16, 11*16, edx
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  HADAMARD_SAD_HPASS 4*16, 12*16, 0
  HADAMARD_SAD_HPASS 5*16, 13*16, edx
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  HADAMARD_SAD_HPASS 6*16, 14*16, 0
  HADAMARD_SAD_HPASS 7*16, 15*16, edx

    ; vertical pass + abs-sum. Slots 0..7 must go first: the pass spills
    ; into [esp+0..2*16].
  HADAMARD_SAD_VPASS esp,      [SAD]
  movq [SAD], mm6    ; save intermediate SAD
  HADAMARD_SAD_VPASS esp+8*16, [SAD]

    ; mm6 = 4 unsigned 16-bit partial sums. Collapse them without
    ; sign-extension: widen to dwords, then add the four values.

  pxor      mm7, mm7
  movq      mm5, mm6
  punpcklwd mm6, mm7   ; lanes 0,1 zero-extended to dwords
  punpckhwd mm5, mm7   ; lanes 2,3 zero-extended to dwords
  paddd     mm6, mm5   ; [l0+l2, l1+l3]
  movq      mm7, mm6
  psrlq     mm6, 32
  mov esp, ebp
  paddd     mm6, mm7   ; low dword = l0+l1+l2+l3
  pop ebp
  movd eax, mm6
  ret

%undef LOCAL_TMP_SIZE
%undef SAD

;//////////////////////////////////////////////////////////////////////

%define LOCAL_TMP_SIZE    64*16
%define SAD esp+LOCAL_TMP_SIZE
%define SAD2 SAD+8

; Private helper: abs-sum of one transformed 8x8 block, added to the dword
; total kept at [SAD].
;   arg1   = byte offset (from esp) of the block's first slot; the block
;            occupies 16 consecutive 16-byte slots.
;   [SAD2] = scratch for the per-block word accumulator.
; Each word lane sums 16 coefficients of one block (at most 65280), so it
; fits an unsigned word; lanes are zero-extended before being added.
; Leaves mm6 = [SAD] (two dword partial totals). Clobbers mm0..mm7.
%macro HADAMARD_SAD_BLOCK8 1   ; %1: slot offset of the 8x8 block
  pxor mm6, mm6
  movq [SAD2], mm6                         ; per-block word accumulator = 0
  HADAMARD_SAD_VPASS esp+%1,      [SAD2]
  movq [SAD2], mm6                         ; save intermediate SAD
  HADAMARD_SAD_VPASS esp+%1+8*16, [SAD2]

  pxor      mm7, mm7
  movq      mm5, mm6
  punpcklwd mm6, mm7                       ; lanes 0,1 -> dwords (unsigned)
  punpckhwd mm5, mm7                       ; lanes 2,3 -> dwords (unsigned)
  paddd     mm6, mm5
  paddd     mm6, [SAD]                     ; add to the running dword total
  movq      [SAD], mm6
%endmacro

;----------------------------------------------------------------------
; int xvid_Hadamard_SAD_16x16_MMX(const uint8_t *Src1, const uint8_t *Src2, int BpS)
;
; Sum, over the four 8x8 quadrants of a 16x16 block, of the absolute values
; of the 8x8 Hadamard transform of (Src1 - Src2).
; ABI  : 32-bit, arguments on the stack ([esp+4], [esp+8], [esp+12]).
; In   : Src1/Src2 = top-left pixel of each 16x16 block, BpS = bytes per row.
; Out  : eax = SAD of the transformed difference (0 .. 522240).
; Uses : eax, ecx, edx, mm0-mm7; ebp is saved and restored.
;        No 'emms' is executed here; that is left to the caller.
; Stack: LOCAL_TMP_SIZE bytes of 16-byte slots (16-byte aligned) plus two
;        8-byte accumulators at [SAD] and [SAD2].
;
; Fix vs. the original: the original accumulated two 8x8 blocks per 16-bit
; lane (up to 2*65280, which wraps) and collapsed the lanes with pmaddwd,
; which treats them as signed. High-energy blocks therefore returned wrong
; (possibly negative) values. Each 8x8 block is now folded separately into
; a 32-bit total using unsigned widening. Results are unchanged for inputs
; that did not overflow before.
;----------------------------------------------------------------------
align 16
xvid_Hadamard_SAD_16x16_MMX:  ; author measured 831c (before the overflow fix)
  mov eax,[esp+ 4] ; Src1
  mov ecx,[esp+ 8] ; Src2
  mov edx,[esp+12] ; BpS
  push ebp
  mov ebp, esp                      ; ebp = frame anchor used to restore esp
  lea esp, [esp-LOCAL_TMP_SIZE-16]  ; slots + room for both accumulators
  and esp, ~0xf    ; align to 16b

  pxor mm6, mm6    ; mm6 = 0: required by HADAMARD_SAD_HPASS
  movq [SAD], mm6  ; dword total = 0

    ; horizontal pass, rows 0..7.
    ; left half  -> slots  0..7  (sums) /  8..15 (diffs)
    ; right half -> slots 16..23 (sums) / 24..31 (diffs)
  HADAMARD_SAD_HPASS  0*16,  8*16, 0
  HADAMARD_SAD_HPASS 16*16, 24*16, 8
  HADAMARD_SAD_HPASS  1*16,  9*16, edx
  HADAMARD_SAD_HPASS 17*16, 25*16, edx+8
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  HADAMARD_SAD_HPASS  2*16, 10*16, 0
  HADAMARD_SAD_HPASS 18*16, 26*16, 8
  HADAMARD_SAD_HPASS  3*16, 11*16, edx
  HADAMARD_SAD_HPASS 19*16, 27*16, edx+8
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  HADAMARD_SAD_HPASS  4*16, 12*16, 0
  HADAMARD_SAD_HPASS 20*16, 28*16, 8
  HADAMARD_SAD_HPASS  5*16, 13*16, edx
  HADAMARD_SAD_HPASS 21*16, 29*16, edx+8
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  HADAMARD_SAD_HPASS  6*16, 14*16, 0
  HADAMARD_SAD_HPASS 22*16, 30*16, 8
  HADAMARD_SAD_HPASS  7*16, 15*16, edx
  HADAMARD_SAD_HPASS 23*16, 31*16, edx+8
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]

    ; horizontal pass, rows 8..15.
    ; left half  -> slots 32..39 (sums) / 40..47 (diffs)
    ; right half -> slots 48..55 (sums) / 56..63 (diffs)
  HADAMARD_SAD_HPASS 32*16, 40*16, 0
  HADAMARD_SAD_HPASS 48*16, 56*16, 8
  HADAMARD_SAD_HPASS 33*16, 41*16, edx
  HADAMARD_SAD_HPASS 49*16, 57*16, edx+8
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  HADAMARD_SAD_HPASS 34*16, 42*16, 0
  HADAMARD_SAD_HPASS 50*16, 58*16, 8
  HADAMARD_SAD_HPASS 35*16, 43*16, edx
  HADAMARD_SAD_HPASS 51*16, 59*16, edx+8
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  HADAMARD_SAD_HPASS 36*16, 44*16, 0
  HADAMARD_SAD_HPASS 52*16, 60*16, 8
  HADAMARD_SAD_HPASS 37*16, 45*16, edx
  HADAMARD_SAD_HPASS 53*16, 61*16, edx+8
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  HADAMARD_SAD_HPASS 38*16, 46*16, 0
  HADAMARD_SAD_HPASS 54*16, 62*16, 8
  HADAMARD_SAD_HPASS 39*16, 47*16, edx
  HADAMARD_SAD_HPASS 55*16, 63*16, edx+8

    ; vertical pass + abs-sum, one 8x8 quadrant at a time.
    ; The quadrant at slot 0 must go first: the vertical pass spills into
    ; [esp+0..2*16].
  HADAMARD_SAD_BLOCK8  0*16   ; top-left
  HADAMARD_SAD_BLOCK8 16*16   ; top-right
  HADAMARD_SAD_BLOCK8 32*16   ; bottom-left
  HADAMARD_SAD_BLOCK8 48*16   ; bottom-right

    ; mm6 = [SAD] = two dword partial totals. Add them.

  movq    mm7, mm6
  psrlq   mm6, 32
  mov esp, ebp
  paddd   mm7, mm6
  pop ebp
  movd    eax, mm7
  ret

%undef LOCAL_TMP_SIZE
%undef SAD
%undef SAD2

;//////////////////////////////////////////////////////////////////////
-------------- next part --------------
A non-text attachment was scrubbed...
Name: skl_hadamard.c
Type: text/x-c
Size: 5309 bytes
Desc: not available
Url : http://edu.bnhof.de/pipermail/xvid-devel/attachments/20030227/1477908d/skl_hadamard.bin