[XviD-devel] Quality optimization
skal
skal at planet-d.net
Thu Feb 27 16:30:45 CET 2003
Hi,
On Wed, 2003-02-26 at 19:24, Christoph Lampert wrote:
> IDCT is
>
> PLAINC - 1.395 usec (<- slower than fDCT?)
most of the time, yes, because iDCT needs final [-256,255] clipping...
> MMX - 0.219 usec
> MMXEXT - 0.199 usec
> SSE2 - 0.219 usec
> 3DNOW - 1.247 usec
> 3DNOWE - 0.184 usec
>
> whereas Hadamard is
>
> PLAINC - 0.549 usec
> MMXEXT - 0.089 usec
>
> 0.089 is about the time that sad16() with MMXEXT needs, too,
> so a search routine based on hadamard+sad should not slow things down too
> much.
That's not that easy :)
In fact, having the Hadamard transform done doesn't mean
you're done with the hard work. Taking the abs values with
MMX is painful. For SSE, we have the mighty 'psadbw' instr.,
but it works on 8bits data, whereas the output of Hadamard
is 11bits (yes, it's scaled by 8, not 64!:). So? Should we
descale 11->8bits? Another idea would be to multiply the
Hadamard output by a pseudo-quant matrix that mimics the real
quantizers (and the missing cosines, maybe)... Dunno.
Anyway, here's a Hadamard_SAD for 8x8 or 16x16 byte input,
in replacement for SAD. I'm not quite satisfied with it
mainly because of the above (and not just because it's
8 times slower than pure SAD :)
bye!
Skal
I
>
>
>
>
> Btw. what we would need in the end is a SATD function (SAD of
> transformed), so either, we would have to do
>
> SAD ( Hadamard(Cur) , Hadamard(Ref) ) (*)
>
> with usual sad-routine or
>
> sum( abs( Hadamard( Cur - Ref ) ) ) (**)
>
> In theory these should be identical (Hadamard is linear), but maybe they
> are not...
>
> Anyway, would it be faster to combine these steps into a larger routine,
> or rather not? Again, I would believe it would, because for (**) the
> result of Hadamard doesn't have to be saved, only summed up, but of course
> I'm no expert...
-------------- next part --------------
; C-side prototypes of the two entry points exported below. Both read
; their three arguments from the stack at [esp+4], [esp+8] and [esp+12].
; int xvid_Hadamard_SAD_8x8_MMX(const uint8_t *Src1, const uint8_t *Src2, int BpS)
; int xvid_Hadamard_SAD_16x16_MMX(const uint8_t *Src1, const uint8_t *Src2, int BpS)
; NOTE(review): 'cglobal' is not defined in this file; presumably it is
; provided by the project's shared NASM macros -- confirm before building.
cglobal xvid_Hadamard_SAD_8x8_MMX
cglobal xvid_Hadamard_SAD_16x16_MMX
;//////////////////////////////////////////////////////////////////////
section .data
align 16 ; keep the constant below on a 16-byte boundary so a 64-bit MMX load never straddles a cache line
One: times 8 dw 1 ; eight words of 1; 'pmaddwd reg, [One]' adds adjacent (signed) word pairs into dwords
section .text
;//////////////////////////////////////////////////////////////////////
;// Hadamard SAD
%macro BUTF2 4 ; %1=a, %2=b, %3=c, %4=d  (four distinct MMX registers)
    ; Two independent in-place butterflies on 4x16-bit lanes:
    ;   (%1, %2) <- (a-b, a+b)      (%3, %4) <- (c-d, c+d)
    ; The difference is formed as 2a-(a+b), so no scratch register is needed.

    ; -- first pair -------------------------------------------------
    paddw   %2, %1          ; %2 = a+b
    paddw   %1, %1          ; %1 = 2a
    psubw   %1, %2          ; %1 = 2a-(a+b) = a-b

    ; -- second pair ------------------------------------------------
    paddw   %4, %3          ; %4 = c+d
    paddw   %3, %3          ; %3 = 2c
    psubw   %3, %4          ; %3 = 2c-(c+d) = c-d
%endmacro
%macro ADD_ABS 2 ; %1, %2: input regs (must not be mm5, mm6 or mm7)
    ; mm6 += |%1| + |%2|, lane by lane on four 16-bit words.
    ; Uses the identity |x| = (x ^ m) - m, where m is 0xFFFF for negative
    ; lanes and 0 otherwise. Clobbers mm7 (first operand) and mm5 (second).

    ; -- first operand, scratch = mm7 -------------------------------
    pxor    mm7, mm7
    pcmpgtw mm7, %1         ; m = (0 > x) ? 0xFFFF : 0
    psubw   mm6, mm7        ; acc -= m   (adds 1 for each negative lane)
    pxor    mm7, %1         ; x ^ m      (one's complement of negative lanes)
    paddw   mm6, mm7        ; acc += x ^ m

    ; -- second operand, scratch = mm5 ------------------------------
    pxor    mm5, mm5
    pcmpgtw mm5, %2
    psubw   mm6, mm5
    pxor    mm5, %2
    paddw   mm6, mm5
%endmacro
;//////////////////////////////////////////////////////////////////////
%macro HADAMARD_SAD_HPASS 3 ; %1: dst offset 1, %2: dst offset 2, %3: src offset
    ; Horizontal pass for one row of eight pixels.
    ;   in : eax = current-block pointer, ecx = reference-block pointer,
    ;        mm6 = 0 (zero source for the byte->word unpack; left untouched)
    ;   out: [esp+%1] and [esp+%2] each receive four 16-bit coefficients of
    ;        the 8-point transform of (cur - ref) for that row
    ;   clobbers: mm0, mm1, mm2, mm3, mm7

    ; fetch both rows as two 4-byte halves each
    movd        mm0, [eax+%3]       ; cur[0..3]
    movd        mm1, [eax+%3+4]     ; cur[4..7]
    movd        mm2, [ecx+%3]       ; ref[0..3]
    movd        mm3, [ecx+%3+4]     ; ref[4..7]

    ; zero-extend the bytes to words, then take the signed difference
    punpcklbw   mm0, mm6
    punpcklbw   mm1, mm6
    punpcklbw   mm2, mm6
    punpcklbw   mm3, mm6
    psubw       mm0, mm2            ; x[0..3]
    psubw       mm1, mm3            ; x[4..7]

    ; stage 1: butterflies between x[i] and x[i+4]
    movq        mm7, mm0
    paddw       mm0, mm1            ; [a b c d] = x[i] + x[i+4]
    psubw       mm7, mm1            ; [e f g h] = x[i] - x[i+4]

    ; stage 2: interleave so the next partners sit in the same lane
    movq        mm1, mm0
    punpcklwd   mm0, mm7            ; [a e b f]
    punpckhwd   mm1, mm7            ; [c g d h]
    movq        mm7, mm0
    paddw       mm0, mm1            ; [A B C D] = [a+c e+g b+d f+h]
    psubw       mm7, mm1            ; [E F G H] = [a-c e-g b-d f-h]

    ; stage 3: same interleave-and-butterfly once more
    movq        mm1, mm0
    punpcklwd   mm0, mm7            ; [A E B F]
    punpckhwd   mm1, mm7            ; [C G D H]
    movq        mm7, mm0
    paddw       mm0, mm1            ; [A+C E+G B+D F+H]
    psubw       mm7, mm1            ; [A-C E-G B-D F-H]

    ; park the two coefficient quads in the scratch area
    movq        [esp+%2], mm7
    movq        [esp+%1], mm0
%endmacro
; Vertical pass over one group of eight scratch rows, followed by the
; accumulation of absolute values.
;   %1 : address expression of the first row; rows are read at a 16-byte
;        stride and only 8 bytes (four words) of each are used. Nothing is
;        written back to %1 itself.
;   %2 : memory operand holding the running 4x16-bit accumulator.
;   out: mm6 = %2 + per-lane sum of |coefficient| over the eight transformed
;        rows. The result is NOT stored back to %2; the caller does that.
;   clobbers: mm0..mm7, and the scratch slots [esp+0*16], [esp+1*16] and
;        [esp+2*16] (always esp-relative, whatever %1 is).
%macro HADAMARD_SAD_VPASS 2 ; %1:src/dst, %2:SAD
; load the eight rows (four coefficients each)
movq mm0, [%1+0*16]
movq mm1, [%1+1*16]
movq mm2, [%1+2*16]
movq mm3, [%1+3*16]
movq mm4, [%1+4*16]
movq mm5, [%1+5*16]
movq mm6, [%1+6*16]
movq mm7, [%1+7*16]
; three butterfly stages; each BUTF2 performs two sum/difference pairs.
; The eight outputs end up spread over mm0..mm7 in no particular order,
; which is fine because only their absolute values are summed.
BUTF2 mm0, mm1, mm2, mm3
BUTF2 mm1, mm3, mm0, mm2
BUTF2 mm4, mm5, mm6, mm7
BUTF2 mm4, mm6, mm5, mm7
BUTF2 mm3, mm7, mm0, mm4
BUTF2 mm2, mm6, mm1, mm5
; time to sum up the abs val of mm0..mm7
; -> make room for 3 regs
; (ADD_ABS needs mm6 as accumulator and mm5/mm7 as scratch)
movq [esp+0*16], mm7 ; Spill
movq [esp+1*16], mm6 ; ...
movq [esp+2*16], mm5 ; ...
movq mm6, %2 ; fetch the running accumulator
ADD_ABS mm0,mm1
ADD_ABS mm2,mm3
; reload the three spilled outputs into registers that are now free
movq mm0, [esp+0*16]
movq mm1, [esp+1*16]
movq mm2, [esp+2*16]
ADD_ABS mm0, mm1
ADD_ABS mm2, mm4 ; mm4 was never spilled: ADD_ABS does not touch it
%endmacro
;//////////////////////////////////////////////////////////////////////
;----------------------------------------------------------------------
; int xvid_Hadamard_SAD_8x8_MMX(const uint8_t *Src1, const uint8_t *Src2, int BpS)
;
; Sum of absolute values of the (unnormalised) 8x8 Hadamard transform of
; Src1 - Src2.
;   in : [esp+4] = Src1, [esp+8] = Src2, [esp+12] = BpS (byte distance
;        between consecutive rows of both sources)
;   out: eax = sum
;   clobbers: eax, ecx, edx, mm0..mm7 (ebp is saved and restored).
;   No 'emms' is executed here, so the MMX state is left active on return.
;
; Scratch layout (16-byte aligned, addressed off esp):
;   slots 0..7  : first  four horizontal coefficients of rows 0..7
;   slots 8..15 : second four horizontal coefficients of rows 0..7
;   [SAD]       : 4x16-bit accumulator, saved between the two vertical passes
;
; Each 16-bit lane of the accumulator collects 16 coefficient magnitudes.
; Their sum is bounded by 4*64*255 = 65280, so it fits an UNSIGNED word but
; may exceed 32767 (e.g. rows 0..6 differing by +255 and row 7 by -255 puts
; 40800 in one lane). The final reduction therefore zero-extends the lanes
; instead of using the signed 'pmaddwd'.
;----------------------------------------------------------------------
%define LOCAL_TMP_SIZE 16*16
%define SAD esp+LOCAL_TMP_SIZE
align 16
xvid_Hadamard_SAD_8x8_MMX: ; ~226c (author's figure, measured with the original pmaddwd reduction)
    mov     eax, [esp+ 4]               ; Src1
    mov     ecx, [esp+ 8]               ; Src2
    mov     edx, [esp+12]               ; BpS

    ; carve out an aligned scratch area; ebp remembers the original esp
    push    ebp
    mov     ebp, esp
    lea     esp, [esp-LOCAL_TMP_SIZE-16]
    and     esp, ~0xf                   ; align to 16b

    pxor    mm6, mm6                    ; zero: needed by every HPASS below
    movq    [SAD], mm6                  ; accumulator = 0

    ; horizontal pass, two rows per step
    HADAMARD_SAD_HPASS 0*16,  8*16, 0
    HADAMARD_SAD_HPASS 1*16,  9*16, edx
    lea     eax, [eax+2*edx]
    lea     ecx, [ecx+2*edx]
    HADAMARD_SAD_HPASS 2*16, 10*16, 0
    HADAMARD_SAD_HPASS 3*16, 11*16, edx
    lea     eax, [eax+2*edx]
    lea     ecx, [ecx+2*edx]
    HADAMARD_SAD_HPASS 4*16, 12*16, 0
    HADAMARD_SAD_HPASS 5*16, 13*16, edx
    lea     eax, [eax+2*edx]
    lea     ecx, [ecx+2*edx]
    HADAMARD_SAD_HPASS 6*16, 14*16, 0
    HADAMARD_SAD_HPASS 7*16, 15*16, edx

    ; vertical pass + |.| accumulation, four columns at a time
    HADAMARD_SAD_VPASS esp, [SAD]
    movq    [SAD], mm6                  ; save intermediate SAD
    HADAMARD_SAD_VPASS esp+8*16, [SAD]

    ; mm6 = four unsigned 16-bit partial sums [w0 w1 w2 w3].
    ; Widen them to dwords without sign extension, then add everything up.
    movq    mm7, mm6
    pslld   mm6, 16
    psrld   mm6, 16                     ; [w0, w2] zero-extended to dwords
    psrld   mm7, 16                     ; [w1, w3] zero-extended to dwords
    paddd   mm6, mm7                    ; [w0+w1, w2+w3]
    movq    mm7, mm6
    psrlq   mm6, 32                     ; bring the upper dword down
    mov     esp, ebp                    ; release the scratch area
    paddd   mm6, mm7
    pop     ebp
    movd    eax, mm6                    ; return value
    ret
%undef LOCAL_TMP_SIZE
%undef SAD
;//////////////////////////////////////////////////////////////////////
;----------------------------------------------------------------------
; int xvid_Hadamard_SAD_16x16_MMX(const uint8_t *Src1, const uint8_t *Src2, int BpS)
;
; Sum, over the four 8x8 quadrants of a 16x16 block, of the absolute values
; of the (unnormalised) 8x8 Hadamard transform of Src1 - Src2.
;   in : [esp+4] = Src1, [esp+8] = Src2, [esp+12] = BpS (byte distance
;        between consecutive rows of both sources)
;   out: eax = sum
;   clobbers: eax, ecx, edx, mm0..mm7 (ebp is saved and restored).
;   No 'emms' is executed here, so the MMX state is left active on return.
;
; Scratch layout (16-byte aligned, addressed off esp), in 16-byte slots:
;   slots  0..15 : top-left     quadrant (two groups of eight rows)
;   slots 16..31 : top-right    quadrant
;   slots 32..47 : bottom-left  quadrant
;   slots 48..63 : bottom-right quadrant
;   [SAD]        : 2x32-bit running total
;   [SAD2]       : 4x16-bit accumulator for the quadrant being processed
;
; Within ONE quadrant a 16-bit lane sums 16 magnitudes, bounded by
; 4*64*255 = 65280, which fits an unsigned word. Two quadrants would not
; fit, and 'pmaddwd' would misread anything above 32767 as negative, so
; every quadrant is zero-extended into the 32-bit total as soon as its two
; vertical passes are done.
;----------------------------------------------------------------------
%define LOCAL_TMP_SIZE 64*16
%define SAD esp+LOCAL_TMP_SIZE
%define SAD2 SAD+8

; %1 = index of the quadrant's first scratch slot.
; Runs both vertical passes of that quadrant and adds its contribution to
; the 32-bit total. On exit mm6 holds the updated total (same as [SAD]).
; The quadrant starting at slot 0 must be processed first: the vertical
; pass spills into slots 0..2.
%macro HADAMARD_SAD_16x16_QUADRANT 1
    pxor    mm7, mm7
    movq    [SAD2], mm7                 ; fresh 16-bit accumulator
    HADAMARD_SAD_VPASS esp+(%1)*16, [SAD2]
    movq    [SAD2], mm6                 ; save intermediate SAD
    HADAMARD_SAD_VPASS esp+(%1+8)*16, [SAD2]
    ; mm6 = [w0 w1 w2 w3], unsigned words -> widen and fold into the total
    movq    mm7, mm6
    pslld   mm6, 16
    psrld   mm6, 16                     ; [w0, w2] as dwords
    psrld   mm7, 16                     ; [w1, w3] as dwords
    paddd   mm6, mm7
    paddd   mm6, [SAD]
    movq    [SAD], mm6
%endmacro

align 16
xvid_Hadamard_SAD_16x16_MMX: ; ~831c (author's figure, measured with the original pmaddwd reduction)
    mov     eax, [esp+ 4]               ; Src1
    mov     ecx, [esp+ 8]               ; Src2
    mov     edx, [esp+12]               ; BpS

    ; carve out an aligned scratch area; ebp remembers the original esp
    push    ebp
    mov     ebp, esp
    lea     esp, [esp-LOCAL_TMP_SIZE-16]
    and     esp, ~0xf                   ; align to 16b

    pxor    mm6, mm6                    ; zero: needed by every HPASS below
    movq    [SAD], mm6                  ; 32-bit total = 0

    ; ---- horizontal pass, rows 0..7 (left half, then right half) ----
    HADAMARD_SAD_HPASS  0*16,  8*16, 0
    HADAMARD_SAD_HPASS 16*16, 24*16, 8
    HADAMARD_SAD_HPASS  1*16,  9*16, edx
    HADAMARD_SAD_HPASS 17*16, 25*16, edx+8
    lea     eax, [eax+2*edx]
    lea     ecx, [ecx+2*edx]
    HADAMARD_SAD_HPASS  2*16, 10*16, 0
    HADAMARD_SAD_HPASS 18*16, 26*16, 8
    HADAMARD_SAD_HPASS  3*16, 11*16, edx
    HADAMARD_SAD_HPASS 19*16, 27*16, edx+8
    lea     eax, [eax+2*edx]
    lea     ecx, [ecx+2*edx]
    HADAMARD_SAD_HPASS  4*16, 12*16, 0
    HADAMARD_SAD_HPASS 20*16, 28*16, 8
    HADAMARD_SAD_HPASS  5*16, 13*16, edx
    HADAMARD_SAD_HPASS 21*16, 29*16, edx+8
    lea     eax, [eax+2*edx]
    lea     ecx, [ecx+2*edx]
    HADAMARD_SAD_HPASS  6*16, 14*16, 0
    HADAMARD_SAD_HPASS 22*16, 30*16, 8
    HADAMARD_SAD_HPASS  7*16, 15*16, edx
    HADAMARD_SAD_HPASS 23*16, 31*16, edx+8
    lea     eax, [eax+2*edx]
    lea     ecx, [ecx+2*edx]

    ; ---- horizontal pass, rows 8..15 ----
    HADAMARD_SAD_HPASS 32*16, 40*16, 0
    HADAMARD_SAD_HPASS 48*16, 56*16, 8
    HADAMARD_SAD_HPASS 33*16, 41*16, edx
    HADAMARD_SAD_HPASS 49*16, 57*16, edx+8
    lea     eax, [eax+2*edx]
    lea     ecx, [ecx+2*edx]
    HADAMARD_SAD_HPASS 34*16, 42*16, 0
    HADAMARD_SAD_HPASS 50*16, 58*16, 8
    HADAMARD_SAD_HPASS 35*16, 43*16, edx
    HADAMARD_SAD_HPASS 51*16, 59*16, edx+8
    lea     eax, [eax+2*edx]
    lea     ecx, [ecx+2*edx]
    HADAMARD_SAD_HPASS 36*16, 44*16, 0
    HADAMARD_SAD_HPASS 52*16, 60*16, 8
    HADAMARD_SAD_HPASS 37*16, 45*16, edx
    HADAMARD_SAD_HPASS 53*16, 61*16, edx+8
    lea     eax, [eax+2*edx]
    lea     ecx, [ecx+2*edx]
    HADAMARD_SAD_HPASS 38*16, 46*16, 0
    HADAMARD_SAD_HPASS 54*16, 62*16, 8
    HADAMARD_SAD_HPASS 39*16, 47*16, edx
    HADAMARD_SAD_HPASS 55*16, 63*16, edx+8

    ; ---- vertical pass + |.| accumulation, one quadrant at a time ----
    HADAMARD_SAD_16x16_QUADRANT  0      ; top-left (must come first)
    HADAMARD_SAD_16x16_QUADRANT 16      ; top-right
    HADAMARD_SAD_16x16_QUADRANT 32      ; bottom-left
    HADAMARD_SAD_16x16_QUADRANT 48      ; bottom-right

    ; mm6 = 32-bit total as two dwords. Add them together.
    movq    mm7, mm6
    psrlq   mm6, 32                     ; bring the upper dword down
    mov     esp, ebp                    ; release the scratch area
    paddd   mm7, mm6
    pop     ebp
    movd    eax, mm7                    ; return value
    ret
%undef LOCAL_TMP_SIZE
%undef SAD
%undef SAD2
;//////////////////////////////////////////////////////////////////////
-------------- next part --------------
A non-text attachment was scrubbed...
Name: skl_hadamard.c
Type: text/x-c
Size: 5309 bytes
Desc: not available
Url : http://edu.bnhof.de/pipermail/xvid-devel/attachments/20030227/1477908d/skl_hadamard.bin
More information about the XviD-devel
mailing list