[XviD-devel] Re: xvid_fdct_sse + qpel ASM
skal
xvid-devel@xvid.org
20 Jan 2003 16:17:56 +0100
--=-ag4dD7pukXRGF7iP/4Fh
Content-Type: text/plain
Content-Transfer-Encoding: 7bit
Re,
Ok, ok, I was curious, so I've quickly hacked up
the MMX 'punpck' soup mentioned in my previous
mail. Here's what to add to fdct_sse.asm
for the MMX fdct. It's 10% slower than SSE
(but still 30% faster than the current XVID one,
it seems ;)
bye!
Skal
--=-ag4dD7pukXRGF7iP/4Fh
Content-Disposition: attachment; filename=fdct_mmx.asm
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain; name=fdct_mmx.asm; charset=ISO-8859-1
; Declare the xvid_fdct_mmx entry point through the project's 'cglobal' macro
; (defined outside this fragment; presumably it exports the symbol -- confirm).
cglobal xvid_fdct_mmx
;//////////////////////////////////////////////////////////////////////
;// fMTX_MULT_MMX (~26c)
;//
;// MMX-only version (no 'pshufw'; ~10% overall slower than SSE).
;//
;// Transforms, in place, the eight 16-bit words of row number 'src',
;// located at [ecx + src*16], by multiplying them with a coefficient
;// table and then rounding and scaling the result.
;//
;// Parameters:
;//   1 = src     : row index; the row occupies [ecx+src*16 .. +15]
;//   2 = Coeffs  : address of the coefficient table; eight qwords are
;//                 read, at byte offsets 0, 8, ..., 56
;//   3 = rounder : address of the qword added to outputs 0..3
;//   4 = rounder : address of the qword added to outputs 4..7
;//   5 = optional, ignored; accepted only so that callers passing a
;//       fifth argument keep assembling
;//
;// In     : ecx = base address of the rows
;// Out    : row 'src' overwritten with the transformed words
;// Clobb. : mm0-mm7 (ecx is preserved)
;//
;// Notation in the comments below: words are listed low word first.
;//   a_i = x[i] + x[7-i]   (saturated)
;//   b_i = x[i] - x[7-i]   (saturated)
;//////////////////////////////////////////////////////////////////////

%macro fMTX_MULT_MMX 4-5

        ; ---- load the row and build the word-reversed upper half ----
        movd      mm1, [ecx+%1*16+8+4]  ; [67..]
        movq      mm0, [ecx+%1*16+0]    ; mm0 = [0123]
        movq      mm7, mm0
        punpcklwd mm1, [ecx+%1*16+8]    ; [6475]
        movq      mm2, mm1
        psrlq     mm1, 32               ; [75..]
        punpcklwd mm1, mm2              ; [7654]

        ; ---- butterfly: sums and differences ----
        paddsw    mm0, mm1              ; mm0 = [a0 a1 a2 a3]
        psubsw    mm7, mm1              ; mm7 = [b0 b1 b2 b3]

        movq      mm1, mm0
        punpckldq mm0, mm7              ; mm0 = [a0 a1 b0 b1]
        punpckhdq mm1, mm7              ; mm1 = [a2 a3 b2 b3]

        ; ---- multiply-accumulate against the coefficient table ----
        ; Table loads are interleaved with the multiplies; keep the order.
        movq      mm2, qword [%2+ 0]    ; [ M00 M01 M16 M17]
        movq      mm3, qword [%2+ 8]    ; [ M02 M03 M18 M19]
        pmaddwd   mm2, mm0              ; [a0.M00+a1.M01 | b0.M16+b1.M17]
        movq      mm4, qword [%2+16]    ; [ M04 M05 M20 M21]
        pmaddwd   mm3, mm1              ; [a2.M02+a3.M03 | b2.M18+b3.M19]
        movq      mm5, qword [%2+24]    ; [ M06 M07 M22 M23]
        pmaddwd   mm4, mm0              ; [a0.M04+a1.M05 | b0.M20+b1.M21]
        movq      mm6, qword [%2+32]    ; [ M08 M09 M24 M25]
        pmaddwd   mm5, mm1              ; [a2.M06+a3.M07 | b2.M22+b3.M23]
        movq      mm7, qword [%2+40]    ; [ M10 M11 M26 M27]
        pmaddwd   mm6, mm0              ; [a0.M08+a1.M09 | b0.M24+b1.M25]
        paddd     mm2, mm3              ; [ out0 | out1 ]
        pmaddwd   mm7, mm1              ; [a2.M10+a3.M11 | b2.M26+b3.M27]
        psrad     mm2, 16
        pmaddwd   mm0, qword [%2+48]    ; [a0.M12+a1.M13 | b0.M28+b1.M29]
        paddd     mm4, mm5              ; [ out2 | out3 ]
        pmaddwd   mm1, qword [%2+56]    ; [a2.M14+a3.M15 | b2.M30+b3.M31]
        psrad     mm4, 16

        paddd     mm6, mm7              ; [ out4 | out5 ]
        psrad     mm6, 16
        paddd     mm0, mm1              ; [ out6 | out7 ]
        psrad     mm0, 16

        ; ---- pack to words, round, scale and store back in place ----
        packssdw  mm2, mm4              ; [ out0|out1|out2|out3 ]
        paddsw    mm2, [%3]             ; Round
        packssdw  mm6, mm0              ; [ out4|out5|out6|out7 ]
        paddsw    mm6, [%4]             ; Round
        psraw     mm2, 4                ; => [-2048, 2047]
        psraw     mm6, 4

        movq      [ecx+%1*16+0], mm2
        movq      [ecx+%1*16+8], mm6
%endmacro
;-----------------------------------------------------------------------
; xvid_fdct_mmx                                                  (~269c)
;
; In    : [esp+4] = address of the data to transform; eight rows of
;                   16 bytes each are rewritten in place by the row pass
;                   (presumably an 8x8 block of 16-bit samples -- confirm
;                   against the C prototype)
; Clobb.: ecx, mm0-mm7 from the row pass; fLLM_PASS is defined elsewhere
;         and may clobber more -- confirm
;
; NOTE(review): no 'emms' is issued before returning; presumably the
; caller is responsible for it -- confirm.
;-----------------------------------------------------------------------
align 16
xvid_fdct_mmx:
        mov     ecx, [esp+4]            ; ecx = first stack argument

        ; Two fLLM_PASS invocations, 8 bytes apart (macro defined elsewhere;
        ; presumably the column pass over each 4-word-wide half -- confirm).
        fLLM_PASS       ecx+0, 3
        fLLM_PASS       ecx+8, 3

        ; Row pass: one macro expansion per row, each with its own
        ; coefficient table and pair of rounders.
        ;               row  table  rounder(0..3)  rounder(4..7)
        fMTX_MULT_MMX   0,   fTab1, Fdct_Rnd0,     Fdct_Rnd0
        fMTX_MULT_MMX   1,   fTab2, Fdct_Rnd2,     Fdct_Rnd1
        fMTX_MULT_MMX   2,   fTab3, Fdct_Rnd1,     Fdct_Rnd1
        fMTX_MULT_MMX   3,   fTab4, Fdct_Rnd1,     Fdct_Rnd1
        fMTX_MULT_MMX   4,   fTab1, Fdct_Rnd0,     Fdct_Rnd0
        fMTX_MULT_MMX   5,   fTab4, Fdct_Rnd1,     Fdct_Rnd1
        fMTX_MULT_MMX   6,   fTab3, Fdct_Rnd1,     Fdct_Rnd1
        fMTX_MULT_MMX   7,   fTab2, Fdct_Rnd1,     Fdct_Rnd1

        ret
;//////////////////////////////////////////////////////////////////////
--=-ag4dD7pukXRGF7iP/4Fh--