[XviD-devel] Adding SSE2 asm code for a color space transforming function
Xwen.Kong
konsunwin at gmail.com
Fri Jun 19 08:46:14 CEST 2009
Hi all. The functions for color space transforming only use MMX
or XMM asm, e.g. yv12_to_yuyv = yv12_to_yuyv_mmx in xvid.c. I added SSE2
code for this one in xvidcore-1.2.1 as follows:
1. Add "yv12_to_yuyv = yv12_to_yuyv_sse2;" after
"if ((cpu_flags & XVID_CPU_SSE2)) {" in xvid.c.
2. Add the declaration "packedFunc yv12_to_yuyv_sse2;" after
"#if defined(ARCH_IS_IA32) || defined(ARCH_IS_X86_64)" in
image/image_h/colorspace.h.
3. Add the following code to
image/image_asm/colorspace_yuyv_mmx.asm, before
"%ifidn __OUTPUT_FORMAT__,elf":
;-----------------------------------------------------------------------------
; yv12_to_yuyv_sse2 -- planar Y/U/V (4:2:0) to packed YUYV, two rows per pass
;
; Target: 32-bit x86, NASM syntax, all arguments on the stack.
;
; Argument offsets relative to the return address:
;    +4  x_ptr      destination (packed YUYV)
;    +8  x_stride   destination stride in bytes
;   +12  y          luma plane
;   +16  u          chroma plane, lands in byte 1 of every 4-byte output group
;   +20  v          chroma plane, lands in byte 3 of every 4-byte output group
;   +24  y_stride   luma stride in bytes
;   +28  uv_stride  chroma stride in bytes
;                   NOTE(review): this slot was never read by the previous
;                   version; it is taken to be uv_stride because it sits
;                   between y_stride and width -- confirm against packedFunc.
;   +32  width      in pixels
;   +36  height     in rows
;   NOTE(review): a trailing vflip argument, if packedFunc has one, is still
;   not honoured here -- confirm whether callers rely on it.
;
; Behaviour:
;   * Each output row pair shares one chroma row.
;   * Whole 32-pixel blocks are converted with SSE2; the remaining
;     (width % 32) pixels are converted two at a time by a scalar tail, so
;     the width no longer has to be a multiple of 32.
;   * An odd width is rounded up to the next even value, an odd height to the
;     next even value (one extra pixel / row is read and written).
;   * width <= 0 or height <= 0 returns immediately.
;   * Row-to-row pointer steps are derived from the stride arguments instead
;     of assuming a fixed padding of the source planes.
;
; Clobbers: eax, ecx, edx, xmm0-xmm7, flags.
; Preserves: ebx, esi, edi, ebp (callee-saved in the 32-bit convention).
;
; Frame after the prologue (offsets from esp):
;   [esp +  0]  pixels left in the scalar tail of the current row pair
;   [esp +  4]  rows left to convert
;   [esp +  8]  saved ebp, edi, esi, ebx (16 bytes)
;   [esp + 24]  return address, followed by the arguments listed above
;-----------------------------------------------------------------------------
cglobal yv12_to_yuyv_sse2
ALIGN SECTION_ALIGN
yv12_to_yuyv_sse2:
        push    ebx                             ; callee-saved: must survive the call
        push    esi
        push    edi
        push    ebp
        sub     esp, 8                          ; two dword locals (see frame layout)

        mov     eax, [esp + 24 + 36]            ; height
        test    eax, eax
        jle     .done                           ; nothing to convert
        cmp     dword [esp + 24 + 32], 0        ; width
        jle     .done
        mov     [esp + 4], eax                  ; rows left

        mov     edi, [esp + 24 + 4]             ; edi = destination cursor
        mov     ebp, [esp + 24 + 8]             ; ebp = x_stride
        mov     esi, [esp + 24 + 12]            ; esi = luma cursor
        mov     ecx, [esp + 24 + 16]            ; ecx = u cursor
        mov     edx, [esp + 24 + 20]            ; edx = v cursor
        mov     ebx, [esp + 24 + 24]            ; ebx = y_stride

.row_pair:
        mov     eax, [esp + 24 + 32]            ; width
        and     eax, -32                        ; eax = pixels covered by full blocks
        jz      .tail_setup                     ; row narrower than one block

        ; ---- 32 pixels x 2 rows per iteration; eax = block pixels left (>0) ----
.block32:
        ; Unaligned loads: source alignment is not guaranteed for arbitrary
        ; strides, and movdqa would fault on a misaligned row.
        movdqu  xmm1, [esi]                     ; top row,    luma 0..15
        movdqu  xmm5, [esi + ebx]               ; bottom row, luma 0..15
        movdqa  xmm6, xmm1
        movdqa  xmm7, xmm5
        movdqu  xmm2, [ecx]                     ; 16 u samples
        movdqu  xmm3, [edx]                     ; 16 v samples
        movdqa  xmm4, xmm2
        punpckhbw xmm4, xmm3                    ; xmm4 = u8 v8 .. u15 v15
        punpcklbw xmm2, xmm3                    ; xmm2 = u0 v0 .. u7  v7
        movdqu  xmm0, [esi + 16]                ; top row,    luma 16..31
        movdqu  xmm3, [esi + ebx + 16]          ; bottom row, luma 16..31

        ; Destination may be unaligned; every 16-byte result is stored as two
        ; 8-byte halves (movlps + movhps), as in the original submission.
        punpcklbw xmm1, xmm2                    ; top, pixels 0..7
        movlps  [edi], xmm1
        movhps  [edi + 8], xmm1
        punpckhbw xmm6, xmm2                    ; top, pixels 8..15
        movlps  [edi + 16], xmm6
        movhps  [edi + 24], xmm6
        punpcklbw xmm5, xmm2                    ; bottom, pixels 0..7
        movlps  [edi + ebp], xmm5
        movhps  [edi + ebp + 8], xmm5
        punpckhbw xmm7, xmm2                    ; bottom, pixels 8..15
        movlps  [edi + ebp + 16], xmm7
        movhps  [edi + ebp + 24], xmm7

        movdqa  xmm1, xmm0
        movdqa  xmm2, xmm3
        punpcklbw xmm0, xmm4                    ; top, pixels 16..23
        movlps  [edi + 32], xmm0
        movhps  [edi + 40], xmm0
        punpckhbw xmm1, xmm4                    ; top, pixels 24..31
        movlps  [edi + 48], xmm1
        movhps  [edi + 56], xmm1
        punpcklbw xmm2, xmm4                    ; bottom, pixels 16..23
        movlps  [edi + ebp + 32], xmm2
        movhps  [edi + ebp + 40], xmm2
        punpckhbw xmm3, xmm4                    ; bottom, pixels 24..31
        movlps  [edi + ebp + 48], xmm3
        movhps  [edi + ebp + 56], xmm3

        add     edi, 64                         ; 32 pixels * 2 bytes
        add     esi, 32
        add     ecx, 16
        add     edx, 16
        sub     eax, 32
        jnz     .block32                        ; eax is a positive multiple of 32

        ; ---- scalar tail: the remaining (width % 32) pixels, two at a time ----
.tail_setup:
        mov     eax, [esp + 24 + 32]            ; width
        and     eax, 31
        jz      .next_rows
        mov     [esp], eax                      ; tail pixels left (eax becomes scratch)

.tail_pair:
        movzx   eax, byte [ecx]
        movd    xmm2, eax                       ; u
        movzx   eax, byte [edx]
        movd    xmm3, eax                       ; v
        punpcklbw xmm2, xmm3                    ; low bytes: u v
        movzx   eax, word [esi]
        movd    xmm1, eax                       ; top y0 y1
        punpcklbw xmm1, xmm2                    ; low bytes: y0 u y1 v
        movd    [edi], xmm1
        movzx   eax, word [esi + ebx]
        movd    xmm5, eax                       ; bottom y0 y1
        punpcklbw xmm5, xmm2
        movd    [edi + ebp], xmm5

        add     esi, 2
        add     edi, 4
        add     ecx, 1
        add     edx, 1
        sub     dword [esp], 2
        jg      .tail_pair                      ; signed: an odd count still terminates

        ; ---- step every cursor from "end of this row" to "start of next pair" ----
.next_rows:
        mov     eax, [esp + 24 + 32]            ; width
        add     eax, 1
        and     eax, -2                         ; eax = pixels actually consumed per row
        lea     esi, [esi + 2*ebx]
        sub     esi, eax                        ; y   += 2*y_stride - pixels
        lea     edi, [edi + 2*ebp]
        sub     edi, eax
        sub     edi, eax                        ; dst += 2*x_stride - 2*pixels
        shr     eax, 1                          ; chroma samples consumed per row
        add     ecx, [esp + 24 + 28]
        sub     ecx, eax                        ; u   += uv_stride - pixels/2
        add     edx, [esp + 24 + 28]
        sub     edx, eax                        ; v   += uv_stride - pixels/2

        sub     dword [esp + 4], 2
        jg      .row_pair                       ; signed: an odd height still terminates

.done:
        add     esp, 8
        pop     ebp
        pop     edi
        pop     esi
        pop     ebx
        ret
.endfunc
More information about the Xvid-devel
mailing list