[XviD-devel] Adding SSE2 asm codes for color space transforming funcion

Xwen.Kong konsunwin at gmail.com
Fri Jun 19 08:46:14 CEST 2009


   Hi all — the color-space conversion functions currently use only MMX
or XMM assembly; for example, yv12_to_yuyv is bound to yv12_to_yuyv_mmx in
xvid.c. I have added an SSE2 version of this function to xvidcore-1.2.1, as follows:

               1. Add "yv12_to_yuyv    = yv12_to_yuyv_sse2;" after "if
((cpu_flags & XVID_CPU_SSE2)) {" in xvid.c.

               2. add declaration  "packedFunc yv12_to_yuyv_sse2;"  after
"#if defined(ARCH_IS_IA32) || defined(ARCH_IS_X86_64)" in
image/image_h/colorspace.h .

               3. Add the following code to
 image/image_asm/colorspace_yuyv_mmx.asm, just before "%ifidn
__OUTPUT_FORMAT__,elf".

;------------------------------------------------------------------------------
; yv12_to_yuyv_sse2
;
; Packs a planar YV12 (4:2:0) frame into interleaved YUYV (4:2:2).
; Processes 32 luma pixels on two adjacent rows per inner-loop iteration,
; sharing one chroma line between the two luma lines.
;
; ABI:   IA-32 cdecl; all arguments on the stack. Offsets below are relative
;        to esp after the three callee-saved register pushes (base = 12):
;          +4  x_ptr (yuyv dst)   +8  x_stride   +12 y_src
;          +16 u_src              +20 v_src      +24 y_stride
;          +28 uv_stride          +32 width      +36 height   +40 vflip
;        (u/v labels follow the original post -- TODO confirm against the
;        packedFunc declaration in colorspace.h.)
;
; NOTE(review): vflip is ignored, and the end-of-row pointer advances
; hard-code y_stride - width == 128, uv_stride - width/2 == 64, and
; x_stride == 2*width. This only works for one padding layout -- compute
; the deltas from the stride arguments before using this path generally.
;------------------------------------------------------------------------------
cglobal yv12_to_yuyv_sse2
ALIGN SECTION_ALIGN
yv12_to_yuyv_sse2:
  push esi
  push edi
  push ebx                        ; ebx is callee-saved under cdecl: must be
                                  ; preserved (the original clobbered it)

  mov edi, [esp + 12 +  4]        ; edi = yuyv destination
  mov esi, [esp + 12 + 12]        ; esi = y plane
  mov ecx, [esp + 12 + 16]        ; ecx = u plane
  mov edx, [esp + 12 + 20]        ; edx = v plane
  mov eax, [esp + 12 + 32]        ; eax = width  (pixels)
  mov ebx, [esp + 12 + 36]        ; ebx = height (rows)
loop_height:
  push ebx                        ; spill remaining-rows counter
  mov ebx, [esp + 12 + 24 + 4]    ; ebx = y_stride (one extra push active)
loop_width:
  push eax                        ; spill remaining-pixels counter

  movdqa xmm1, [esi]              ; 16 luma bytes, row 0
  mov eax, [esp + 12 + 8 + 8]     ; eax = x_stride (two extra pushes active)
  movdqa xmm5, [esi + ebx]        ; 16 luma bytes, row 1

  movdqa xmm6, xmm1               ; copies kept for the high byte-halves
  movdqa xmm7, xmm5

  movdqa xmm2, [ecx]              ; 16 u bytes (shared by both luma rows)
  movdqa xmm3, [edx]              ; 16 v bytes

  movdqa xmm4, xmm2
  punpckhbw xmm4, xmm3            ; xmm4 = u/v interleaved, high 8 pairs
  punpcklbw xmm2, xmm3            ; xmm2 = u/v interleaved, low 8 pairs

  movdqa xmm0, [esi + 16]         ; next 16 luma bytes, row 0
  movdqa xmm3, [esi + ebx + 16]   ; next 16 luma bytes, row 1

  punpcklbw xmm1, xmm2            ; Y U Y V ... pixels 0..7, row 0
  movlps [edi], xmm1              ; dst may be unaligned: paired 8-byte stores
  movhps [edi + 8], xmm1
  punpckhbw xmm6, xmm2            ; pixels 8..15, row 0
  movlps [edi + 16], xmm6
  movhps [edi + 24], xmm6
  punpcklbw xmm5, xmm2            ; pixels 0..7, row 1
  movlps [edi + eax], xmm5
  movhps [edi + eax + 8], xmm5
  punpckhbw xmm7, xmm2            ; pixels 8..15, row 1
  movlps [edi + eax + 16], xmm7
  movhps [edi + eax + 24], xmm7

  movdqa xmm1, xmm0               ; copies for the high halves, second block
  movdqa xmm2, xmm3

  punpcklbw xmm0, xmm4            ; pixels 16..23, row 0
  movlps [edi + 32], xmm0
  movhps [edi + 40], xmm0
  punpckhbw xmm1, xmm4            ; pixels 24..31, row 0
  movlps [edi + 48], xmm1
  movhps [edi + 56], xmm1
  punpcklbw xmm2, xmm4            ; pixels 16..23, row 1
  movlps [edi + eax + 32], xmm2
  movhps [edi + eax + 40], xmm2
  punpckhbw xmm3, xmm4            ; pixels 24..31, row 1
  movlps [edi + eax + 48], xmm3
  movhps [edi + eax + 56], xmm3

  lea edi, [edi + 64]             ; dst: 32 pixels * 2 bytes
  lea esi, [esi + 32]             ; y:   32 bytes
  lea ecx, [ecx + 16]             ; u:   16 bytes (half horizontal rate)
  lea edx, [edx + 16]             ; v:   16 bytes

  pop eax
  sub eax, 32                     ; 32 pixels consumed this iteration
  jnz loop_width                  ; inner loop

  mov eax, [esp + 12 + 8 + 4]     ; reload x_stride (one extra push active)

  lea edi, [edi + eax]            ; dst: skip the row-1 line already written
  lea esi, [esi + ebx + 128]      ; NOTE(review): assumes y_stride-width == 128
  lea ecx, [ecx + 64]             ; NOTE(review): assumes uv_stride-width/2 == 64
  lea edx, [edx + 64]

  mov eax, [esp + 12 + 32 + 4]    ; reload width
  pop ebx                         ; restore remaining-rows counter
  sub ebx, 2                      ; two rows handled per pass
  jnz loop_height                 ; outer loop

  pop ebx                         ; restore caller's ebx
  pop edi
  pop esi
  ret
.endfunc


More information about the Xvid-devel mailing list