[XviD-devel] Xvid 1.2.0 is available!
Andrew Dunstan
a_dunstan at hotmail.com
Tue Dec 2 10:52:38 CET 2008
> I've uploaded a fix for that issue to CVS.
>
> Regards,
> Michael
>
I found 1 missed case (quant_h263_intra_sse2), added a check in xvid_bench and found a few small optimizations. Not sure if you want to keep the xmm6/xmm7 xvid_bench check (__rdtsc is probably VS specific) but it's here anyway. I removed the code in plugin_ssim-a.asm because it seems pointless; xmm6 and xmm7 have just been cleared so it makes no sense to splat the low byte.
Index: examples/xvid_bench.c
===================================================================
RCS file: /xvid/xvidcore/examples/xvid_bench.c,v
retrieving revision 1.38
diff -u -r1.38 xvid_bench.c
--- examples/xvid_bench.c 26 Nov 2008 23:37:28 -0000 1.38
+++ examples/xvid_bench.c 2 Dec 2008 08:20:57 -0000
@@ -2205,6 +2205,13 @@
 int width, height;
 uint32_t chksum = 0;
 const char * test_bitstream = 0;
+#if defined(WIN32) && defined(ARCH_IS_X86_64)
+ DECLARE_ALIGNED_MATRIX(xmm_save, 2, 4, uint64_t, 16);
+ // assumes xmm6 and xmm7 won't be falsely preserved by C code
+ for(c=0;c<4;c++)
+ xmm_save[c] = __rdtsc();
+ prime_xmm(xmm_save);
+#endif
 
 cpu_mask = 0; // default => will use autodectect
 for(c=1; c<argc; ++c)
@@ -2284,6 +2291,20 @@
 if (what==-2)
 test_quant_bug();
 
+#if defined(WIN32) && defined(ARCH_IS_X86_64)
+ get_xmm(xmm_save+4);
+ if (memcmp(xmm_save, xmm_save+4, 4*sizeof(int64_t))) {
+ printf("\nWIN64 ERROR: XMM6 and XMM7 contents not preserved!\n"
+ " XMM6 XMM7\n"
+ "Before: %.16I64X%.16I64X %.16I64X%.16I64X\n"
+ "After: %.16I64X%.16I64X %.16I64X%.16I64X",
+ xmm_save[0],xmm_save[1],xmm_save[2],xmm_save[3],
+ xmm_save[4],xmm_save[5],xmm_save[6],xmm_save[7]);
+ } else {
+ printf("\nWIN64: XMM6 and XMM7 contents preserved correctly.\n");
+ }
+#endif
+
 if ((what >= 0 && what <= 6) || what == 10) {
 printf("\n\n"
 "NB: If a function isn't optimised for a specific set of intructions,\n"
Index: src/bitstream/x86_asm/cbp_sse2.asm
===================================================================
RCS file: /xvid/xvidcore/src/bitstream/x86_asm/cbp_sse2.asm,v
retrieving revision 1.11
diff -u -r1.11 cbp_sse2.asm
--- src/bitstream/x86_asm/cbp_sse2.asm 1 Dec 2008 14:45:45 -0000 1.11
+++ src/bitstream/x86_asm/cbp_sse2.asm 2 Dec 2008 07:29:05 -0000
@@ -32,7 +32,7 @@
 %macro LOOP_SSE2 2
 movdqa xmm0, [%2+(%1)*128]
- pand xmm0, xmm7
+ pand xmm0, xmm3
 movdqa xmm1, [%2+(%1)*128+16]
 por xmm0, [%2+(%1)*128+32]
@@ -43,7 +43,7 @@
 por xmm1, [%2+(%1)*128+112]
 por xmm0, xmm1 ; xmm0 = xmm1 = 128 bits worth of info
- psadbw xmm0, xmm6 ; contains 2 dwords with sums
+ psadbw xmm0, xmm2 ; contains 2 dwords with sums
 movhlps xmm1, xmm0 ; move high dword from xmm0 to low xmm1
 por xmm0, xmm1 ; combine
 movd ecx, xmm0 ; if ecx set, values were found
@@ -76,10 +76,8 @@
 mov _EDX, prm1 ; coeff[]
 xor _EAX, _EAX ; cbp = 0
 
- PUSH_XMM6_XMM7
-
- movdqu xmm7, [ignore_dc] ; mask to ignore dc value
- pxor xmm6, xmm6 ; zero
+ movdqu xmm3, [ignore_dc] ; mask to ignore dc value
+ pxor xmm2, xmm2 ; zero
 LOOP_SSE2 0, _EDX
 jz .blk2
@@ -112,7 +110,6 @@
 .finished:
 
- POP_XMM6_XMM7
 ret
 ENDFUNC
Index: src/image/x86_asm/gmc_mmx.asm
===================================================================
RCS file: /xvid/xvidcore/src/image/x86_asm/gmc_mmx.asm,v
retrieving revision 1.8
diff -u -r1.8 gmc_mmx.asm
--- src/image/x86_asm/gmc_mmx.asm 1 Dec 2008 14:45:45 -0000 1.8
+++ src/image/x86_asm/gmc_mmx.asm 2 Dec 2008 07:41:36 -0000
@@ -200,8 +200,6 @@
 align SECTION_ALIGN
 xvid_GMC_Core_Lin_8_sse41:
- PUSH_XMM6_XMM7
-
 mov _EAX, prm2 ; Offsets
 mov TMP0, prm3 ; Src0
 mov TMP1, prm4 ; BpS
@@ -218,7 +216,6 @@
 packuswb xmm5, xmm5
 movq [_EAX], xmm5
 
- POP_XMM6_XMM7
 ret
 ENDFUNC
Index: src/image/x86_asm/postprocessing_sse2.asm
===================================================================
RCS file: /xvid/xvidcore/src/image/x86_asm/postprocessing_sse2.asm,v
retrieving revision 1.12
diff -u -r1.12 postprocessing_sse2.asm
--- src/image/x86_asm/postprocessing_sse2.asm 1 Dec 2008 15:00:44 -0000 1.12
+++ src/image/x86_asm/postprocessing_sse2.asm 2 Dec 2008 07:45:17 -0000
@@ -66,7 +66,6 @@
 ALIGN SECTION_ALIGN
 image_brightness_sse2:
- PUSH_XMM6_XMM7
 %ifdef ARCH_IS_X86_64
 movsx _EAX, prm5d
 %else
@@ -79,14 +78,14 @@
 push _EDI ; 8 bytes offset for push
 sub _ESP, 32 ; 32 bytes for local data (16bytes will be used, 16bytes more to align correctly mod 16)
 
- movdqa xmm6, [xmm_0x80]
+ movdqa xmm2, [xmm_0x80]
 
 ; Create a offset...offset vector
 mov _ESI, _ESP ; TMP1 will be esp aligned mod 16
 add _ESI, 15 ; TMP1 = esp + 15
 and _ESI, ~15 ; TMP1 = (esp + 15)&(~15)
 CREATE_OFFSET_VECTOR _ESI, al
- movdqa xmm7, [_ESI]
+ movdqa xmm3, [_ESI]
 
 %ifdef ARCH_IS_X86_64
 mov _ESI, prm3
@@ -103,12 +102,12 @@
 movdqa xmm0, [TMP1 + _EAX]
 movdqa xmm1, [TMP1 + _EAX + 16] ; xmm0 = [dst]
 
- paddb xmm0, xmm6 ; unsigned -> signed domain
- paddb xmm1, xmm6
- paddsb xmm0, xmm7
- paddsb xmm1, xmm7 ; xmm0 += offset
- psubb xmm0, xmm6
- psubb xmm1, xmm6 ; signed -> unsigned domain
+ paddb xmm0, xmm2 ; unsigned -> signed domain
+ paddb xmm1, xmm2
+ paddsb xmm0, xmm3
+ paddsb xmm1, xmm3 ; xmm0 += offset
+ psubb xmm0, xmm2
+ psubb xmm1, xmm2 ; signed -> unsigned domain
 movdqa [TMP1 + _EAX], xmm0
 movdqa [TMP1 + _EAX + 16], xmm1 ; [dst] = xmm0
@@ -125,7 +124,6 @@
 pop _EDI
 pop _ESI
 
- POP_XMM6_XMM7
 ret
 ENDFUNC
 ;//////////////////////////////////////////////////////////////////////
Index: src/motion/x86_asm/sad_sse2.asm
===================================================================
RCS file: /xvid/xvidcore/src/motion/x86_asm/sad_sse2.asm,v
retrieving revision 1.17
diff -u -r1.17 sad_sse2.asm
--- src/motion/x86_asm/sad_sse2.asm 1 Dec 2008 14:45:45 -0000 1.17
+++ src/motion/x86_asm/sad_sse2.asm 2 Dec 2008 07:56:52 -0000
@@ -63,18 +63,17 @@
 movdqa xmm3, [_EAX+TMP0]
 lea _EAX,[_EAX+2*TMP0]
 psadbw xmm0, xmm2
- paddusw xmm6,xmm0
+ paddusw xmm4,xmm0
 psadbw xmm1, xmm3
- paddusw xmm6,xmm1
+ paddusw xmm4,xmm1
 %endmacro
 
 %macro SAD16_SSE2_SSE3 1
- PUSH_XMM6_XMM7
 mov _EAX, prm1 ; cur (assumed aligned)
 mov TMP1, prm2 ; ref
 mov TMP0, prm3 ; stride
 
- pxor xmm6, xmm6 ; accum
+ pxor xmm4, xmm4 ; accum
 
 SAD_16x16_SSE2 %1
 SAD_16x16_SSE2 %1
@@ -85,11 +84,10 @@
 SAD_16x16_SSE2 %1
 SAD_16x16_SSE2 %1
 
- pshufd xmm5, xmm6, 00000010b
- paddusw xmm6, xmm5
- pextrw eax, xmm6, 0
+ pshufd xmm5, xmm4, 00000010b
+ paddusw xmm4, xmm5
+ pextrw eax, xmm4, 0
 
- POP_XMM6_XMM7
 ret
 %endmacro
 
@@ -113,20 +111,19 @@
 %1 xmm0, [_EAX]
 %1 xmm1, [_EAX+TMP0]
 lea _EAX, [_EAX+2*TMP0] ; + 2*stride
- psadbw xmm0, xmm7
- paddusw xmm6, xmm0
- psadbw xmm1, xmm7
- paddusw xmm6, xmm1
+ psadbw xmm0, xmm5
+ paddusw xmm4, xmm0
+ psadbw xmm1, xmm5
+ paddusw xmm4, xmm1
 %endmacro
 
 %macro MEAN16_SSE2_SSE3 1
- PUSH_XMM6_XMM7
 mov _EAX, prm1 ; src
 mov TMP0, prm2 ; stride
 
- pxor xmm6, xmm6 ; accum
- pxor xmm7, xmm7 ; zero
+ pxor xmm4, xmm4 ; accum
+ pxor xmm5, xmm5 ; zero
 
 MEAN_16x16_SSE2 %1
 MEAN_16x16_SSE2 %1
@@ -140,13 +137,13 @@
 mov _EAX, prm1 ; src again
 
- pshufd xmm7, xmm6, 10b
- paddusw xmm7, xmm6
- pxor xmm6, xmm6 ; zero accum
- psrlw xmm7, 8 ; => Mean
- pshuflw xmm7, xmm7, 0 ; replicate Mean
- packuswb xmm7, xmm7
- pshufd xmm7, xmm7, 00000000b
+ pshufd xmm5, xmm4, 10b
+ paddusw xmm5, xmm4
+ pxor xmm4, xmm4 ; zero accum
+ psrlw xmm5, 8 ; => Mean
+ pshuflw xmm5, xmm5, 0 ; replicate Mean
+ packuswb xmm5, xmm5
+ pshufd xmm5, xmm5, 00000000b
 
 MEAN_16x16_SSE2 %1
 MEAN_16x16_SSE2 %1
@@ -158,11 +155,10 @@
 MEAN_16x16_SSE2 %1
 MEAN_16x16_SSE2 %1
 
- pshufd xmm7, xmm6, 10b
- paddusw xmm7, xmm6
- pextrw eax, xmm7, 0
+ pshufd xmm5, xmm4, 10b
+ paddusw xmm5, xmm4
+ pextrw eax, xmm5, 0
 
- POP_XMM6_XMM7
 ret
 %endmacro
Index: src/plugins/x86_asm/plugin_ssim-a.asm
===================================================================
RCS file: /xvid/xvidcore/src/plugins/x86_asm/plugin_ssim-a.asm,v
retrieving revision 1.10
diff -u -r1.10 plugin_ssim-a.asm
--- src/plugins/x86_asm/plugin_ssim-a.asm 1 Dec 2008 14:45:46 -0000 1.10
+++ src/plugins/x86_asm/plugin_ssim-a.asm 2 Dec 2008 07:53:45 -0000
@@ -169,14 +169,6 @@
 pxor xmm6,xmm6;devc
 pxor xmm7,xmm7;corr
 
- ;broadcast lumo/c
- punpcklbw xmm6,xmm6
- punpcklwd xmm6,xmm6
- pshufd xmm6,xmm6,00000000b;or shufps
- punpcklbw xmm7,xmm7
- punpcklwd xmm7,xmm7
- pshufd xmm7,xmm7,00000000b
-
 CONSIM_1x8_SSE2
 add TMP0,_EAX
 add TMP1,_EAX
Index: src/quant/x86_asm/quantize_h263_mmx.asm
===================================================================
RCS file: /xvid/xvidcore/src/quant/x86_asm/quantize_h263_mmx.asm,v
retrieving revision 1.12
diff -u -r1.12 quantize_h263_mmx.asm
--- src/quant/x86_asm/quantize_h263_mmx.asm 1 Dec 2008 14:45:46 -0000 1.12
+++ src/quant/x86_asm/quantize_h263_mmx.asm 2 Dec 2008 08:34:44 -0000
@@ -247,7 +247,7 @@
 ALIGN SECTION_ALIGN
 quant_h263_intra_sse2:
-
+ PUSH_XMM6_XMM7
 mov _EAX, prm2 ; data
 movsx _EAX, word [_EAX] ; data[0]
@@ -363,7 +363,7 @@
 mov TMP1, prm1 ; coeff
 mov [TMP1],ax
 xor _EAX,_EAX ; return 0
-
+ POP_XMM6_XMM7
 ret
 ENDFUNC
 
@@ -491,8 +491,7 @@
 pxor xmm5, xmm5 ; sum
 lea TMP0, [mmx_sub]
- movq mm0, [TMP0 + _EAX*8 - 8] ; sub
- movq2dq xmm6, mm0 ; load into low 8 bytes
+ movq xmm6, [TMP0 + _EAX*8 - 8] ; sub
 movlhps xmm6, xmm6 ; duplicate into high 8 bytes
 
 cmp al, 1
@@ -500,12 +499,11 @@
 .qes2_not1:
 lea TMP0, [mmx_div]
- movq mm0, [TMP0 + _EAX*8 - 8] ; divider
+ movq xmm7, [TMP0 + _EAX*8 - 8] ; divider
 
 xor TMP0, TMP0
 mov _EAX, prm2 ; data
 
- movq2dq xmm7, mm0
 movlhps xmm7, xmm7
 
 ALIGN SECTION_ALIGN
@@ -538,7 +536,7 @@
 jnz .qes2_loop
 
 .qes2_done:
- movdqu xmm6, [plus_one]
+ movdqa xmm6, [plus_one]
 pmaddwd xmm5, xmm6
 movhlps xmm6, xmm5
 paddd xmm5, xmm6
Index: src/utils/emms.h
===================================================================
RCS file: /xvid/xvidcore/src/utils/emms.h,v
retrieving revision 1.16
diff -u -r1.16 emms.h
--- src/utils/emms.h 5 Jan 2005 23:02:15 -0000 1.16
+++ src/utils/emms.h 2 Dec 2008 08:17:53 -0000
@@ -57,6 +57,11 @@
 extern void sse2_os_trigger(void);
 #endif
 
+#if defined(ARCH_IS_X86_64) && defined(WIN32)
+extern void prime_xmm(void*);
+extern void get_xmm(void*);
+#endif
+
 #ifdef ARCH_IS_PPC
 extern void altivec_trigger(void);
 #endif
Index: src/utils/x86_asm/cpuid.asm
===================================================================
RCS file: /xvid/xvidcore/src/utils/x86_asm/cpuid.asm,v
retrieving revision 1.15
diff -u -r1.15 cpuid.asm
--- src/utils/x86_asm/cpuid.asm 26 Nov 2008 01:04:34 -0000 1.15
+++ src/utils/x86_asm/cpuid.asm 2 Dec 2008 08:17:36 -0000
@@ -221,6 +221,22 @@
 ret
 ENDFUNC
 
+%ifdef ARCH_IS_X86_64
+%ifdef WINDOWS
+cglobal prime_xmm
+prime_xmm:
+ movdqa xmm6, [prm1]
+ movdqa xmm7, [prm1+16]
+ ret
+ENDFUNC
+
+cglobal get_xmm
+get_xmm:
+ movdqa [prm1], xmm6
+ movdqa [prm1+16], xmm7
+ ret
+%endif
+%endif
 %ifidn __OUTPUT_FORMAT__,elf
More information about the Xvid-devel
mailing list