[XviD-devel] Xvid 1.2.0 is available!
Andrew Dunstan
a_dunstan at hotmail.com
Tue Dec 2 10:52:38 CET 2008
> I've uploaded a fix for that issue to CVS.
>
> Regards,
> Michael
>
I found 1 missed case (quant_h263_intra_sse2), added a check in xvid_bench and found a few small optimizations. Not sure if you want to keep the xmm6/xmm7 xvid_bench check (__rdtsc is probably VS specific) but it's here anyway. I removed the code in plugin_ssim-a.asm because it seems pointless; xmm6 and xmm7 have just been cleared so it makes no sense to splat the low byte.
Index: examples/xvid_bench.c
===================================================================
RCS file: /xvid/xvidcore/examples/xvid_bench.c,v
retrieving revision 1.38
diff -u -r1.38 xvid_bench.c
--- examples/xvid_bench.c 26 Nov 2008 23:37:28 -0000 1.38
+++ examples/xvid_bench.c 2 Dec 2008 08:20:57 -0000
@@ -2205,6 +2205,13 @@
 int width, height;
 uint32_t chksum = 0;
 const char * test_bitstream = 0;
+#if defined(WIN32) && defined(ARCH_IS_X86_64)
+ DECLARE_ALIGNED_MATRIX(xmm_save, 2, 4, uint64_t, 16);
+ // assumes xmm6 and xmm7 won't be falsely preserved by C code
+ for(c=0;c<4;c++)
+ xmm_save[c] = __rdtsc();
+ prime_xmm(xmm_save);
+#endif
 
 cpu_mask = 0; // default => will use autodectect
 for(c=1; c<argc; ++c)
@@ -2284,6 +2291,20 @@
 if (what==-2)
 test_quant_bug();
 
+#if defined(WIN32) && defined(ARCH_IS_X86_64)
+ get_xmm(xmm_save+4);
+ if (memcmp(xmm_save, xmm_save+4, 4*sizeof(int64_t))) {
+ printf("\nWIN64 ERROR: XMM6 and XMM7 contents not preserved!\n"
+ " XMM6 XMM7\n"
+ "Before: %.16I64X%.16I64X %.16I64X%.16I64X\n"
+ "After: %.16I64X%.16I64X %.16I64X%.16I64X",
+ xmm_save[0],xmm_save[1],xmm_save[2],xmm_save[3],
+ xmm_save[4],xmm_save[5],xmm_save[6],xmm_save[7]);
+ } else {
+ printf("\nWIN64: XMM6 and XMM7 contents preserved correctly.\n");
+ }
+#endif
+
 if ((what >= 0 && what <= 6) || what == 10) {
 printf("\n\n"
 "NB: If a function isn't optimised for a specific set of intructions,\n"
Index: src/bitstream/x86_asm/cbp_sse2.asm
===================================================================
RCS file: /xvid/xvidcore/src/bitstream/x86_asm/cbp_sse2.asm,v
retrieving revision 1.11
diff -u -r1.11 cbp_sse2.asm
--- src/bitstream/x86_asm/cbp_sse2.asm 1 Dec 2008 14:45:45 -0000 1.11
+++ src/bitstream/x86_asm/cbp_sse2.asm 2 Dec 2008 07:29:05 -0000
@@ -32,7 +32,7 @@
 %macro LOOP_SSE2 2
 movdqa xmm0, [%2+(%1)*128]
- pand xmm0, xmm7
+ pand xmm0, xmm3
 movdqa xmm1, [%2+(%1)*128+16]
 por xmm0, [%2+(%1)*128+32]
@@ -43,7 +43,7 @@
 por xmm1, [%2+(%1)*128+112]
 por xmm0, xmm1 ; xmm0 = xmm1 = 128 bits worth of info
- psadbw xmm0, xmm6 ; contains 2 dwords with sums
+ psadbw xmm0, xmm2 ; contains 2 dwords with sums
 movhlps xmm1, xmm0 ; move high dword from xmm0 to low xmm1
 por xmm0, xmm1 ; combine
 movd ecx, xmm0 ; if ecx set, values were found
@@ -76,10 +76,8 @@
 mov _EDX, prm1 ; coeff[]
 xor _EAX, _EAX ; cbp = 0
 
- PUSH_XMM6_XMM7
-
- movdqu xmm7, [ignore_dc] ; mask to ignore dc value
- pxor xmm6, xmm6 ; zero
+ movdqu xmm3, [ignore_dc] ; mask to ignore dc value
+ pxor xmm2, xmm2 ; zero
 LOOP_SSE2 0, _EDX
 jz .blk2
@@ -112,7 +110,6 @@
 .finished:
 
- POP_XMM6_XMM7
 ret
 ENDFUNC
Index: src/image/x86_asm/gmc_mmx.asm
===================================================================
RCS file: /xvid/xvidcore/src/image/x86_asm/gmc_mmx.asm,v
retrieving revision 1.8
diff -u -r1.8 gmc_mmx.asm
--- src/image/x86_asm/gmc_mmx.asm 1 Dec 2008 14:45:45 -0000 1.8
+++ src/image/x86_asm/gmc_mmx.asm 2 Dec 2008 07:41:36 -0000
@@ -200,8 +200,6 @@
 align SECTION_ALIGN
 xvid_GMC_Core_Lin_8_sse41:
- PUSH_XMM6_XMM7
-
 mov _EAX, prm2 ; Offsets
 mov TMP0, prm3 ; Src0
 mov TMP1, prm4 ; BpS
@@ -218,7 +216,6 @@
 packuswb xmm5, xmm5
 movq [_EAX], xmm5
 
- POP_XMM6_XMM7
 ret
 ENDFUNC
Index: src/image/x86_asm/postprocessing_sse2.asm
===================================================================
RCS file: /xvid/xvidcore/src/image/x86_asm/postprocessing_sse2.asm,v
retrieving revision 1.12
diff -u -r1.12 postprocessing_sse2.asm
--- src/image/x86_asm/postprocessing_sse2.asm 1 Dec 2008 15:00:44 -0000 1.12
+++ src/image/x86_asm/postprocessing_sse2.asm 2 Dec 2008 07:45:17 -0000
@@ -66,7 +66,6 @@
 ALIGN SECTION_ALIGN
 image_brightness_sse2:
- PUSH_XMM6_XMM7
 %ifdef ARCH_IS_X86_64
 movsx _EAX, prm5d
 %else
@@ -79,14 +78,14 @@
 push _EDI ; 8 bytes offset for push
 sub _ESP, 32 ; 32 bytes for local data (16bytes will be used, 16bytes more to align correctly mod 16)
 
- movdqa xmm6, [xmm_0x80]
+ movdqa xmm2, [xmm_0x80]
 
 ; Create a offset...offset vector
 mov _ESI, _ESP ; TMP1 will be esp aligned mod 16
 add _ESI, 15 ; TMP1 = esp + 15
 and _ESI, ~15 ; TMP1 = (esp + 15)&(~15)
 CREATE_OFFSET_VECTOR _ESI, al
- movdqa xmm7, [_ESI]
+ movdqa xmm3, [_ESI]
 
 %ifdef ARCH_IS_X86_64
 mov _ESI, prm3
@@ -103,12 +102,12 @@
 movdqa xmm0, [TMP1 + _EAX]
 movdqa xmm1, [TMP1 + _EAX + 16] ; xmm0 = [dst]
 
- paddb xmm0, xmm6 ; unsigned -> signed domain
- paddb xmm1, xmm6
- paddsb xmm0, xmm7
- paddsb xmm1, xmm7 ; xmm0 += offset
- psubb xmm0, xmm6
- psubb xmm1, xmm6 ; signed -> unsigned domain
+ paddb xmm0, xmm2 ; unsigned -> signed domain
+ paddb xmm1, xmm2
+ paddsb xmm0, xmm3
+ paddsb xmm1, xmm3 ; xmm0 += offset
+ psubb xmm0, xmm2
+ psubb xmm1, xmm2 ; signed -> unsigned domain
 movdqa [TMP1 + _EAX], xmm0
 movdqa [TMP1 + _EAX + 16], xmm1 ; [dst] = xmm0
@@ -125,7 +124,6 @@
 pop _EDI
 pop _ESI
 
- POP_XMM6_XMM7
 ret
 ENDFUNC
 ;//////////////////////////////////////////////////////////////////////
Index: src/motion/x86_asm/sad_sse2.asm
===================================================================
RCS file: /xvid/xvidcore/src/motion/x86_asm/sad_sse2.asm,v
retrieving revision 1.17
diff -u -r1.17 sad_sse2.asm
--- src/motion/x86_asm/sad_sse2.asm 1 Dec 2008 14:45:45 -0000 1.17
+++ src/motion/x86_asm/sad_sse2.asm 2 Dec 2008 07:56:52 -0000
@@ -63,18 +63,17 @@
 movdqa xmm3, [_EAX+TMP0]
 lea _EAX,[_EAX+2*TMP0]
 psadbw xmm0, xmm2
- paddusw xmm6,xmm0
+ paddusw xmm4,xmm0
 psadbw xmm1, xmm3
- paddusw xmm6,xmm1
+ paddusw xmm4,xmm1
 %endmacro
 
 %macro SAD16_SSE2_SSE3 1
- PUSH_XMM6_XMM7
 mov _EAX, prm1 ; cur (assumed aligned)
 mov TMP1, prm2 ; ref
 mov TMP0, prm3 ; stride
 
- pxor xmm6, xmm6 ; accum
+ pxor xmm4, xmm4 ; accum
 
 SAD_16x16_SSE2 %1
 SAD_16x16_SSE2 %1
@@ -85,11 +84,10 @@
 SAD_16x16_SSE2 %1
 SAD_16x16_SSE2 %1
 
- pshufd xmm5, xmm6, 00000010b
- paddusw xmm6, xmm5
- pextrw eax, xmm6, 0
+ pshufd xmm5, xmm4, 00000010b
+ paddusw xmm4, xmm5
+ pextrw eax, xmm4, 0
 
- POP_XMM6_XMM7
 ret
 %endmacro
 
@@ -113,20 +111,19 @@
 %1 xmm0, [_EAX]
 %1 xmm1, [_EAX+TMP0]
 lea _EAX, [_EAX+2*TMP0] ; + 2*stride
- psadbw xmm0, xmm7
- paddusw xmm6, xmm0
- psadbw xmm1, xmm7
- paddusw xmm6, xmm1
+ psadbw xmm0, xmm5
+ paddusw xmm4, xmm0
+ psadbw xmm1, xmm5
+ paddusw xmm4, xmm1
 %endmacro
 
 %macro MEAN16_SSE2_SSE3 1
- PUSH_XMM6_XMM7
 mov _EAX, prm1 ; src
 mov TMP0, prm2 ; stride
 
- pxor xmm6, xmm6 ; accum
- pxor xmm7, xmm7 ; zero
+ pxor xmm4, xmm4 ; accum
+ pxor xmm5, xmm5 ; zero
 
 MEAN_16x16_SSE2 %1
 MEAN_16x16_SSE2 %1
@@ -140,13 +137,13 @@
 mov _EAX, prm1 ; src again
 
- pshufd xmm7, xmm6, 10b
- paddusw xmm7, xmm6
- pxor xmm6, xmm6 ; zero accum
- psrlw xmm7, 8 ; => Mean
- pshuflw xmm7, xmm7, 0 ; replicate Mean
- packuswb xmm7, xmm7
- pshufd xmm7, xmm7, 00000000b
+ pshufd xmm5, xmm4, 10b
+ paddusw xmm5, xmm4
+ pxor xmm4, xmm4 ; zero accum
+ psrlw xmm5, 8 ; => Mean
+ pshuflw xmm5, xmm5, 0 ; replicate Mean
+ packuswb xmm5, xmm5
+ pshufd xmm5, xmm5, 00000000b
 
 MEAN_16x16_SSE2 %1
 MEAN_16x16_SSE2 %1
@@ -158,11 +155,10 @@
 MEAN_16x16_SSE2 %1
 MEAN_16x16_SSE2 %1
 
- pshufd xmm7, xmm6, 10b
- paddusw xmm7, xmm6
- pextrw eax, xmm7, 0
+ pshufd xmm5, xmm4, 10b
+ paddusw xmm5, xmm4
+ pextrw eax, xmm5, 0
 
- POP_XMM6_XMM7
 ret
 %endmacro
Index: src/plugins/x86_asm/plugin_ssim-a.asm
===================================================================
RCS file: /xvid/xvidcore/src/plugins/x86_asm/plugin_ssim-a.asm,v
retrieving revision 1.10
diff -u -r1.10 plugin_ssim-a.asm
--- src/plugins/x86_asm/plugin_ssim-a.asm 1 Dec 2008 14:45:46 -0000 1.10
+++ src/plugins/x86_asm/plugin_ssim-a.asm 2 Dec 2008 07:53:45 -0000
@@ -169,14 +169,6 @@
 pxor xmm6,xmm6;devc
 pxor xmm7,xmm7;corr
 
- ;broadcast lumo/c
- punpcklbw xmm6,xmm6
- punpcklwd xmm6,xmm6
- pshufd xmm6,xmm6,00000000b;or shufps
- punpcklbw xmm7,xmm7
- punpcklwd xmm7,xmm7
- pshufd xmm7,xmm7,00000000b
-
 CONSIM_1x8_SSE2
 add TMP0,_EAX
 add TMP1,_EAX
Index: src/quant/x86_asm/quantize_h263_mmx.asm
===================================================================
RCS file: /xvid/xvidcore/src/quant/x86_asm/quantize_h263_mmx.asm,v
retrieving revision 1.12
diff -u -r1.12 quantize_h263_mmx.asm
--- src/quant/x86_asm/quantize_h263_mmx.asm 1 Dec 2008 14:45:46 -0000 1.12
+++ src/quant/x86_asm/quantize_h263_mmx.asm 2 Dec 2008 08:34:44 -0000
@@ -247,7 +247,7 @@
 ALIGN SECTION_ALIGN
 quant_h263_intra_sse2:
-
+ PUSH_XMM6_XMM7
 mov _EAX, prm2 ; data
 movsx _EAX, word [_EAX] ; data[0]
@@ -363,7 +363,7 @@
 mov TMP1, prm1 ; coeff
 mov [TMP1],ax
 xor _EAX,_EAX ; return 0
-
+ POP_XMM6_XMM7
 ret
 ENDFUNC
 
@@ -491,8 +491,7 @@
 pxor xmm5, xmm5 ; sum
 lea TMP0, [mmx_sub]
- movq mm0, [TMP0 + _EAX*8 - 8] ; sub
- movq2dq xmm6, mm0 ; load into low 8 bytes
+ movq xmm6, [TMP0 + _EAX*8 - 8] ; sub
 movlhps xmm6, xmm6 ; duplicate into high 8 bytes
 
 cmp al, 1
@@ -500,12 +499,11 @@
 .qes2_not1:
 lea TMP0, [mmx_div]
- movq mm0, [TMP0 + _EAX*8 - 8] ; divider
+ movq xmm7, [TMP0 + _EAX*8 - 8] ; divider
 
 xor TMP0, TMP0
 mov _EAX, prm2 ; data
 
- movq2dq xmm7, mm0
 movlhps xmm7, xmm7
 
 ALIGN SECTION_ALIGN
@@ -538,7 +536,7 @@
 jnz .qes2_loop
 
 .qes2_done:
- movdqu xmm6, [plus_one]
+ movdqa xmm6, [plus_one]
 pmaddwd xmm5, xmm6
 movhlps xmm6, xmm5
 paddd xmm5, xmm6
Index: src/utils/emms.h
===================================================================
RCS file: /xvid/xvidcore/src/utils/emms.h,v
retrieving revision 1.16
diff -u -r1.16 emms.h
--- src/utils/emms.h 5 Jan 2005 23:02:15 -0000 1.16
+++ src/utils/emms.h 2 Dec 2008 08:17:53 -0000
@@ -57,6 +57,11 @@
 extern void sse2_os_trigger(void);
 #endif
 
+#if defined(ARCH_IS_X86_64) && defined(WIN32)
+extern void prime_xmm(void*);
+extern void get_xmm(void*);
+#endif
+
 #ifdef ARCH_IS_PPC
 extern void altivec_trigger(void);
 #endif
Index: src/utils/x86_asm/cpuid.asm
===================================================================
RCS file: /xvid/xvidcore/src/utils/x86_asm/cpuid.asm,v
retrieving revision 1.15
diff -u -r1.15 cpuid.asm
--- src/utils/x86_asm/cpuid.asm 26 Nov 2008 01:04:34 -0000 1.15
+++ src/utils/x86_asm/cpuid.asm 2 Dec 2008 08:17:36 -0000
@@ -221,6 +221,22 @@
 ret
 ENDFUNC
 
+%ifdef ARCH_IS_X86_64
+%ifdef WINDOWS
+cglobal prime_xmm
+prime_xmm:
+ movdqa xmm6, [prm1]
+ movdqa xmm7, [prm1+16]
+ ret
+ENDFUNC
+
+cglobal get_xmm
+get_xmm:
+ movdqa [prm1], xmm6
+ movdqa [prm1+16], xmm7
+ ret
+%endif
+%endif
 %ifidn __OUTPUT_FORMAT__,elf
More information about the Xvid-devel
mailing list