[XviD-devel] Xvid 1.2.0 is available!

Andrew Dunstan a_dunstan at hotmail.com
Tue Dec 2 11:17:47 CET 2008


*kicks mail client*

Let's try that again.

Index: examples/xvid_bench.c
===================================================================
RCS file: /xvid/xvidcore/examples/xvid_bench.c,v
retrieving revision 1.38
diff -u -r1.38 xvid_bench.c
--- examples/xvid_bench.c 26 Nov 2008 23:37:28 -0000 1.38
+++ examples/xvid_bench.c 2 Dec 2008 08:20:57 -0000
@@ -2205,6 +2205,13 @@
  int width, height;
  uint32_t chksum = 0;
  const char * test_bitstream = 0;
+#if defined(WIN32) && defined(ARCH_IS_X86_64)
+ DECLARE_ALIGNED_MATRIX(xmm_save, 2, 4, uint64_t, 16);
+ // assumes xmm6 and xmm7 won't be falsely preserved by C code
+ for(c=0;c<4;c++)
+  xmm_save[c] = __rdtsc();
+ prime_xmm(xmm_save);
+#endif

  cpu_mask = 0;  // default => will use autodectect
  for(c=1; c<argc; ++c)
@@ -2284,6 +2291,20 @@
  if (what==-2)
   test_quant_bug();

+#if defined(WIN32) && defined(ARCH_IS_X86_64)
+ get_xmm(xmm_save+4);
+ if (memcmp(xmm_save, xmm_save+4, 4*sizeof(int64_t))) {
+  printf("\nWIN64 ERROR: XMM6 and XMM7 contents not preserved!\n"
+   "        XMM6                             XMM7\n"
+   "Before: %.16I64X%.16I64X %.16I64X%.16I64X\n"
+   "After:  %.16I64X%.16I64X %.16I64X%.16I64X",
+   xmm_save[0],xmm_save[1],xmm_save[2],xmm_save[3],
+   xmm_save[4],xmm_save[5],xmm_save[6],xmm_save[7]);
+ } else {
+  printf("\nWIN64: XMM6 and XMM7 contents preserved correctly.\n");
+ }
+#endif
+
  if ((what >= 0 && what <= 6) || what == 10) {
   printf("\n\n"
      "NB: If a function isn't optimised for a specific set of intructions,\n"
Index: src/bitstream/x86_asm/cbp_sse2.asm
===================================================================
RCS file: /xvid/xvidcore/src/bitstream/x86_asm/cbp_sse2.asm,v
retrieving revision 1.11
diff -u -r1.11 cbp_sse2.asm
--- src/bitstream/x86_asm/cbp_sse2.asm 1 Dec 2008 14:45:45 -0000 1.11
+++ src/bitstream/x86_asm/cbp_sse2.asm 2 Dec 2008 07:29:05 -0000
@@ -32,7 +32,7 @@

 %macro LOOP_SSE2 2
   movdqa xmm0, [%2+(%1)*128]
-  pand xmm0, xmm7
+  pand xmm0, xmm3
   movdqa xmm1, [%2+(%1)*128+16]

   por xmm0, [%2+(%1)*128+32]
@@ -43,7 +43,7 @@
   por xmm1, [%2+(%1)*128+112]

   por xmm0, xmm1        ; xmm0 = xmm1 = 128 bits worth of info
-  psadbw xmm0, xmm6     ; contains 2 dwords with sums
+  psadbw xmm0, xmm2     ; contains 2 dwords with sums
   movhlps xmm1, xmm0    ; move high dword from xmm0 to low xmm1
   por xmm0, xmm1        ; combine
   movd ecx, xmm0        ; if ecx set, values were found
@@ -76,10 +76,8 @@
   mov _EDX, prm1           ; coeff[]
   xor _EAX, _EAX           ; cbp = 0

-  PUSH_XMM6_XMM7
-
-  movdqu xmm7, [ignore_dc] ; mask to ignore dc value
-  pxor xmm6, xmm6          ; zero
+  movdqu xmm3, [ignore_dc] ; mask to ignore dc value
+  pxor xmm2, xmm2          ; zero

   LOOP_SSE2 0, _EDX
   jz .blk2
@@ -112,7 +110,6 @@

 .finished:

-  POP_XMM6_XMM7
   ret
 ENDFUNC

Index: src/image/x86_asm/gmc_mmx.asm
===================================================================
RCS file: /xvid/xvidcore/src/image/x86_asm/gmc_mmx.asm,v
retrieving revision 1.8
diff -u -r1.8 gmc_mmx.asm
--- src/image/x86_asm/gmc_mmx.asm 1 Dec 2008 14:45:45 -0000 1.8
+++ src/image/x86_asm/gmc_mmx.asm 2 Dec 2008 07:41:36 -0000
@@ -200,8 +200,6 @@

 align SECTION_ALIGN
 xvid_GMC_Core_Lin_8_sse41:
-  PUSH_XMM6_XMM7
-
   mov  _EAX, prm2  ; Offsets
   mov  TMP0, prm3  ; Src0
   mov  TMP1, prm4  ; BpS
@@ -218,7 +216,6 @@
   packuswb xmm5, xmm5
   movq [_EAX], xmm5

-  POP_XMM6_XMM7
   ret
 ENDFUNC

Index: src/image/x86_asm/postprocessing_sse2.asm
===================================================================
RCS file: /xvid/xvidcore/src/image/x86_asm/postprocessing_sse2.asm,v
retrieving revision 1.12
diff -u -r1.12 postprocessing_sse2.asm
--- src/image/x86_asm/postprocessing_sse2.asm 1 Dec 2008 15:00:44 -0000 1.12
+++ src/image/x86_asm/postprocessing_sse2.asm 2 Dec 2008 07:45:17 -0000
@@ -66,7 +66,6 @@

 ALIGN SECTION_ALIGN
 image_brightness_sse2:
-  PUSH_XMM6_XMM7
 %ifdef ARCH_IS_X86_64
   movsx _EAX, prm5d
 %else
@@ -79,14 +78,14 @@
   push _EDI    ; 8 bytes offset for push
   sub _ESP, 32 ; 32 bytes for local data (16bytes will be used, 16bytes more to align correctly mod 16)

-  movdqa xmm6, [xmm_0x80]
+  movdqa xmm2, [xmm_0x80]

   ; Create a offset...offset vector
   mov _ESI, _ESP          ; TMP1 will be esp aligned mod 16
   add _ESI, 15            ; TMP1 = esp + 15
   and _ESI, ~15           ; TMP1 = (esp + 15)&(~15)
   CREATE_OFFSET_VECTOR _ESI, al
-  movdqa xmm7, [_ESI]
+  movdqa xmm3, [_ESI]

 %ifdef ARCH_IS_X86_64
   mov _ESI, prm3
@@ -103,12 +102,12 @@
   movdqa xmm0, [TMP1 + _EAX]
   movdqa xmm1, [TMP1 + _EAX + 16] ; xmm0 = [dst]

-  paddb xmm0, xmm6              ; unsigned -> signed domain
-  paddb xmm1, xmm6
-  paddsb xmm0, xmm7
-  paddsb xmm1, xmm7             ; xmm0 += offset
-  psubb xmm0, xmm6
-  psubb xmm1, xmm6              ; signed -> unsigned domain
+  paddb xmm0, xmm2              ; unsigned -> signed domain
+  paddb xmm1, xmm2
+  paddsb xmm0, xmm3
+  paddsb xmm1, xmm3             ; xmm0 += offset
+  psubb xmm0, xmm2
+  psubb xmm1, xmm2              ; signed -> unsigned domain

   movdqa [TMP1 + _EAX], xmm0
   movdqa [TMP1 + _EAX + 16], xmm1 ; [dst] = xmm0
@@ -125,7 +124,6 @@
   pop _EDI
   pop _ESI

-  POP_XMM6_XMM7
   ret
 ENDFUNC
 ;//////////////////////////////////////////////////////////////////////
Index: src/motion/x86_asm/sad_sse2.asm
===================================================================
RCS file: /xvid/xvidcore/src/motion/x86_asm/sad_sse2.asm,v
retrieving revision 1.17
diff -u -r1.17 sad_sse2.asm
--- src/motion/x86_asm/sad_sse2.asm 1 Dec 2008 14:45:45 -0000 1.17
+++ src/motion/x86_asm/sad_sse2.asm 2 Dec 2008 07:56:52 -0000
@@ -63,18 +63,17 @@
   movdqa  xmm3, [_EAX+TMP0]
   lea _EAX,[_EAX+2*TMP0]
   psadbw  xmm0, xmm2
-  paddusw xmm6,xmm0
+  paddusw xmm4,xmm0
   psadbw  xmm1, xmm3
-  paddusw xmm6,xmm1
+  paddusw xmm4,xmm1
 %endmacro

 %macro SAD16_SSE2_SSE3 1
-  PUSH_XMM6_XMM7
   mov _EAX, prm1 ; cur (assumed aligned)
   mov TMP1, prm2 ; ref
   mov TMP0, prm3 ; stride

-  pxor xmm6, xmm6 ; accum
+  pxor xmm4, xmm4 ; accum

   SAD_16x16_SSE2 %1
   SAD_16x16_SSE2 %1
@@ -85,11 +84,10 @@
   SAD_16x16_SSE2 %1
   SAD_16x16_SSE2 %1

-  pshufd  xmm5, xmm6, 00000010b
-  paddusw xmm6, xmm5
-  pextrw  eax, xmm6, 0
+  pshufd  xmm5, xmm4, 00000010b
+  paddusw xmm4, xmm5
+  pextrw  eax, xmm4, 0

-  POP_XMM6_XMM7
   ret
 %endmacro

@@ -113,20 +111,19 @@
   %1 xmm0, [_EAX]
   %1 xmm1, [_EAX+TMP0]
   lea _EAX, [_EAX+2*TMP0]    ; + 2*stride
-  psadbw xmm0, xmm7
-  paddusw xmm6, xmm0
-  psadbw xmm1, xmm7
-  paddusw xmm6, xmm1
+  psadbw xmm0, xmm5
+  paddusw xmm4, xmm0
+  psadbw xmm1, xmm5
+  paddusw xmm4, xmm1
 %endmacro


 %macro MEAN16_SSE2_SSE3 1
-  PUSH_XMM6_XMM7
   mov _EAX, prm1   ; src
   mov TMP0, prm2   ; stride

-  pxor xmm6, xmm6     ; accum
-  pxor xmm7, xmm7     ; zero
+  pxor xmm4, xmm4     ; accum
+  pxor xmm5, xmm5     ; zero

   MEAN_16x16_SSE2 %1
   MEAN_16x16_SSE2 %1
@@ -140,13 +137,13 @@

   mov _EAX, prm1       ; src again

-  pshufd   xmm7, xmm6, 10b
-  paddusw  xmm7, xmm6
-  pxor     xmm6, xmm6     ; zero accum
-  psrlw    xmm7, 8        ; => Mean
-  pshuflw  xmm7, xmm7, 0  ; replicate Mean
-  packuswb xmm7, xmm7
-  pshufd   xmm7, xmm7, 00000000b
+  pshufd   xmm5, xmm4, 10b
+  paddusw  xmm5, xmm4
+  pxor     xmm4, xmm4     ; zero accum
+  psrlw    xmm5, 8        ; => Mean
+  pshuflw  xmm5, xmm5, 0  ; replicate Mean
+  packuswb xmm5, xmm5
+  pshufd   xmm5, xmm5, 00000000b

   MEAN_16x16_SSE2 %1
   MEAN_16x16_SSE2 %1
@@ -158,11 +155,10 @@
   MEAN_16x16_SSE2 %1
   MEAN_16x16_SSE2 %1

-  pshufd   xmm7, xmm6, 10b
-  paddusw  xmm7, xmm6
-  pextrw eax, xmm7, 0
+  pshufd   xmm5, xmm4, 10b
+  paddusw  xmm5, xmm4
+  pextrw eax, xmm5, 0

-  POP_XMM6_XMM7
   ret
 %endmacro

Index: src/plugins/x86_asm/plugin_ssim-a.asm
===================================================================
RCS file: /xvid/xvidcore/src/plugins/x86_asm/plugin_ssim-a.asm,v
retrieving revision 1.10
diff -u -r1.10 plugin_ssim-a.asm
--- src/plugins/x86_asm/plugin_ssim-a.asm 1 Dec 2008 14:45:46 -0000 1.10
+++ src/plugins/x86_asm/plugin_ssim-a.asm 2 Dec 2008 07:53:45 -0000
@@ -169,14 +169,6 @@
  pxor xmm6,xmm6;devc
  pxor xmm7,xmm7;corr

- ;broadcast lumo/c
- punpcklbw xmm6,xmm6
- punpcklwd xmm6,xmm6
- pshufd xmm6,xmm6,00000000b;or shufps
- punpcklbw xmm7,xmm7
- punpcklwd xmm7,xmm7
- pshufd xmm7,xmm7,00000000b
-
  CONSIM_1x8_SSE2
  add TMP0,_EAX
  add TMP1,_EAX
Index: src/quant/x86_asm/quantize_h263_mmx.asm
===================================================================
RCS file: /xvid/xvidcore/src/quant/x86_asm/quantize_h263_mmx.asm,v
retrieving revision 1.12
diff -u -r1.12 quantize_h263_mmx.asm
--- src/quant/x86_asm/quantize_h263_mmx.asm 1 Dec 2008 14:45:46 -0000 1.12
+++ src/quant/x86_asm/quantize_h263_mmx.asm 2 Dec 2008 08:34:44 -0000
@@ -247,7 +247,7 @@

 ALIGN SECTION_ALIGN
 quant_h263_intra_sse2:
-
+  PUSH_XMM6_XMM7
   mov _EAX, prm2     ; data

   movsx _EAX, word [_EAX]      ; data[0]
@@ -363,7 +363,7 @@
   mov TMP1, prm1     ; coeff
   mov [TMP1],ax
   xor _EAX,_EAX            ; return 0
-
+  POP_XMM6_XMM7
   ret
 ENDFUNC

@@ -491,8 +491,7 @@
   pxor xmm5, xmm5                           ; sum

   lea TMP0, [mmx_sub]
-  movq mm0, [TMP0 + _EAX*8 - 8]             ; sub
-  movq2dq xmm6, mm0                         ; load into low 8 bytes
+  movq xmm6, [TMP0 + _EAX*8 - 8]             ; sub
   movlhps xmm6, xmm6                        ; duplicate into high 8 bytes

   cmp al, 1
@@ -500,12 +499,11 @@

 .qes2_not1:
   lea TMP0, [mmx_div]
-  movq mm0, [TMP0 + _EAX*8 - 8]          ; divider
+  movq xmm7, [TMP0 + _EAX*8 - 8]          ; divider

   xor TMP0, TMP0
   mov _EAX, prm2      ; data

-  movq2dq xmm7, mm0
   movlhps xmm7, xmm7

 ALIGN SECTION_ALIGN
@@ -538,7 +536,7 @@
   jnz .qes2_loop

 .qes2_done:
-  movdqu xmm6, [plus_one]
+  movdqa xmm6, [plus_one]
   pmaddwd xmm5, xmm6
   movhlps xmm6, xmm5
   paddd xmm5, xmm6
Index: src/utils/emms.h
===================================================================
RCS file: /xvid/xvidcore/src/utils/emms.h,v
retrieving revision 1.16
diff -u -r1.16 emms.h
--- src/utils/emms.h 5 Jan 2005 23:02:15 -0000 1.16
+++ src/utils/emms.h 2 Dec 2008 08:17:53 -0000
@@ -57,6 +57,11 @@
 extern void sse2_os_trigger(void);
 #endif

+#if defined(ARCH_IS_X86_64) && defined(WIN32)
+extern void prime_xmm(void*);
+extern void get_xmm(void*);
+#endif
+
 #ifdef ARCH_IS_PPC
 extern void altivec_trigger(void);
 #endif
Index: src/utils/x86_asm/cpuid.asm
===================================================================
RCS file: /xvid/xvidcore/src/utils/x86_asm/cpuid.asm,v
retrieving revision 1.15
diff -u -r1.15 cpuid.asm
--- src/utils/x86_asm/cpuid.asm 26 Nov 2008 01:04:34 -0000 1.15
+++ src/utils/x86_asm/cpuid.asm 2 Dec 2008 08:17:36 -0000
@@ -221,6 +221,33 @@
   ret
 ENDFUNC

+%ifdef ARCH_IS_X86_64
+%ifdef WINDOWS
+;-----------------------------------------------------------------------------
+; void prime_xmm(void *src)
+; Test helper: loads xmm6 from [src] and xmm7 from [src+16].
+; movdqa requires src to be 16-byte aligned.
+;-----------------------------------------------------------------------------
+cglobal prime_xmm
+prime_xmm:
+  movdqa xmm6, [prm1]
+  movdqa xmm7, [prm1+16]
+  ret
+ENDFUNC
+
+;-----------------------------------------------------------------------------
+; void get_xmm(void *dst)
+; Test helper: stores xmm6 to [dst] and xmm7 to [dst+16].
+; movdqa requires dst to be 16-byte aligned.
+;-----------------------------------------------------------------------------
+cglobal get_xmm
+get_xmm:
+  movdqa [prm1], xmm6
+  movdqa [prm1+16], xmm7
+  ret
+ENDFUNC
+%endif
+%endif


 %ifidn __OUTPUT_FORMAT__,elf




More information about the Xvid-devel mailing list