[XviD-devel] Xvid 1.2.0 is available!
Andrew Dunstan
a_dunstan at hotmail.com
Tue Dec 2 11:17:47 CET 2008
*kicks mail client*
Let's try that again.
Index: examples/xvid_bench.c
===================================================================
RCS file: /xvid/xvidcore/examples/xvid_bench.c,v
retrieving revision 1.38
diff -u -r1.38 xvid_bench.c
--- examples/xvid_bench.c 26 Nov 2008 23:37:28 -0000 1.38
+++ examples/xvid_bench.c 2 Dec 2008 08:20:57 -0000
@@ -2205,6 +2205,13 @@
int width, height;
uint32_t chksum = 0;
const char * test_bitstream = 0;
+#if defined(WIN32) && defined(ARCH_IS_X86_64)
+ DECLARE_ALIGNED_MATRIX(xmm_save, 2, 4, uint64_t, 16);
+ // assumes xmm6 and xmm7 won't be falsely preserved by C code
+ for(c=0;c<4;c++)
+ xmm_save[c] = __rdtsc();
+ prime_xmm(xmm_save);
+#endif
cpu_mask = 0; // default => will use autodectect
for(c=1; c<argc; ++c)
@@ -2284,6 +2291,20 @@
if (what==-2)
test_quant_bug();
+#if defined(WIN32) && defined(ARCH_IS_X86_64)
+ get_xmm(xmm_save+4);
+ if (memcmp(xmm_save, xmm_save+4, 4*sizeof(int64_t))) {
+ printf("\nWIN64 ERROR: XMM6 and XMM7 contents not preserved!\n"
+ " XMM6 XMM7\n"
+ "Before: %.16I64X%.16I64X %.16I64X%.16I64X\n"
+ "After: %.16I64X%.16I64X %.16I64X%.16I64X",
+ xmm_save[0],xmm_save[1],xmm_save[2],xmm_save[3],
+ xmm_save[4],xmm_save[5],xmm_save[6],xmm_save[7]);
+ } else {
+ printf("\nWIN64: XMM6 and XMM7 contents preserved correctly.\n");
+ }
+#endif
+
if ((what >= 0 && what <= 6) || what == 10) {
printf("\n\n"
"NB: If a function isn't optimised for a specific set of
intructions,\n"
Index: src/bitstream/x86_asm/cbp_sse2.asm
===================================================================
RCS file: /xvid/xvidcore/src/bitstream/x86_asm/cbp_sse2.asm,v
retrieving revision 1.11
diff -u -r1.11 cbp_sse2.asm
--- src/bitstream/x86_asm/cbp_sse2.asm 1 Dec 2008 14:45:45 -0000 1.11
+++ src/bitstream/x86_asm/cbp_sse2.asm 2 Dec 2008 07:29:05 -0000
@@ -32,7 +32,7 @@
%macro LOOP_SSE2 2
movdqa xmm0, [%2+(%1)*128]
- pand xmm0, xmm7
+ pand xmm0, xmm3
movdqa xmm1, [%2+(%1)*128+16]
por xmm0, [%2+(%1)*128+32]
@@ -43,7 +43,7 @@
por xmm1, [%2+(%1)*128+112]
por xmm0, xmm1 ; xmm0 = xmm1 = 128 bits worth of info
- psadbw xmm0, xmm6 ; contains 2 dwords with sums
+ psadbw xmm0, xmm2 ; contains 2 dwords with sums
movhlps xmm1, xmm0 ; move high dword from xmm0 to low xmm1
por xmm0, xmm1 ; combine
movd ecx, xmm0 ; if ecx set, values were found
@@ -76,10 +76,8 @@
mov _EDX, prm1 ; coeff[]
xor _EAX, _EAX ; cbp = 0
- PUSH_XMM6_XMM7
-
- movdqu xmm7, [ignore_dc] ; mask to ignore dc value
- pxor xmm6, xmm6 ; zero
+ movdqu xmm3, [ignore_dc] ; mask to ignore dc value
+ pxor xmm2, xmm2 ; zero
LOOP_SSE2 0, _EDX
jz .blk2
@@ -112,7 +110,6 @@
.finished:
- POP_XMM6_XMM7
ret
ENDFUNC
Index: src/image/x86_asm/gmc_mmx.asm
===================================================================
RCS file: /xvid/xvidcore/src/image/x86_asm/gmc_mmx.asm,v
retrieving revision 1.8
diff -u -r1.8 gmc_mmx.asm
--- src/image/x86_asm/gmc_mmx.asm 1 Dec 2008 14:45:45 -0000 1.8
+++ src/image/x86_asm/gmc_mmx.asm 2 Dec 2008 07:41:36 -0000
@@ -200,8 +200,6 @@
align SECTION_ALIGN
xvid_GMC_Core_Lin_8_sse41:
- PUSH_XMM6_XMM7
-
mov _EAX, prm2 ; Offsets
mov TMP0, prm3 ; Src0
mov TMP1, prm4 ; BpS
@@ -218,7 +216,6 @@
packuswb xmm5, xmm5
movq [_EAX], xmm5
- POP_XMM6_XMM7
ret
ENDFUNC
Index: src/image/x86_asm/postprocessing_sse2.asm
===================================================================
RCS file: /xvid/xvidcore/src/image/x86_asm/postprocessing_sse2.asm,v
retrieving revision 1.12
diff -u -r1.12 postprocessing_sse2.asm
--- src/image/x86_asm/postprocessing_sse2.asm 1 Dec 2008 15:00:44 -0000 1.12
+++ src/image/x86_asm/postprocessing_sse2.asm 2 Dec 2008 07:45:17 -0000
@@ -66,7 +66,6 @@
ALIGN SECTION_ALIGN
image_brightness_sse2:
- PUSH_XMM6_XMM7
%ifdef ARCH_IS_X86_64
movsx _EAX, prm5d
%else
@@ -79,14 +78,14 @@
push _EDI ; 8 bytes offset for push
sub _ESP, 32 ; 32 bytes for local data (16bytes will be used, 16bytes
more to align correctly mod 16)
- movdqa xmm6, [xmm_0x80]
+ movdqa xmm2, [xmm_0x80]
; Create a offset...offset vector
mov _ESI, _ESP ; TMP1 will be esp aligned mod 16
add _ESI, 15 ; TMP1 = esp + 15
and _ESI, ~15 ; TMP1 = (esp + 15)&(~15)
CREATE_OFFSET_VECTOR _ESI, al
- movdqa xmm7, [_ESI]
+ movdqa xmm3, [_ESI]
%ifdef ARCH_IS_X86_64
mov _ESI, prm3
@@ -103,12 +102,12 @@
movdqa xmm0, [TMP1 + _EAX]
movdqa xmm1, [TMP1 + _EAX + 16] ; xmm0 = [dst]
- paddb xmm0, xmm6 ; unsigned -> signed domain
- paddb xmm1, xmm6
- paddsb xmm0, xmm7
- paddsb xmm1, xmm7 ; xmm0 += offset
- psubb xmm0, xmm6
- psubb xmm1, xmm6 ; signed -> unsigned domain
+ paddb xmm0, xmm2 ; unsigned -> signed domain
+ paddb xmm1, xmm2
+ paddsb xmm0, xmm3
+ paddsb xmm1, xmm3 ; xmm0 += offset
+ psubb xmm0, xmm2
+ psubb xmm1, xmm2 ; signed -> unsigned domain
movdqa [TMP1 + _EAX], xmm0
movdqa [TMP1 + _EAX + 16], xmm1 ; [dst] = xmm0
@@ -125,7 +124,6 @@
pop _EDI
pop _ESI
- POP_XMM6_XMM7
ret
ENDFUNC
;//////////////////////////////////////////////////////////////////////
Index: src/motion/x86_asm/sad_sse2.asm
===================================================================
RCS file: /xvid/xvidcore/src/motion/x86_asm/sad_sse2.asm,v
retrieving revision 1.17
diff -u -r1.17 sad_sse2.asm
--- src/motion/x86_asm/sad_sse2.asm 1 Dec 2008 14:45:45 -0000 1.17
+++ src/motion/x86_asm/sad_sse2.asm 2 Dec 2008 07:56:52 -0000
@@ -63,18 +63,17 @@
movdqa xmm3, [_EAX+TMP0]
lea _EAX,[_EAX+2*TMP0]
psadbw xmm0, xmm2
- paddusw xmm6,xmm0
+ paddusw xmm4,xmm0
psadbw xmm1, xmm3
- paddusw xmm6,xmm1
+ paddusw xmm4,xmm1
%endmacro
%macro SAD16_SSE2_SSE3 1
- PUSH_XMM6_XMM7
mov _EAX, prm1 ; cur (assumed aligned)
mov TMP1, prm2 ; ref
mov TMP0, prm3 ; stride
- pxor xmm6, xmm6 ; accum
+ pxor xmm4, xmm4 ; accum
SAD_16x16_SSE2 %1
SAD_16x16_SSE2 %1
@@ -85,11 +84,10 @@
SAD_16x16_SSE2 %1
SAD_16x16_SSE2 %1
- pshufd xmm5, xmm6, 00000010b
- paddusw xmm6, xmm5
- pextrw eax, xmm6, 0
+ pshufd xmm5, xmm4, 00000010b
+ paddusw xmm4, xmm5
+ pextrw eax, xmm4, 0
- POP_XMM6_XMM7
ret
%endmacro
@@ -113,20 +111,19 @@
%1 xmm0, [_EAX]
%1 xmm1, [_EAX+TMP0]
lea _EAX, [_EAX+2*TMP0] ; + 2*stride
- psadbw xmm0, xmm7
- paddusw xmm6, xmm0
- psadbw xmm1, xmm7
- paddusw xmm6, xmm1
+ psadbw xmm0, xmm5
+ paddusw xmm4, xmm0
+ psadbw xmm1, xmm5
+ paddusw xmm4, xmm1
%endmacro
%macro MEAN16_SSE2_SSE3 1
- PUSH_XMM6_XMM7
mov _EAX, prm1 ; src
mov TMP0, prm2 ; stride
- pxor xmm6, xmm6 ; accum
- pxor xmm7, xmm7 ; zero
+ pxor xmm4, xmm4 ; accum
+ pxor xmm5, xmm5 ; zero
MEAN_16x16_SSE2 %1
MEAN_16x16_SSE2 %1
@@ -140,13 +137,13 @@
mov _EAX, prm1 ; src again
- pshufd xmm7, xmm6, 10b
- paddusw xmm7, xmm6
- pxor xmm6, xmm6 ; zero accum
- psrlw xmm7, 8 ; => Mean
- pshuflw xmm7, xmm7, 0 ; replicate Mean
- packuswb xmm7, xmm7
- pshufd xmm7, xmm7, 00000000b
+ pshufd xmm5, xmm4, 10b
+ paddusw xmm5, xmm4
+ pxor xmm4, xmm4 ; zero accum
+ psrlw xmm5, 8 ; => Mean
+ pshuflw xmm5, xmm5, 0 ; replicate Mean
+ packuswb xmm5, xmm5
+ pshufd xmm5, xmm5, 00000000b
MEAN_16x16_SSE2 %1
MEAN_16x16_SSE2 %1
@@ -158,11 +155,10 @@
MEAN_16x16_SSE2 %1
MEAN_16x16_SSE2 %1
- pshufd xmm7, xmm6, 10b
- paddusw xmm7, xmm6
- pextrw eax, xmm7, 0
+ pshufd xmm5, xmm4, 10b
+ paddusw xmm5, xmm4
+ pextrw eax, xmm5, 0
- POP_XMM6_XMM7
ret
%endmacro
Index: src/plugins/x86_asm/plugin_ssim-a.asm
===================================================================
RCS file: /xvid/xvidcore/src/plugins/x86_asm/plugin_ssim-a.asm,v
retrieving revision 1.10
diff -u -r1.10 plugin_ssim-a.asm
--- src/plugins/x86_asm/plugin_ssim-a.asm 1 Dec 2008 14:45:46 -0000 1.10
+++ src/plugins/x86_asm/plugin_ssim-a.asm 2 Dec 2008 07:53:45 -0000
@@ -169,14 +169,6 @@
pxor xmm6,xmm6;devc
pxor xmm7,xmm7;corr
- ;broadcast lumo/c
- punpcklbw xmm6,xmm6
- punpcklwd xmm6,xmm6
- pshufd xmm6,xmm6,00000000b;or shufps
- punpcklbw xmm7,xmm7
- punpcklwd xmm7,xmm7
- pshufd xmm7,xmm7,00000000b
-
CONSIM_1x8_SSE2
add TMP0,_EAX
add TMP1,_EAX
Index: src/quant/x86_asm/quantize_h263_mmx.asm
===================================================================
RCS file: /xvid/xvidcore/src/quant/x86_asm/quantize_h263_mmx.asm,v
retrieving revision 1.12
diff -u -r1.12 quantize_h263_mmx.asm
--- src/quant/x86_asm/quantize_h263_mmx.asm 1 Dec 2008 14:45:46 -0000 1.12
+++ src/quant/x86_asm/quantize_h263_mmx.asm 2 Dec 2008 08:34:44 -0000
@@ -247,7 +247,7 @@
ALIGN SECTION_ALIGN
quant_h263_intra_sse2:
-
+ PUSH_XMM6_XMM7
mov _EAX, prm2 ; data
movsx _EAX, word [_EAX] ; data[0]
@@ -363,7 +363,7 @@
mov TMP1, prm1 ; coeff
mov [TMP1],ax
xor _EAX,_EAX ; return 0
-
+ POP_XMM6_XMM7
ret
ENDFUNC
@@ -491,8 +491,7 @@
pxor xmm5, xmm5 ; sum
lea TMP0, [mmx_sub]
- movq mm0, [TMP0 + _EAX*8 - 8] ; sub
- movq2dq xmm6, mm0 ; load into low 8 bytes
+ movq xmm6, [TMP0 + _EAX*8 - 8] ; sub
movlhps xmm6, xmm6 ; duplicate into high 8 bytes
cmp al, 1
@@ -500,12 +499,11 @@
.qes2_not1:
lea TMP0, [mmx_div]
- movq mm0, [TMP0 + _EAX*8 - 8] ; divider
+ movq xmm7, [TMP0 + _EAX*8 - 8] ; divider
xor TMP0, TMP0
mov _EAX, prm2 ; data
- movq2dq xmm7, mm0
movlhps xmm7, xmm7
ALIGN SECTION_ALIGN
@@ -538,7 +536,7 @@
jnz .qes2_loop
.qes2_done:
- movdqu xmm6, [plus_one]
+ movdqa xmm6, [plus_one]
pmaddwd xmm5, xmm6
movhlps xmm6, xmm5
paddd xmm5, xmm6
Index: src/utils/emms.h
===================================================================
RCS file: /xvid/xvidcore/src/utils/emms.h,v
retrieving revision 1.16
diff -u -r1.16 emms.h
--- src/utils/emms.h 5 Jan 2005 23:02:15 -0000 1.16
+++ src/utils/emms.h 2 Dec 2008 08:17:53 -0000
@@ -57,6 +57,11 @@
extern void sse2_os_trigger(void);
#endif
+#if defined(ARCH_IS_X86_64) && defined(WIN32)
+extern void prime_xmm(void*);
+extern void get_xmm(void*);
+#endif
+
#ifdef ARCH_IS_PPC
extern void altivec_trigger(void);
#endif
Index: src/utils/x86_asm/cpuid.asm
===================================================================
RCS file: /xvid/xvidcore/src/utils/x86_asm/cpuid.asm,v
retrieving revision 1.15
diff -u -r1.15 cpuid.asm
--- src/utils/x86_asm/cpuid.asm 26 Nov 2008 01:04:34 -0000 1.15
+++ src/utils/x86_asm/cpuid.asm 2 Dec 2008 08:17:36 -0000
@@ -221,6 +221,22 @@
ret
ENDFUNC
+%ifdef ARCH_IS_X86_64
+%ifdef WINDOWS
+cglobal prime_xmm
+prime_xmm:
+ movdqa xmm6, [prm1]
+ movdqa xmm7, [prm1+16]
+ ret
+ENDFUNC
+
+cglobal get_xmm
+get_xmm:
+ movdqa [prm1], xmm6
+ movdqa [prm1+16], xmm7
+ ret
+%endif
+%endif
%ifidn __OUTPUT_FORMAT__,elf
More information about the Xvid-devel mailing list