[XviD-devel] New Trellis Quant
Christoph Lampert
chl at math.uni-bonn.de
Tue May 13 15:55:43 CEST 2003
On Mon, 12 May 2003, James Bilotto wrote:
> On Mon, May 12, 2003 at 11:27:34AM +0200, Christoph Lampert wrote:
> > P.S. Okay, maybe the main advantage would be that handwritten bzero/memset
> > could be inlined. After all, this routine is called 6 to 12 times per
> > macroblock.
> >
> Yes, those speeds are about the same as what I get, and now I find that in
> FreeBSD at least memset() is done with bzero() & bcopy(). Could I see the code
> you used for the test?
It's just a plain framework for measuring the speed of small routines, but
sure, here it is.
I guess I have modified it a couple more times since then.
On gcc you have to compile with -O2 instead of -O3, otherwise the unrolled
routines are simply removed (or called just once).
gruel
-------------- next part --------------
/* -*- c-file-style: "linux" -*- */
/* memcpy speed benchmark using different i86-specific routines.
*
* Framework (C) 2001 by Martin Pool <mbp at samba.org>, based on speed.c
* by tridge.
*
* used to be for measuring memcpy speed
* modified a couple of times by Christoph Lampert <gruel(at)gmx.de>
* this time for memset(p,0,128);
*
*/
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/time.h>
#define MAX(a,b) ((a)>(b)?(a):(b))
#define MIN(a,b) ((a)<(b)?(a):(b))
#include <sys/resource.h>
struct rusage tp1,tp2;
/*
 * Take the "start" snapshot for a measurement: store the calling
 * process's current resource usage in the global tp1.
 */
static void start_timer()
{
	(void) getrusage(RUSAGE_SELF, &tp1);
}
/*
 * Take the "end" snapshot into the global tp2 and return the user CPU
 * time (ru_utime) that passed since the snapshot held in tp1, in
 * microseconds.
 */
static long end_timer()
{
	long elapsed_us;

	getrusage(RUSAGE_SELF,&tp2);
#if 0
	printf ("tp1 = %ld.%05ld, tp2 = %ld.%05ld\n",
		(long) tp1.ru_utime.tv_sec, (long) tp1.ru_utime.tv_usec,
		(long) tp2.ru_utime.tv_sec, (long) tp2.ru_utime.tv_usec);
#endif
	/* whole seconds first, then the sub-second remainder */
	elapsed_us = (tp2.ru_utime.tv_sec - tp1.ru_utime.tv_sec) * 1000000;
	elapsed_us += tp2.ru_utime.tv_usec - tp1.ru_utime.tv_usec;
	return elapsed_us;
}
/*
* By Ingo Molnar and Doug Ledford; hacked up to remove
* kernel-specific stuff like saving/restoring float registers.
*
* http://people.redhat.com/mingo/mmx-patches/mmx-2.3.99-A0 */
/*
 * Print one result line: label, elapsed time in seconds and throughput.
 *
 *   msg    label of the routine that was measured
 *   loops  number of iterations that were run (accepted but not printed)
 *   bytes  total number of bytes processed during the measurement
 *   t      elapsed time in microseconds
 *
 * Throughput is bytes per second divided by 1024*1024.  When t is not
 * positive (the run was shorter than the timer resolution) no
 * throughput can be computed, so a placeholder is printed instead of
 * dividing by zero.
 */
static void print_time (char const *msg,
			long long loops,
			long long bytes,
			long t)
{
	(void) loops;	/* kept for interface compatibility only */

	if (t <= 0) {
		printf(" %-50s %ld.%06ld s (too fast to measure)\n",
		       msg, 0L, 0L);
		return;
	}

	/* the throughput expression has type long long, hence %lld */
	printf(" %-50s %ld.%06ld s %lld MB/s\n", msg, t / 1000000,
	       t % 1000000, (1000000 * bytes) / t / (1024 * 1024));
}
/*
 * Time `loops` calls of the C library memset(p, 0, size) and print the
 * elapsed time and throughput.
 *
 *   p      start of a writable buffer of at least `size` bytes
 *   size   number of bytes cleared per call
 *   loops  number of calls to time
 */
static void benchmark_memset (char *p, size_t size, long loops)
{
	long i;		/* same type as `loops`, so it cannot overflow first */
	long t;

	start_timer();
	for (i = 0; i < loops; i++) {
		memset(p, 0x00, size);
	}
	t = end_timer();

	/* widen before multiplying so the byte total cannot wrap at 32 bits */
	print_time("glibc memset()", loops, (long long) size * loops, t);
}
/*
 * Time `loops` calls of the C library bzero(p, size) and print the
 * elapsed time and throughput.
 *
 *   p      start of a writable buffer of at least `size` bytes
 *   size   number of bytes cleared per call
 *   loops  number of calls to time
 */
static void benchmark_bzero (char *p, size_t size, long loops)
{
	long i;		/* same type as `loops`, so it cannot overflow first */
	long t;

	start_timer();
	for (i = 0; i < loops; i++) {
		bzero(p, size);
	}
	t = end_timer();

	/* widen before multiplying so the byte total cannot wrap at 32 bits */
	print_time("glibc bzero()", loops, (long long) size * loops, t);
}
/*
 * Clear the 128 bytes starting at p with sixteen 8-byte MMX stores.
 *
 * The routine uses register mm0 and does not execute `emms`; code that
 * uses x87 floating point afterwards has to issue `emms` first.
 */
static inline void mmx_memset (char *p)
{
	__asm__ __volatile__ (
		"pxor %%mm0, %%mm0\n\t"		/* mm0 = 0 */
		"movq %%mm0, (%0)\n\t"
		"movq %%mm0, 8(%0)\n\t"
		"movq %%mm0, 16(%0)\n\t"
		"movq %%mm0, 24(%0)\n\t"
		"movq %%mm0, 32(%0)\n\t"
		"movq %%mm0, 40(%0)\n\t"
		"movq %%mm0, 48(%0)\n\t"
		"movq %%mm0, 56(%0)\n\t"
		"movq %%mm0, 64(%0)\n\t"
		"movq %%mm0, 72(%0)\n\t"
		"movq %%mm0, 80(%0)\n\t"
		"movq %%mm0, 88(%0)\n\t"
		"movq %%mm0, 96(%0)\n\t"
		"movq %%mm0, 104(%0)\n\t"
		"movq %%mm0, 112(%0)\n\t"
		"movq %%mm0, 120(%0)\n\t"
		: /* no outputs */
		: "r" (p)
		/* tell the compiler what the asm modifies behind its back */
		: "memory", "mm0");
}
/*
 * Time `loops` calls of mmx_memset(p) and print the elapsed time and
 * throughput.
 *
 *   p      start of a writable buffer; mmx_memset() is called with p only
 *   size   byte count used for the throughput figure; it is not passed
 *          to mmx_memset()
 *   loops  number of calls to time
 */
static void benchmark_mmx (char *p,
			   size_t size,
			   long loops)
{
	long i;		/* same type as `loops`, so it cannot overflow first */
	long t;

	start_timer();
	for (i = 0; i < loops; i++) {
		mmx_memset(p);
	}
	/* leave MMX state so that later x87 floating-point code works */
	__asm__ __volatile__ ("emms");
	t = end_timer();

	/* widen before multiplying so the byte total cannot wrap at 32 bits */
	print_time("MMX", loops, (long long) size * loops, t);
}
/*
 * Clear a block by storing 0 into sixteen consecutive doubles starting
 * at p.  With sizeof(double) == 8 that is 16 * 8 = 128 bytes.
 */
static void double_memset (char *p)
{
	double *slot = (double *) p;

	slot[0] = 0.0;
	slot[1] = 0.0;
	slot[2] = 0.0;
	slot[3] = 0.0;
	slot[4] = 0.0;
	slot[5] = 0.0;
	slot[6] = 0.0;
	slot[7] = 0.0;
	slot[8] = 0.0;
	slot[9] = 0.0;
	slot[10] = 0.0;
	slot[11] = 0.0;
	slot[12] = 0.0;
	slot[13] = 0.0;
	slot[14] = 0.0;
	slot[15] = 0.0;
}
/*
 * Clear a 128-byte block starting at p using long double stores.
 *
 * The number of stores depends on sizeof(long double), so the routine
 * never writes past the 128-byte block:
 *   16 bytes:  8 elements
 *   12 bytes: 10 elements + one trailing 8-byte double (10*12 + 8)
 *    8 bytes: 16 elements
 * Other sizes are not handled.
 *
 * NOTE(review): on x87 targets a long double store may write only the
 * 10 value bytes and leave the padding bytes of each element untouched;
 * check the generated code before relying on every byte being zero.
 */
static void longdouble_memset (char *p)
{
	long double *slot = (long double *) p;

	/* eight elements fit into 128 bytes for every element size up to 16 */
	slot[0] = 0;
	slot[1] = 0;
	slot[2] = 0;
	slot[3] = 0;
	slot[4] = 0;
	slot[5] = 0;
	slot[6] = 0;
	slot[7] = 0;

	if (sizeof(long double) <= 12) {	/* ten elements still fit */
		slot[8] = 0;
		slot[9] = 0;
	}
	if (sizeof(long double) <= 8) {		/* sixteen elements fit */
		slot[10] = 0;
		slot[11] = 0;
		slot[12] = 0;
		slot[13] = 0;
		slot[14] = 0;
		slot[15] = 0;
	}
	if (128 % sizeof(long double) != 0) {
		((double *) p)[15] = 0;		/* remaining 8 bytes */
	}
}
/*
 * Clear a block by storing 0 into sixteen consecutive long longs
 * starting at p.  With sizeof(long long) == 8 that is 16 * 8 = 128 bytes.
 */
static void longlong_memset (char *p)
{
	long long *slot = (long long *) p;

	slot[0] = 0;
	slot[1] = 0;
	slot[2] = 0;
	slot[3] = 0;
	slot[4] = 0;
	slot[5] = 0;
	slot[6] = 0;
	slot[7] = 0;
	slot[8] = 0;
	slot[9] = 0;
	slot[10] = 0;
	slot[11] = 0;
	slot[12] = 0;
	slot[13] = 0;
	slot[14] = 0;
	slot[15] = 0;
}
/*
 * Clear a 128-byte block starting at p using long stores.
 *
 * With a 4-byte long this takes 32 stores, with an 8-byte long only 16.
 * The second half of the stores is therefore executed only when long is
 * 4 bytes wide; otherwise the routine would write 256 bytes and run
 * past the end of the block.
 */
static void long_memset (char *p)
{
	long *slot = (long *) p;

	/* first 16 stores: 64 bytes with 4-byte long, 128 with 8-byte long */
	slot[0] = 0;
	slot[1] = 0;
	slot[2] = 0;
	slot[3] = 0;
	slot[4] = 0;
	slot[5] = 0;
	slot[6] = 0;
	slot[7] = 0;
	slot[8] = 0;
	slot[9] = 0;
	slot[10] = 0;
	slot[11] = 0;
	slot[12] = 0;
	slot[13] = 0;
	slot[14] = 0;
	slot[15] = 0;

	if (sizeof(long) == 4) {	/* compile-time constant condition */
		slot[16] = 0;
		slot[17] = 0;
		slot[18] = 0;
		slot[19] = 0;
		slot[20] = 0;
		slot[21] = 0;
		slot[22] = 0;
		slot[23] = 0;
		slot[24] = 0;
		slot[25] = 0;
		slot[26] = 0;
		slot[27] = 0;
		slot[28] = 0;
		slot[29] = 0;
		slot[30] = 0;
		slot[31] = 0;
	}
}
/*
 * Time `loops` calls of longdouble_memset(p) and print the elapsed time
 * and throughput.
 *
 *   p      start of a writable buffer; longdouble_memset() gets p only
 *   size   byte count used for the throughput figure; it is not passed
 *          to longdouble_memset()
 *   loops  number of calls to time
 */
static void benchmark_longdouble (char *p,
				  size_t size,
				  long loops)
{
	long i;		/* same type as `loops`, so it cannot overflow first */
	long t;

	start_timer();
	for (i = 0; i < loops; i++) {
		longdouble_memset(p);
	}
	t = end_timer();

	/* widen before multiplying so the byte total cannot wrap at 32 bits */
	print_time("longdouble C", loops, (long long) size * loops, t);
}
/*
 * Time `loops` calls of double_memset(p) and print the elapsed time and
 * throughput.
 *
 *   p      start of a writable buffer; double_memset() gets p only
 *   size   byte count used for the throughput figure; it is not passed
 *          to double_memset()
 *   loops  number of calls to time
 */
static void benchmark_double (char *p,
			      size_t size,
			      long loops)
{
	long i;		/* same type as `loops`, so it cannot overflow first */
	long t;

	start_timer();
	for (i = 0; i < loops; i++) {
		double_memset(p);
	}
	t = end_timer();

	/* widen before multiplying so the byte total cannot wrap at 32 bits */
	print_time("double C", loops, (long long) size * loops, t);
}
/*
 * Time `loops` calls of longlong_memset(p) and print the elapsed time
 * and throughput.
 *
 *   p      start of a writable buffer; longlong_memset() gets p only
 *   size   byte count used for the throughput figure; it is not passed
 *          to longlong_memset()
 *   loops  number of calls to time
 */
static void benchmark_longlong (char *p,
				size_t size,
				long loops)
{
	long i;		/* same type as `loops`, so it cannot overflow first */
	long t;

	start_timer();
	for (i = 0; i < loops; i++) {
		longlong_memset(p);
	}
	t = end_timer();

	/* widen before multiplying so the byte total cannot wrap at 32 bits */
	print_time("longlong C", loops, (long long) size * loops, t);
}
/*
 * Time `loops` calls of long_memset(p) and print the elapsed time and
 * throughput.
 *
 *   p      start of a writable buffer; long_memset() gets p only
 *   size   byte count used for the throughput figure; it is not passed
 *          to long_memset()
 *   loops  number of calls to time
 */
static void benchmark_long (char *p,
			    size_t size,
			    long loops)
{
	long i;		/* same type as `loops`, so it cannot overflow first */
	long t;

	start_timer();
	for (i = 0; i < loops; i++) {
		long_memset(p);
	}
	t = end_timer();

	/* widen before multiplying so the byte total cannot wrap at 32 bits */
	print_time("long C", loops, (long long) size * loops, t);
}
/*
 * Run every clearing routine on one buffer, first at the buffer start
 * ("aligned") and then 3 bytes into it ("unaligned"), printing one
 * result line per routine.
 *
 *   size  block size in bytes; 500000000 / size iterations are timed
 *         per routine.  Must be non-zero.
 *
 * The hand-unrolled routines take no size argument, so the buffer is
 * made large enough for them even when `size` is small; their
 * throughput figure is nevertheless computed from `size`.
 */
static void memset_test(size_t size)
{
	/* block size the unrolled routines are written for */
	const size_t unrolled_bytes = 128;
	size_t block_bytes;
	long loops;
	char *p;

	if (size == 0) {
		fprintf(stderr, "memset_test: size must be non-zero\n");
		return;
	}
	loops = 500000000L / size;

	/* We need to make sure the blocks are *VERY* aligned, because
	   MMX is potentially pretty fussy.  The 64 spare bytes leave room
	   for the runs that start at p+3. */
	block_bytes = (size > unrolled_bytes) ? size : unrolled_bytes;
	p = valloc(block_bytes + 64);
	if (p == NULL) {
		perror("valloc");
		return;
	}

	/* size_t arguments need the %zu conversion */
	if (size > 2048)
		printf ("memset %zukB -- %ld loops\n", size >> 10, loops);
	else
		printf ("memset %zuB -- %ld loops\n", size, loops);

	printf (" aligned blocks\n");
	benchmark_memset (p, size, loops);
	benchmark_bzero (p, size, loops);
	benchmark_double (p, size, loops);
	benchmark_longdouble (p, size, loops);
	benchmark_longlong (p, size, loops);
	benchmark_long (p, size, loops);
	benchmark_mmx (p, size, loops);

	/* same routines again, deliberately starting at an odd address */
	printf (" unaligned blocks\n");
	benchmark_memset (p+3, size, loops);
	benchmark_bzero (p+3, size, loops);
	benchmark_double (p+3, size, loops);
	benchmark_longdouble (p+3, size, loops);
	benchmark_longlong (p+3, size, loops);
	benchmark_long (p+3, size, loops);
	benchmark_mmx (p+3, size, loops);

	free(p);
}
/* Program entry point: run the benchmark suite once for 128-byte blocks. */
int main (void)
{
	const size_t block_bytes = 128;

	memset_test(block_bytes);
	return 0;
}
/* result gcc -O2 setspeed.c
in fact, all non-MMX-routines compile to the same ASM code (setting 4bytes at a time).
memset 128B -- 3906250 loops
aligned blocks
glibc memset() 1.290000 s 369 MB/s
glibc bzero() 1.270000 s 375 MB/s
double C 0.460000 s 1036 MB/s
longdouble C 0.470000 s 1014 MB/s
longlong C 0.460000 s 1036 MB/s
long C 0.470000 s 1014 MB/s
MMX 0.170000 s 2804 MB/s
unaligned blocks
glibc memset() 1.290000 s 369 MB/s
glibc bzero() 1.260000 s 378 MB/s
double C 0.820000 s 581 MB/s
longdouble C 0.740000 s 644 MB/s
longlong C 0.810000 s 588 MB/s
long C 0.810000 s 588 MB/s
MMX 0.410000 s 1163 MB/s
*/
More information about the XviD-devel
mailing list