On Wed, 21 Mar 2001, David Malone wrote:
> On Mon, Mar 19, 2001 at 02:47:34PM +1100, Bruce Evans wrote:
> > > npx.c already has one "fix" for the overflow problem. The problem
> > > is may be that clocks don't work early any more.
> >
> > It must be that microtime() doesn't work early any more.
I checked that microtime() doesn't work for more than 10 msec if it
uses the i8254. When it doesn't work for that long, the bandwidth
test breaks down for bzero() bandwidths smaller than 100 MB/sec. Such
bandwidths are normal for Intel i586's. E.g., my P5/133 has a
generic_bzero() bandwidth of 87e6 bytes/sec and an i586_bzero()
bandwidth of 174e6 bytes/sec. This is in userland with a slightly
improved i586_bzero() (39 cycles instead of 41 for the inner loop
IIRC) and with slightly improved page coloring, and a buffer size of
1MB (same as in the bandwidth test). So, the test always breaks down
for my P5/133 if microtime() uses the i8254. OTOH, my K6-1/233 has
bandwidths of 135e6 and 127e6 bytes/sec, respectively, so the test
never breaks down for it.
> I did a quick check, and it does seem that i586_bzero can be faster
> on the k6-2. I found it was about twice as fast for large buffers.
> This was timed in userland using the TSC. With a slightly simplified
> version of i586_bzero (I removed all the kernel specific stuff and
> had it always save the floating point state on the stack). A graph
> is at:
This is surprising.
> http://www.maths.tcd.ie/~dwmalone/comp/bzero-band.ps
>
> The graph seems to peak at about 160kB/s, which seems plausable.
160kB/sec is implausible :-). 160MB/sec is plausible. Half that
is hard to understand. Why is it slower than my K6-1? Ah, I
partly understand. My K6-1 has an L2 cache size of 1MB, so the
1MB buffer size is really too small for it if write allocation
is enabled. P5's don't have write allocation, so the buffer size
for them is not critical. All K6's have write allocation IIRC.
With a buffer size of 2MB, the bandwidths for my K6-1/233 are
84e6 and 80e6 bytes/sec, respectively. So 80MB/sec is plausible
and 160MB/sec is fast (it's equivalent to 320MB/sec without
write allocation).
These complications show how hard it is to write a single bandwidth
test that works for all i586's. I think the next step (after fixing
the i586 functions) should be to reduce the buffer size significantly
and not worry about cache effects. Cache effects benefit generic_bzero()
in the bandwidth test but they probably benefit it in normal use too.
> The code is at:
>
> http://www.maths.tcd.ie/~dwmalone/comp/-time.S
> http://www.maths.tcd.ie/~dwmalone/comp/-time.c
>
> (It's crude, but seemed to produce moderately OK results. You get
> ocasional dips in the bandwidth due to using the tcs for timing.
> I only tried sizes which were a power of two, aswell...)
I wrote not-so-crude read/write/copy/checksum userland benchmarks to
test this stuff when I helped implement the i586-optimized routines.
Here is the write benchmark. Compile it with 'cc -aout'.
---
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <machine/cpufunc.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
/*
 * Signature shared by every zeroing routine under test: clear `len' bytes
 * starting at `buf'.
 */
typedef void func_t(void *buf, size_t len);

/*
 * One entry of the benchmark table.
 *
 * fn          - routine to time.
 * name        - short label printed at the start of each result line.
 * description - longer note printed unless -q is given.
 *
 * Both strings are only ever pointed at string literals and are never
 * written through, so they are const-qualified.
 */
struct func
{
    func_t *fn;
    const char *name;
    const char *description;
};
/*
 * Forward declarations of the routines listed in funcs[].  zero0..zero9
 * are defined further down as C functions wrapping inline asm; zeroA..zeroD
 * have no C body and are supplied by file-scope asm() statements that
 * define the labels _zeroA.._zeroD.
 */
static func_t zero0, zero1, zero2, zero3, zero4, zero5, zero6, zero7;
static func_t zero8, zero9, zeroA, zeroB, zeroC, zeroD;
/* Prints the option summary to stderr and exits with status 1. */
static void usage(void);
/* Set to argv[0] at the start of main(); used as the prefix of messages. */
static char const *progname;
/*
 * Table of routines to benchmark, in the order they are run.  The index
 * into this table is the number accepted by the -f option.  The final
 * entry times the C library's bzero() for comparison.
 */
static struct func funcs[] =
{
    { zero0, "zero0", "stosl" },
    { zero1, "zero1", "unroll 16" },
    { zero2, "zero2", "unroll 16 preallocate" },
    { zero3, "zero3", "unroll 32" },
    { zero4, "zero4", "unroll 32 preallocate" },
    { zero5, "zero5", "unroll 64" },
    { zero6, "zero6", "unroll 64 preallocate" },
    { zero7, "zero7", "fstl" },
    { zero8, "zero8", "movl" },
    { zero9, "zero9", "unroll 8" },
    { zeroA, "zeroA", "generic_bzero" },
    { zeroB, "zeroB", "i486_bzero" },
    { zeroC, "zeroC", "i586_bzero" },
    { zeroD, "zeroD", "i686_pagezero" },
    { bzero, "zeroE", "bzero (stosl)" },
};

/* Number of entries in funcs[]. */
#define NFUNC (sizeof(funcs) / sizeof(funcs[0]))
/*
 * Benchmark driver for the zeroing routines in funcs[].
 *
 * Options:
 *   -5           also report elapsed TSC cycles (uses rdtsc())
 *   -f function  run only the funcs[] entry with this index
 *   -l length    bytes zeroed per call (default 4096)
 *   -p           leave the buffer cached before timing ("precache")
 *   -q           print only the name and bandwidth
 *   -t tot       approximate total number of bytes to zero per routine
 *                (default 100000000); rounded down to a multiple of length
 *
 * For every selected routine the buffer is first filled with 1s, zeroed
 * once and verified, then (unless -p) the caches are thrashed, and finally
 * the routine is called tot / length times while user CPU time is measured
 * with getrusage().
 *
 * Returns 0 on success; exits with status 1 on bad usage, allocation
 * failure, or a routine that fails verification.
 */
int main(int argc, char **argv)
{
    unsigned char *buf;
    int ch;
    int funcn;
    int funcnspecified;
    int i586;
    size_t len;
    size_t max;
    int precache;
    int quiet;
    size_t thrashbufsize;
    unsigned long long tot;

    progname = argv[0];
    funcnspecified = -1;
    i586 = 0;
    len = 4096;
    precache = 0;
    quiet = 0;
    tot = 100000000;

    /* getopt() is specified to return -1 (not EOF) when options run out. */
    while ((ch = getopt(argc, argv, "5f:l:pqt:")) != -1)
    {
        switch (ch)
        {
        case '5':
            i586 = 1;
            break;
        case 'f':
            funcnspecified = strtoul(optarg, (char **) NULL, 0);
            if (funcnspecified < 0 || (size_t) funcnspecified >= NFUNC)
                usage();
            break;
        case 'l':
            len = strtoul(optarg, (char **) NULL, 0);
            break;
        case 'p':
            precache = 1;
            break;
        case 'q':
            quiet = 1;
            break;
        case 't':
            tot = strtouq(optarg, (char **) NULL, 0);
            break;
        default:
            usage();
        }
    }
    if (optind != argc)
        usage();

    /*
     * A zero length would divide by zero when computing `max' below, and
     * a length within 4096 of the size_t limit would wrap the allocation
     * size, so reject both up front.
     */
    if (len == 0 || len > (size_t) -1 - 4096)
        usage();

    /*
     * Allocate 4096 bytes of slack beyond `len': the unrolled routines
     * store whole strides and so may write past `len' when it is not a
     * multiple of their stride.
     */
    buf = malloc(len + 4096);
    if (buf == NULL)
    {
        fprintf(stderr, "%s: malloc failed\n", progname);
        exit(1);
    }

    /* Round the total down to a whole number of calls. */
    max = tot / len;
    tot = (unsigned long long) max * len;

    for (funcn = 0; (size_t) funcn < NFUNC; ++funcn)
    {
        func_t *funcp;
        struct rusage finish;
        size_t i;
        struct rusage start;
        unsigned long long tsc;
        long usec;

        if (funcnspecified != -1 && funcnspecified != funcn)
            continue;

        /*
         * Check the function.  As side effects, make sure that the buffer
         * isn't a constant zero page, and leave as much of the buffer as
         * possible in the cache to set up the `precache' case.
         */
        memset(buf, 1, len);
        funcp = funcs[funcn].fn;
        funcp(buf, len);
#if 1
        for (i = 0; i < len; ++i)
        {
            if (buf[i] != '\0')
            {
                /* size_t has no portable %u form; print it as unsigned long. */
                fprintf(stderr, "%s: %s failed at %lu\n",
                        progname, funcs[funcn].name, (unsigned long) i);
                exit(1);
            }
        }
#endif

        if (!precache)
        {
            /*
             * Attempt to uncache the buffer so as to provide the same
             * uncached environment for all the functions.  Pairs of
             * buffers of 2MB, 1MB, ... 1 byte are copied back and forth;
             * sizes that cannot be allocated are silently skipped.
             */
            for (thrashbufsize = 2 * 1024 * 1024; thrashbufsize != 0;
                 thrashbufsize /= 2)
            {
                unsigned char *thrashbuf1;
                unsigned char *thrashbuf2;

                thrashbuf1 = malloc(thrashbufsize);
                thrashbuf2 = malloc(thrashbufsize);
                if (thrashbuf1 != NULL && thrashbuf2 != NULL)
                {
                    memcpy(thrashbuf2, thrashbuf1, thrashbufsize);
                    memcpy(thrashbuf1, thrashbuf2, thrashbufsize);
                }
                free(thrashbuf1);
                free(thrashbuf2);
            }
        }

        tsc = 0;
        getrusage(RUSAGE_SELF, &start);
        if (i586)
            tsc = rdtsc();
#if 1
        for (i = 0; i < max; ++i)
            funcp(buf, len);
#else
        /* Disabled variant: vary the alignment and length of each call. */
        tot /= 8 * 8;
        tot *= 8 * 8;
        for (i = 0; i < max / 8 / 8; ++i)
        {
            int j, k;

            for (j = 0; j < 8; ++j)
                for (k = 0; k < 8; ++k)
                    funcp(buf + j, len + k);
        }
#endif
        if (i586)
            tsc = rdtsc() - tsc;
        getrusage(RUSAGE_SELF, &finish);

        usec = 1000000 * (finish.ru_utime.tv_sec - start.ru_utime.tv_sec)
               + finish.ru_utime.tv_usec - start.ru_utime.tv_usec;
        /*
         * Clamp to 1us so the bandwidth division below is never by zero
         * (a run shorter than the clock resolution reports 0us) or by a
         * negative value.
         */
        if (usec <= 0)
            usec = 1;

        printf("%s: %10.0f B/s", funcs[funcn].name, tot * 1e6 / usec);
        if (!quiet)
        {
            printf(" (%7ld us)", usec);
            if (i586)
                printf(" (%9qd tsc)", tsc);
            printf(" (%s)", funcs[funcn].description);
        }
        printf("\n");
    }
    return 0;
}
/*
 * zero0 ("stosl"): clear the buffer with a single `rep stosl'.
 *
 * buf is placed in %edi and len in %ecx; len is shifted right by two to
 * turn the byte count into a count of 32-bit words, and %eax supplies the
 * zero that is stored.  Because of that shift, a trailing len % 4 bytes
 * are left unwritten.
 */
static void zero0(void *buf, size_t len)
{
asm volatile("
.p2align 4,0x90
cld
shrl $2,%1
rep; stosl"
: "=D" (buf), "=c" (len)
: "0" (buf), "1" (len), "a" (0)
: "memory");
}
/*
 * zero1 ("unroll 16"): store a zero register to four consecutive 32-bit
 * words per iteration (16 bytes), then advance buf and decrement len by 16.
 *
 * The loop body always runs at least once and repeats while the count is
 * still above zero after the subtraction (`ja'), so the number of bytes
 * written is len rounded up to a multiple of 16 (and 16 when len is 0).
 * Operand %4 is the register holding the constant 0.
 */
static void zero1(void *buf, size_t len)
{
asm volatile("
.p2align 4,0x90
1:
movl %4,0(%0)
movl %4,4(%0)
movl %4,8(%0)
movl %4,12(%0)
addl $16,%0
subl $16,%1
ja 1b
"
: "=r" (buf), "=r" (len)
: "0" (buf), "1" (len), "r" (0)
: "memory");
}
/*
 * zero2 ("unroll 16 preallocate"): same 16-bytes-per-iteration store loop
 * as zero1, but each iteration first loads the word at buf into a scratch
 * register before overwriting it.
 *
 * The loaded value is discarded; the load is the "preallocate" step --
 * presumably intended to pull the destination into the cache before the
 * stores.  Operand %2 is the scratch register and %5 the zero register.
 * Like zero1, the loop runs at least once and writes whole 16-byte strides.
 */
static void zero2(void *buf, size_t len)
{
/* Receives the discarded result of the preallocating load. */
unsigned preallocate;
asm volatile("
.p2align 4,0x90
1:
movl (%0),%2
movl %5,0(%0)
movl %5,4(%0)
movl %5,8(%0)
movl %5,12(%0)
addl $16,%0
subl $16,%1
ja 1b
"
: "=r" (buf), "=r" (len), "=&r" (preallocate)
: "0" (buf), "1" (len), "r" (0)
: "memory");
}
/*
 * zero3 ("unroll 32"): store a zero register to eight consecutive 32-bit
 * words per iteration (32 bytes), then advance buf and decrement len by 32.
 *
 * The loop runs at least once and repeats while the remaining count is
 * above zero after the subtraction, so whole 32-byte strides are written
 * even if len is not a multiple of 32.
 */
static void zero3(void *buf, size_t len)
{
asm volatile("
.p2align 4,0x90
1:
movl %4,0(%0)
movl %4,4(%0)
movl %4,8(%0)
movl %4,12(%0)
movl %4,16(%0)
movl %4,20(%0)
movl %4,24(%0)
movl %4,28(%0)
addl $32,%0
subl $32,%1
ja 1b
"
: "=r" (buf), "=r" (len)
: "0" (buf), "1" (len), "r" (0)
: "memory");
}
/*
 * zero4 ("unroll 32 preallocate"): the 32-bytes-per-iteration store loop
 * of zero3 preceded, in every iteration, by a load of the word at buf into
 * a scratch register.
 *
 * The loaded value is discarded; the load is the "preallocate" step --
 * presumably intended to pull the destination into the cache before the
 * stores.  Operand %2 is the scratch register and %5 the zero register.
 */
static void zero4(void *buf, size_t len)
{
/* Receives the discarded result of the preallocating load. */
unsigned preallocate;
asm volatile("
.p2align 4,0x90
1:
movl (%0),%2
movl %5,0(%0)
movl %5,4(%0)
movl %5,8(%0)
movl %5,12(%0)
movl %5,16(%0)
movl %5,20(%0)
movl %5,24(%0)
movl %5,28(%0)
addl $32,%0
subl $32,%1
ja 1b
"
: "=r" (buf), "=r" (len), "=&r" (preallocate)
: "0" (buf), "1" (len), "r" (0)
: "memory");
}
/*
 * zero5 ("unroll 64"): store a zero register to sixteen consecutive 32-bit
 * words per iteration (64 bytes), then advance buf and decrement len by 64.
 *
 * The loop runs at least once and repeats while the remaining count is
 * above zero after the subtraction, so whole 64-byte strides are written
 * even if len is not a multiple of 64.
 */
static void zero5(void *buf, size_t len)
{
asm volatile("
.p2align 4,0x90
1:
movl %4,0(%0)
movl %4,4(%0)
movl %4,8(%0)
movl %4,12(%0)
movl %4,16(%0)
movl %4,20(%0)
movl %4,24(%0)
movl %4,28(%0)
movl %4,32(%0)
movl %4,36(%0)
movl %4,40(%0)
movl %4,44(%0)
movl %4,48(%0)
movl %4,52(%0)
movl %4,56(%0)
movl %4,60(%0)
addl $64,%0
subl $64,%1
ja 1b"
: "=r" (buf), "=r" (len)
: "0" (buf), "1" (len), "r" (0)
: "memory");
}
/*
 * zero6 ("unroll 64 preallocate"): zero 64 bytes per iteration as two
 * 32-byte halves, loading one word from each half into a scratch register
 * before that half is overwritten.
 *
 * Operand %0 is buf, %1 is len, %2 (buf2) is set to buf + 32 and addresses
 * the second half, %3 is the scratch register for the discarded loads and
 * %6 is the zero register.  buf is advanced by 64 in the middle of the
 * iteration, which is why the second half is addressed through %2.  The
 * loop runs at least once and writes whole 64-byte strides.
 */
static void zero6(void *buf, size_t len)
{
/* Pointer to the second 32-byte half of the current 64-byte stride. */
void *buf2;
/* Receives the discarded results of the preallocating loads. */
unsigned preallocate;
/*
* The main loop has 11 pairs of i586 instructions with no AGI so that
* it takes 11 cycles on i586's if all the data is in the L1 cache.
*
* On an ASUS P55TP4XE P133 the speeds are approx:
* data in L1 cache: 740,000,000 B/s
* data in L2 cache only: 90,000,000 B/s (highly variant)
* data not in any cache: 60,000,000 B/s
* and without preallocating (function zero5) they are:
* data in L1 cache: 87,000,000 B/s
* data in L2 cache only: 87,000,000 B/s
* data not in any cache: 90,000,000 B/s
*
* Thus the instruction selection and ordering optimizations have an
* insignificant effect if the data isn't in the L1 cache or the L2
* cache, and preallocating is a pessimization if the data isn't in the
* L2 cache.
*/
asm volatile("
.p2align 4,0x90
1:
movl (%0),%3
leal 32(%0),%2
movl %6,0(%0)
movl %6,4(%0)
movl %6,8(%0)
movl %6,12(%0)
movl %6,16(%0)
movl %6,20(%0)
movl %6,24(%0)
movl %6,28(%0)
movl (%2),%3
addl $64,%0
movl %6,0(%2)
movl %6,4(%2)
movl %6,8(%2)
movl %6,12(%2)
movl %6,16(%2)
movl %6,20(%2)
movl %6,24(%2)
movl %6,28(%2)
subl $64,%1
ja 1b
"
: "=r" (buf), "=r" (len), "=&r" (buf2), "=&r" (preallocate)
: "0" (buf), "1" (len), "r" (0)
: "memory");
}
/*
 * zero7 ("fstl"): zero the buffer 8 bytes at a time through the FPU.
 *
 * `fldz' pushes 0.0 onto the FPU stack once, the loop stores it as a
 * 64-bit value with `fstl' and advances by 8 bytes, and the final `fstp'
 * pops the value again so the FPU stack is left balanced.  The loop runs
 * at least once and writes whole 8-byte strides.
 */
static void zero7(void *buf, size_t len)
{
/*
* On a P55TP4XE P133, `fstl' goes slower than all the loop control
* instructions put together, so unrolling would be bad.
*/
asm volatile("
fldz
.p2align 4,0x90
1:
fstl 0(%0)
addl $8,%0
subl $8,%1
ja 1b
fstp %%st(0)"
: "=r" (buf), "=r" (len)
: "0" (buf), "1" (len)
: "memory");
}
/*
 * zero8 ("movl"): store an immediate zero to one 32-bit word per
 * iteration, advancing by 4 bytes.
 *
 * The loop runs at least once and writes whole 4-byte strides.  The
 * "r" (0) input operand is listed but the template never references it;
 * the stores use the immediate $0 instead.
 */
static void zero8(void *buf, size_t len)
{
asm volatile("
.p2align 4,0x90
1:
movl $0,0(%0)
addl $4,%0
subl $4,%1
ja 1b
"
: "=r" (buf), "=r" (len)
: "0" (buf), "1" (len), "r" (0)
: "memory");
}
/*
 * zero9 ("unroll 8"): store an immediate zero to two consecutive 32-bit
 * words per iteration, advancing by 8 bytes.
 *
 * The loop runs at least once and writes whole 8-byte strides.  The
 * "r" (0) input operand is listed but the template never references it;
 * the stores use the immediate $0 instead.
 */
static void zero9(void *buf, size_t len)
{
asm volatile("
.p2align 4,0x90
1:
movl $0,0(%0)
movl $0,4(%0)
addl $8,%0
subl $8,%1
ja 1b
"
: "=r" (buf), "=r" (len)
: "0" (buf), "1" (len), "r" (0)
: "memory");
}
/*
 * zeroA ("generic_bzero"), written directly in assembly.
 *
 * Takes (buf, len) on the stack.  Zeroes len / 4 32-bit words with
 * `rep stosl', then reloads len and zeroes the remaining len & 3 bytes
 * with `rep stosb', so exactly len bytes are written.  %edi is saved and
 * restored around the body.
 *
 * The label carries a leading underscore to match the symbol the compiler
 * emits for the C name zeroA -- presumably the a.out naming convention,
 * in line with the `cc -aout' build instruction.
 */
asm("
.p2align 2,0x90
_zeroA:
pushl %edi
movl 8(%esp),%edi
movl 12(%esp),%ecx
xorl %eax,%eax
shrl $2,%ecx
cld
rep
stosl
movl 12(%esp),%ecx
andl $3,%ecx
rep
stosb
popl %edi
ret
");
/*
 * zeroB ("i486_bzero"), written directly in assembly.
 *
 * Takes (buf, len) on the stack; %edx walks the buffer and %ecx holds the
 * remaining byte count.  The work is done in stages of decreasing size:
 *
 *   label 2: while at least 64 bytes remain, store sixteen zero words;
 *   label 3: while at least 16 bytes remain, store four zero words;
 *   label 4: while at least 4 bytes remain, store one zero word;
 *   label 5: dispatch through `jtab' on the final 0..3 bytes to do0..do3,
 *            which store nothing, a byte, a word, or a word plus a byte.
 *
 * Each stage returns immediately if its subtraction brings the count to
 * exactly zero, and otherwise falls back to its own size test.  The jump
 * table lives in .data; the code switches back to .text afterwards.
 */
asm("
.p2align 2,0x90
_zeroB:
movl 4(%esp),%edx
movl 8(%esp),%ecx
xorl %eax,%eax
2:
cmpl $64,%ecx
jb 3f
movl %eax,(%edx)
movl %eax,4(%edx)
movl %eax,8(%edx)
movl %eax,12(%edx)
movl %eax,16(%edx)
movl %eax,20(%edx)
movl %eax,24(%edx)
movl %eax,28(%edx)
movl %eax,32(%edx)
movl %eax,36(%edx)
movl %eax,40(%edx)
movl %eax,44(%edx)
movl %eax,48(%edx)
movl %eax,52(%edx)
movl %eax,56(%edx)
movl %eax,60(%edx)
addl $64,%edx
subl $64,%ecx
jnz 2b
ret
.p2align 4,0x90
3:
cmpl $16,%ecx
jb 4f
movl %eax,(%edx)
movl %eax,4(%edx)
movl %eax,8(%edx)
movl %eax,12(%edx)
addl $16,%edx
subl $16,%ecx
jnz 3b
ret
.p2align 4,0x90
4:
cmpl $4,%ecx
jb 5f
movl %eax,(%edx)
addl $4,%edx
subl $4,%ecx
jnz 4b
ret
.data
jtab:
.long do0
.long do1
.long do2
.long do3
.text
.p2align 4,0x90
5:
jmp jtab(,%ecx,4)
.p2align 4,0x90
do3:
movw %ax,(%edx)
movb %al,2(%edx)
ret
.p2align 4,0x90
do2:
movw %ax,(%edx)
ret
.p2align 4,0x90
do1:
movb %al,(%edx)
ret
.p2align 4,0x90
do0:
ret
");
/*
 * Globals read by the zeroC assembly under the symbols _npxproc and
 * _kernel_fpu_lock -- presumably userland stand-ins for kernel variables
 * of the same names.
 *
 * npxproc is never changed from 0 in this file, so zeroC always takes its
 * "no FPU state to save" path.  kernel_fpu_lock starts at 0xfe, the same
 * value zeroC writes back when it releases the lock.
 */
int npxproc = 0;
int kernel_fpu_lock = 0xfe;
/*
 * zeroC ("i586_bzero"), written directly in assembly.
 *
 * Takes (buf, len) on the stack; %edx walks the buffer and %ecx holds the
 * byte count.
 *
 * Path selection:
 *   - len < 176 always uses the integer path (intreg_i586_bzero).
 *   - If _npxproc is non-zero, the FPU path is only used for
 *     len >= 176+184 and the existing FPU state is saved with `fnsave'
 *     into 108 bytes of stack and restored with `frstor' afterwards.
 *   - If _npxproc is zero, the FPU is simply reinitialised with `fninit'.
 *   - Either way the FPU path is taken only if `sarb $1,_kernel_fpu_lock'
 *     shifts out a 0 bit (lock free); on carry the integer path is used.
 *     The lock byte is reset to 0xfe before returning from the FPU path.
 *
 * The `clts' and `lmsw' instructions are commented out and followed by
 * runs of nops -- presumably padding that stands in for privileged
 * instructions which cannot be executed in this userland copy.
 *
 * FPU path (i586_bz2): store 8 zero bytes at the unaligned start, round
 * the pointer up to the next 8-byte boundary, store 8 zero bytes ending
 * at the last byte of the buffer, then fill the aligned middle with one
 * `fstl' per 8 bytes.
 *
 * Integer path: when at least 8 bytes remain, store 8 bytes at the start,
 * align to 8, and loop storing two zero words per iteration; then finish
 * with at most one 4-byte, one 2-byte and one 1-byte store.
 */
asm("
_zeroC:
movl 4(%esp),%edx
movl 8(%esp),%ecx
cmpl $176,%ecx # 112 in kernel; 104-136 without hair
jb intreg_i586_bzero
cmpl $0,_npxproc
je i586_bz1
cmpl $176+184,%ecx # 112+184 in kernel; 320 without hair
jb intreg_i586_bzero
sarb $1,_kernel_fpu_lock
jc intreg_i586_bzero
smsw %ax
# clts
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
subl $108,%esp
fnsave 0(%esp)
jmp i586_bz2
i586_bz1:
sarb $1,_kernel_fpu_lock
jc intreg_i586_bzero
smsw %ax
# clts
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
fninit
i586_bz2:
fldz
fstl 0(%edx)
addl %edx,%ecx
addl $8,%edx
andl $~7,%edx
subl %edx,%ecx
fstl -8(%edx,%ecx)
decl %ecx
andl $~7,%ecx
fpureg_i586_bzero_loop:
fstl 0(%edx)
addl $8,%edx
subl $8,%ecx
cmpl $8,%ecx
jae fpureg_i586_bzero_loop
cmpl $0,_npxproc
je i586_bz3
frstor 0(%esp)
addl $108,%esp
# lmsw %ax
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
movb $0xfe,_kernel_fpu_lock
ret
i586_bz3:
fstpl %st(0)
# lmsw %ax
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
movb $0xfe,_kernel_fpu_lock
ret
intreg_i586_bzero:
cmpl $8,%ecx
jb i586_bz4
movl $0,(%edx)
movl $0,4(%edx)
addl %edx,%ecx
addl $8,%edx
andl $~7,%edx
subl %edx,%ecx
cmpl $8,%ecx
jb i586_bz4
intreg_i586_bzero_loop:
movl $0,0(%edx)
movl $0,4(%edx)
addl $8,%edx
subl $8,%ecx
cmpl $8,%ecx
jae intreg_i586_bzero_loop
nop
i586_bz4:
cmpl $4,%ecx
jb i586_bz5
movl $0,(%edx)
addl $4,%edx
subl $4,%ecx
i586_bz5:
cmpl $2,%ecx
jb i586_bz6
movw $0,(%edx)
addl $2,%edx
subl $2,%ecx
i586_bz6:
cmpl $1,%ecx
jb i586_bz7
movb $0,(%edx)
i586_bz7:
ret
");
/*
 * zeroD1: an alternative page-zeroing routine that is NOT listed in
 * funcs[] and has no C declaration in this file, so nothing here calls it.
 *
 * Takes (buf, len) on the stack; %edi walks the buffer and %ecx counts
 * 32-bit words (len / 4), decremented by 8 per 32-byte chunk.
 *
 * As written, the `jmp 2f' at the top of loop 1 skips the "is this chunk
 * already zero?" test that follows it, so every chunk is unconditionally
 * zeroed by the eight stores at label 2.  The test code after the jump and
 * the `rep stosl' variant at label 3 are therefore unreachable.
 *
 * The loop ends only when the word count reaches exactly zero, so len is
 * expected to be a non-zero multiple of 32.
 */
asm("
_zeroD1:
pushl %edi
movl 8(%esp), %edi
movl 12(%esp), %ecx
shrl $2, %ecx
cld
.p2align 2,0x90
1:
jmp 2f
movl (%edi), %eax
orl 4(%edi), %eax
orl 8(%edi), %eax
orl 12(%edi), %eax
orl 16(%edi), %eax
orl 20(%edi), %eax
orl 24(%edi), %eax
orl 28(%edi), %eax
jne 2f
addl $32, %edi
subl $32/4, %ecx
jne 1b
popl %edi
ret
.p2align 2,0x90
3:
leal -32/4(%ecx), %edx
xorl %eax, %eax
movl $32/4, %ecx
rep
stosl
addl %edx, %ecx
jne 1b
popl %edi
ret
.p2align 2,0x90
2:
movl $0, (%edi)
movl $0, 4(%edi)
movl $0, 8(%edi)
movl $0, 12(%edi)
movl $0, 16(%edi)
movl $0, 20(%edi)
movl $0, 24(%edi)
movl $0, 28(%edi)
addl $32, %edi
subl $32/4, %ecx
jne 1b
popl %edi
ret
");
/*
 * zeroD ("i686_pagezero"), written directly in assembly.
 *
 * Takes (buf, len) on the stack; %edx walks the buffer and %ecx counts
 * 32-bit words (len / 4), decremented by 8 per 32-byte chunk.
 *
 * For each 32-byte chunk the eight words are ORed together.  If the result
 * is zero the chunk is already clear and is skipped without any store;
 * otherwise control jumps to label 2, which writes eight zero words.
 *
 * Both loops end only when the word count reaches exactly zero, so len is
 * expected to be a non-zero multiple of 32.
 */
asm("
_zeroD:
movl 4(%esp), %edx
movl 8(%esp), %ecx
shrl $2, %ecx
.p2align 2,0x90
1:
movl (%edx), %eax
orl 4(%edx), %eax
orl 8(%edx), %eax
orl 12(%edx), %eax
orl 16(%edx), %eax
orl 20(%edx), %eax
orl 24(%edx), %eax
orl 28(%edx), %eax
jne 2f
addl $32, %edx
subl $32/4, %ecx
jne 1b
ret
.p2align 2,0x90
2:
movl $0, (%edx)
movl $0, 4(%edx)
movl $0, 8(%edx)
movl $0, 12(%edx)
movl $0, 16(%edx)
movl $0, 20(%edx)
movl $0, 24(%edx)
movl $0, 28(%edx)
addl $32, %edx
subl $32/4, %ecx
jne 1b
ret
");
/*
 * Print the option summary to stderr and exit with status 1.
 *
 * The flag list matches the getopt() string used by main(), "5f:l:pqt:".
 * There is no -c option, so it is not advertised here.
 */
static void usage(void)
{
    fprintf(stderr, "%s: [-5pq] [-f function] [-l length] [-t tot]\n",
            progname);
    exit(1);
}
---
Bruce
To Unsubscribe: send mail to [EMAIL PROTECTED]
with "unsubscribe freebsd-current" in the body of the message