On Wed, 21 Mar 2001, David Malone wrote:

> On Mon, Mar 19, 2001 at 02:47:34PM +1100, Bruce Evans wrote:
> > > npx.c already has one "fix" for the overflow problem.  The problem
> > > may be that clocks don't work early any more.
> > 
> > It must be that microtime() doesn't work early any more.

I checked that microtime() doesn't work for intervals longer than about
10 msec if it uses the i8254.  With that limit, the bandwidth test
breaks down for bzero() bandwidths smaller than 100 MB/sec, since
zeroing the test's 1MB buffer then takes more than 10 msec.  Such
bandwidths are normal for Intel i586's.  E.g., my P5/133 has a
generic_bzero() bandwidth of 87e6 bytes/sec and an i586_bzero()
bandwidth of 174e6 bytes/sec.  This is in userland with a slightly
improved i586_bzero() (39 cycles instead of 41 for the inner loop
IIRC) and with slightly improved page coloring, and a buffer size of
1MB (same as in the bandwidth test).  So, the test always breaks down
for my P5/133 if microtime() uses the i8254.  OTOH, my K6-1/233 has
bandwidths of 135e6 and 127e6 bytes/sec, respectively, so the test
never breaks down for it.
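
Here is a quick sketch (not part of the benchmark below) of that
arithmetic, assuming the test times one pass over its 1MB buffer and
that an i8254-based microtime() is only good for intervals up to
roughly 10 msec; the bandwidths are the ones quoted above:

#include <stdio.h>

int main(void)
{
    double bufsize = 1024 * 1024;   /* same 1MB buffer as the test */
    double window = 10e-3;          /* rough i8254 microtime() limit (sec) */
    double bw[] = { 87e6, 127e6, 135e6, 174e6 };
    size_t i;

    for (i = 0; i < sizeof bw / sizeof bw[0]; ++i)
        printf("%3.0fe6 B/s: one pass takes %4.1f msec%s\n",
               bw[i] / 1e6, 1e3 * bufsize / bw[i],
               bufsize / bw[i] > window ? "  (test breaks down)" : "");
    return 0;
}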

> I did a quick check, and it does seem that i586_bzero can be faster
> on the k6-2. I found it was about twice as fast for large buffers.
> This was timed in userland using the TSC. With a slightly simplified
> version of i586_bzero (I removed all the kernel specific stuff and
> had it always save the floating point state on the stack). A graph
> is at:

This is surprising.

>       http://www.maths.tcd.ie/~dwmalone/comp/bzero-band.ps
> 
> The graph seems to peak at about 160kB/s, which seems plausible.

160kB/sec is implausible :-).  160MB/sec is plausible.  Half that
is hard to understand.  Why is it slower than my K6-1?  Ah, I
partly understand.  My K6-1 has an L2 cache size of 1MB, so the
1MB buffer size is really too small for it if write allocation
is enabled.  P5's don't have write allocation, so the buffer size
for them is not critical.  All K6's have write allocation IIRC.
With a buffer size of 2MB, the bandwidths for my K6-1/233 are
84e6 and 80e6 bytes/sec, respectively.  So 80MB/sec is plausible
and 160MB/sec is fast (it's equivalent to 320MB/sec without
write allocation).
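
As a back-of-the-envelope check of that equivalence, assume write
allocation turns each store miss into a read of the line plus a later
writeback, so the bus carries roughly twice the bytes actually zeroed,
while without write allocation the stores go straight to memory:

#include <stdio.h>

int main(void)
{
    double measured = 160e6;    /* bzero bandwidth reported by the test */

    /*
     * With write allocation, each zeroed byte costs roughly a read plus
     * a write on the bus, so the bus traffic is about twice the measured
     * bandwidth; without write allocation it is just the writes.
     */
    printf("%.0fe6 B/s zeroed with write allocation is about %.0fe6 B/s of\n"
           "bus traffic, comparable to a %.0fe6 B/s bzero without write\n"
           "allocation.\n",
           measured / 1e6, 2 * measured / 1e6, 2 * measured / 1e6);
    return 0;
}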

These complications show how hard it is to write a single bandwidth
test that works for all i586's.  I think the next step (after fixing
the i586 functions) should be to reduce the buffer size significantly
and not worry about cache effects.  Cache effects benefit generic_bzero()
in the bandwidth test but they probably benefit it in normal use too.

> The code is at:
> 
>       http://www.maths.tcd.ie/~dwmalone/comp/-time.S
>       http://www.maths.tcd.ie/~dwmalone/comp/-time.c
> 
> (It's crude, but seemed to produce moderately OK results. You get
> occasional dips in the bandwidth due to using the TSC for timing.
> I only tried sizes which were a power of two, as well...)

I wrote not-so-crude read/write/copy/checksum userland benchmarks to
test this stuff when I helped implement the i586-optimized routines.
Here is the write benchmark.  Compile it with 'cc -aout'.

---
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>

#include <machine/cpufunc.h>

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

typedef void func_t(void *buf, size_t len);

struct func
{
    func_t *fn;
    char *name;
    char *description;
};

static func_t zero0, zero1, zero2, zero3, zero4, zero5, zero6, zero7;
static func_t zero8, zero9, zeroA, zeroB, zeroC, zeroD;
static void usage(void);

static char const *progname;

static struct func funcs[] =
{
    zero0, "zero0", "stosl",
    zero1, "zero1", "unroll 16",
    zero2, "zero2", "unroll 16 preallocate",
    zero3, "zero3", "unroll 32",
    zero4, "zero4", "unroll 32 preallocate",
    zero5, "zero5", "unroll 64",
    zero6, "zero6", "unroll 64 preallocate",
    zero7, "zero7", "fstl",
    zero8, "zero8", "movl",
    zero9, "zero9", "unroll 8",
    zeroA, "zeroA", "generic_bzero",
    zeroB, "zeroB", "i486_bzero",
    zeroC, "zeroC", "i586_bzero",
    zeroD, "zeroD", "i686_pagezero",
    bzero, "zeroE", "bzero (stosl)",
};
#define NFUNC   (sizeof funcs / sizeof funcs[0])

int main(int argc, char **argv)
{
    unsigned char *buf;
    int ch;
    int funcn;
    int funcnspecified;
    int i586;
    size_t len;
    size_t max;
    int precache;
    int quiet;
    size_t thrashbufsize;
    unsigned long long tot;

    progname = argv[0];
    funcnspecified = -1;
    i586 = 0;
    len = 4096;
    precache = 0;
    quiet = 0;
    tot = 100000000;
    while ((ch = getopt(argc, argv, "5f:l:pqt:")) != EOF)
    {
        switch (ch)
        {
        case '5':
            i586 = 1;
            break;
        case 'f':
            funcnspecified = strtoul(optarg, (char **) NULL, 0);
            if (funcnspecified < 0 || funcnspecified >= NFUNC)
                usage();
            break;
        case 'l':
            len = strtoul(optarg, (char **) NULL, 0);
            break;
        case 'p':
            precache = 1;
            break;
        case 'q':
            quiet = 1;
            break;
        case 't':
            tot = strtouq(optarg, (char **) NULL, 0);
            break;
        default:
            usage();
        }
    }
    if (optind != argc)
        usage();
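    /*
     * Allocate some headroom beyond len; the disabled (#else) timing loop
     * below calls funcp(buf + j, len + k) with j, k up to 7.
     */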
    buf = malloc(len + 4096);
    if (buf == NULL)
    {
        fprintf(stderr, "%s: malloc failed\n", progname);
        exit(1);
    }
    max = tot / len;
    tot = (unsigned long long) max * len;

    for (funcn = 0; funcn < NFUNC; ++funcn)
    {
        func_t *funcp;
        struct rusage finish;
        size_t i;
        struct rusage start;
        unsigned long long tsc;
        long usec;

        if (funcnspecified != -1 && funcnspecified != funcn)
            continue;

        /*
         * Check the function.  As side effects, make sure that the buffer
         * isn't a constant zero page, and leave as much of the buffer as
         * possible in the cache to set up the `precache' case.
         */
        memset(buf, 1, len);
        funcp = funcs[funcn].fn;
        funcp(buf, len);
#if 1
        for (i = 0; i < len; ++i)
            if (buf[i] != '\0')
            {
                fprintf(stderr, "%s: %s failed at %u\n",
                        progname, funcs[funcn].name, i);
                exit(1);
            }
#endif

        if (!precache)
            /*
             * Attempt to uncache the buffer so as to provide the same
             * uncached environment for all the functions.
             */
            for (thrashbufsize = 2 * 1024 * 1024; thrashbufsize != 0;
                 thrashbufsize /= 2)
            {
                unsigned char *thrashbuf1;
                unsigned char *thrashbuf2;

                thrashbuf1 = malloc(thrashbufsize);
                thrashbuf2 = malloc(thrashbufsize);
                if (thrashbuf1 != NULL && thrashbuf2 != NULL)
                {
                    memcpy(thrashbuf2, thrashbuf1, thrashbufsize);
                    memcpy(thrashbuf1, thrashbuf2, thrashbufsize);
                }
                free(thrashbuf1);
                free(thrashbuf2);
            }

        tsc = 0;
        getrusage(RUSAGE_SELF, &start);
        if (i586)
            tsc = rdtsc();
#if 1
        for (i = 0; i < max; ++i)
            funcp(buf, len);
#else
        tot /= 8 * 8;
        tot *= 8 * 8;
        for (i = 0; i < max / 8 / 8; ++i)
        {
            int j, k;

            for (j = 0; j < 8; ++j)
                for (k = 0; k < 8; ++k)
                    funcp(buf + j, len + k);
        }
#endif
        if (i586)
            tsc = rdtsc() - tsc;
        getrusage(RUSAGE_SELF, &finish);
        usec = 1000000 * (finish.ru_utime.tv_sec - start.ru_utime.tv_sec)
               + finish.ru_utime.tv_usec - start.ru_utime.tv_usec;
        if (usec <= 0)
            usec = 1;
        printf("%s: %10.0f B/s", funcs[funcn].name, tot * 1e6 / usec);
        if (!quiet)
        {
            printf(" (%7ld us)", usec);
            if (i586)
                printf(" (%9qd tsc)", tsc);
            printf(" (%s)", funcs[funcn].description);
        }
        printf("\n");
    }
    return 0;
}

static void zero0(void *buf, size_t len)
{
    asm volatile("
        .p2align 4,0x90
        cld
        shrl $2,%1
        rep; stosl"
        : "=D" (buf), "=c" (len)
        : "0"  (buf), "1"  (len), "a" (0)
        : "memory");
}

static void zero1(void *buf, size_t len)
{
    asm volatile("
        .p2align 4,0x90
        1:
        movl %4,0(%0)
        movl %4,4(%0)
        movl %4,8(%0)
        movl %4,12(%0)
        addl $16,%0
        subl $16,%1
        ja 1b
        "
        : "=r" (buf), "=r" (len)
        : "0"  (buf), "1"  (len), "r" (0)
        : "memory");
}

static void zero2(void *buf, size_t len)
{
    unsigned preallocate;
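
    /*
     * The `movl (%0),%2' read below touches each 16-byte block before it
     * is stored, so the destination cache line is allocated by a read
     * (hence "preallocate") rather than by the first write.
     */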

    asm volatile("
        .p2align 4,0x90
        1:
        movl (%0),%2
        movl %5,0(%0)
        movl %5,4(%0)
        movl %5,8(%0)
        movl %5,12(%0)
        addl $16,%0
        subl $16,%1
        ja 1b
        "
        : "=r" (buf), "=r" (len), "=&r" (preallocate)
        : "0"  (buf), "1"  (len), "r" (0)
        : "memory");
}

static void zero3(void *buf, size_t len)
{
    asm volatile("
        .p2align 4,0x90
        1:
        movl %4,0(%0)
        movl %4,4(%0)
        movl %4,8(%0)
        movl %4,12(%0)
        movl %4,16(%0)
        movl %4,20(%0)
        movl %4,24(%0)
        movl %4,28(%0)
        addl $32,%0
        subl $32,%1
        ja 1b
        "
        : "=r" (buf), "=r" (len)
        : "0"  (buf), "1"  (len), "r" (0)
        : "memory");
}

static void zero4(void *buf, size_t len)
{
    unsigned preallocate;

    asm volatile("
        .p2align 4,0x90
        1:
        movl (%0),%2
        movl %5,0(%0)
        movl %5,4(%0)
        movl %5,8(%0)
        movl %5,12(%0)
        movl %5,16(%0)
        movl %5,20(%0)
        movl %5,24(%0)
        movl %5,28(%0)
        addl $32,%0
        subl $32,%1
        ja 1b
        "
        : "=r" (buf), "=r" (len), "=&r" (preallocate)
        : "0"  (buf), "1"  (len), "r" (0)
        : "memory");
}

static void zero5(void *buf, size_t len)
{
    asm volatile("
        .p2align 4,0x90
        1:
        movl %4,0(%0)
        movl %4,4(%0)
        movl %4,8(%0)
        movl %4,12(%0)
        movl %4,16(%0)
        movl %4,20(%0)
        movl %4,24(%0)
        movl %4,28(%0)
        movl %4,32(%0)
        movl %4,36(%0)
        movl %4,40(%0)
        movl %4,44(%0)
        movl %4,48(%0)
        movl %4,52(%0)
        movl %4,56(%0)
        movl %4,60(%0)
        addl $64,%0
        subl $64,%1
        ja 1b"
        : "=r" (buf), "=r" (len)
        : "0"  (buf), "1"  (len), "r" (0)
        : "memory");
}

static void zero6(void *buf, size_t len)
{
    void *buf2;
    unsigned preallocate;

    /*
     * The main loop has 11 pairs of i586 instructions with no AGI so that
     * it takes 11 cycles on i586's if all the data is in the L1 cache.
     *
     * On an ASUS P55TP4XE P133 the speeds are approx:
     *    data in L1 cache:      740,000,000 B/s
     *    data in L2 cache only:  90,000,000 B/s (highly variable)
     *    data not in any cache:  60,000,000 B/s
     * and without preallocating (function zero5) they are:
     *    data in L1 cache:       87,000,000 B/s
     *    data in L2 cache only:  87,000,000 B/s
     *    data not in any cache:  90,000,000 B/s
     *
     * Thus the instruction selection and ordering optimizations have an
     * insignificant effect if the data isn't in the L1 cache or the L2
     * cache, and preallocating is a pessimization if the data isn't in the
     * L2 cache.
     */
    asm volatile("
        .p2align 4,0x90
        1:
        movl (%0),%3
        leal 32(%0),%2
        movl %6,0(%0)
        movl %6,4(%0)
        movl %6,8(%0)
        movl %6,12(%0)
        movl %6,16(%0)
        movl %6,20(%0)
        movl %6,24(%0)
        movl %6,28(%0)
        movl (%2),%3
        addl $64,%0
        movl %6,0(%2)
        movl %6,4(%2)
        movl %6,8(%2)
        movl %6,12(%2)
        movl %6,16(%2)
        movl %6,20(%2)
        movl %6,24(%2)
        movl %6,28(%2)
        subl $64,%1
        ja 1b
        "
        : "=r" (buf), "=r" (len), "=&r" (buf2), "=&r" (preallocate)
        : "0"  (buf), "1"  (len), "r" (0)
        : "memory");
}

static void zero7(void *buf, size_t len)
{
    /*
     * On a P55TP4XE P133, `fstl' goes slower than all the loop control
     * instructions put together, so unrolling would be bad.
     */
    asm volatile("
        fldz
        .p2align 4,0x90
        1:
        fstl 0(%0)
        addl $8,%0
        subl $8,%1
        ja 1b
        fstp %%st(0)"
        : "=r" (buf), "=r" (len)
        : "0"  (buf), "1"  (len)
        : "memory");
}

static void zero8(void *buf, size_t len)
{
    asm volatile("
        .p2align 4,0x90
        1:
        movl $0,0(%0)
        addl $4,%0
        subl $4,%1
        ja 1b
        "
        : "=r" (buf), "=r" (len)
        : "0"  (buf), "1"  (len), "r" (0)
        : "memory");
}

static void zero9(void *buf, size_t len)
{
    asm volatile("
        .p2align 4,0x90
        1:
        movl $0,0(%0)
        movl $0,4(%0)
        addl $8,%0
        subl $8,%1
        ja 1b
        "
        : "=r" (buf), "=r" (len)
        : "0"  (buf), "1"  (len), "r" (0)
        : "memory");
}
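
/*
 * The remaining functions are defined in toplevel asm() using a.out-style
 * symbol names (leading underscores), which is apparently why the program
 * is compiled with `cc -aout'.
 */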

asm("
        .p2align        2,0x90
_zeroA:
        pushl   %edi
        movl    8(%esp),%edi
        movl    12(%esp),%ecx
        xorl    %eax,%eax
        shrl    $2,%ecx
        cld
        rep
        stosl
        movl    12(%esp),%ecx
        andl    $3,%ecx
        rep
        stosb
        popl    %edi
        ret
");

asm("
        .p2align        2,0x90
_zeroB:
        movl    4(%esp),%edx
        movl    8(%esp),%ecx
        xorl    %eax,%eax
2:
        cmpl    $64,%ecx
        jb      3f
        movl    %eax,(%edx)
        movl    %eax,4(%edx)
        movl    %eax,8(%edx)
        movl    %eax,12(%edx)
        movl    %eax,16(%edx)
        movl    %eax,20(%edx)
        movl    %eax,24(%edx)
        movl    %eax,28(%edx)
        movl    %eax,32(%edx)
        movl    %eax,36(%edx)
        movl    %eax,40(%edx)
        movl    %eax,44(%edx)
        movl    %eax,48(%edx)
        movl    %eax,52(%edx)
        movl    %eax,56(%edx)
        movl    %eax,60(%edx)
        addl    $64,%edx
        subl    $64,%ecx
        jnz     2b
        ret

        .p2align        4,0x90
3:
        cmpl    $16,%ecx
        jb      4f
        movl    %eax,(%edx)
        movl    %eax,4(%edx)
        movl    %eax,8(%edx)
        movl    %eax,12(%edx)
        addl    $16,%edx
        subl    $16,%ecx
        jnz     3b
        ret

        .p2align        4,0x90
4:
        cmpl    $4,%ecx
        jb      5f
        movl    %eax,(%edx)
        addl    $4,%edx
        subl    $4,%ecx
        jnz     4b
        ret

        .data
jtab:
        .long   do0
        .long   do1
        .long   do2
        .long   do3

        .text
        .p2align        4,0x90
5:
        jmp     jtab(,%ecx,4)

        .p2align        4,0x90
do3:
        movw    %ax,(%edx)
        movb    %al,2(%edx)
        ret

        .p2align        4,0x90
do2:
        movw    %ax,(%edx)
        ret

        .p2align        4,0x90
do1:
        movb    %al,(%edx)
        ret

        .p2align        4,0x90
do0:
        ret
");

int npxproc = 0;
int kernel_fpu_lock = 0xfe;
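
/*
 * Userland stand-ins for the kernel environment of i586_bzero: npxproc
 * and kernel_fpu_lock above replace the kernel variables, and the
 * privileged clts/lmsw instructions in the asm below are commented out
 * and padded with nops.
 */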

asm("
_zeroC:
        movl    4(%esp),%edx
        movl    8(%esp),%ecx

        cmpl    $176,%ecx               # 112 in kernel; 104-136 without hair
        jb      intreg_i586_bzero

        cmpl    $0,_npxproc
        je      i586_bz1
        cmpl    $176+184,%ecx           # 112+184 in kernel; 320 without hair
        jb      intreg_i586_bzero
        sarb    $1,_kernel_fpu_lock
        jc      intreg_i586_bzero
        smsw    %ax
#       clts
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
        subl    $108,%esp
        fnsave  0(%esp)
        jmp     i586_bz2

i586_bz1:
        sarb    $1,_kernel_fpu_lock
        jc      intreg_i586_bzero
        smsw    %ax
#       clts
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
        fninit
i586_bz2:
        fldz

        fstl    0(%edx)
        addl    %edx,%ecx
        addl    $8,%edx
        andl    $~7,%edx
        subl    %edx,%ecx

        fstl    -8(%edx,%ecx)
        decl    %ecx
        andl    $~7,%ecx

fpureg_i586_bzero_loop:
        fstl    0(%edx)
        addl    $8,%edx
        subl    $8,%ecx
        cmpl    $8,%ecx
        jae     fpureg_i586_bzero_loop

        cmpl    $0,_npxproc
        je      i586_bz3
        frstor  0(%esp)
        addl    $108,%esp
#       lmsw    %ax
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
        movb    $0xfe,_kernel_fpu_lock
        ret

i586_bz3:
        fstpl   %st(0)
#       lmsw    %ax
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
nop; nop
        movb    $0xfe,_kernel_fpu_lock
        ret

intreg_i586_bzero:
        cmpl    $8,%ecx
        jb      i586_bz4
        movl    $0,(%edx)
        movl    $0,4(%edx)
        addl    %edx,%ecx
        addl    $8,%edx
        andl    $~7,%edx
        subl    %edx,%ecx
        cmpl    $8,%ecx
        jb      i586_bz4
intreg_i586_bzero_loop:
        movl    $0,0(%edx)
        movl    $0,4(%edx)
        addl    $8,%edx
        subl    $8,%ecx
        cmpl    $8,%ecx
        jae     intreg_i586_bzero_loop

        nop

i586_bz4:
        cmpl    $4,%ecx
        jb      i586_bz5
        movl    $0,(%edx)
        addl    $4,%edx
        subl    $4,%ecx
i586_bz5:
        cmpl    $2,%ecx
        jb      i586_bz6
        movw    $0,(%edx)
        addl    $2,%edx
        subl    $2,%ecx
i586_bz6:
        cmpl    $1,%ecx
        jb      i586_bz7
        movb    $0,(%edx)
i586_bz7:
        ret
");

asm("
_zeroD1:
        pushl   %edi

        movl    8(%esp), %edi
        movl    12(%esp), %ecx
        shrl    $2, %ecx
        cld

        .p2align 2,0x90
1:
        jmp     2f
        movl    (%edi), %eax
        orl     4(%edi), %eax
        orl     8(%edi), %eax
        orl     12(%edi), %eax
        orl     16(%edi), %eax
        orl     20(%edi), %eax
        orl     24(%edi), %eax
        orl     28(%edi), %eax
        jne     2f

        addl    $32, %edi
        subl    $32/4, %ecx
        jne     1b

        popl    %edi
        ret

        .p2align 2,0x90
3:
        leal    -32/4(%ecx), %edx
        xorl    %eax, %eax
        movl    $32/4, %ecx

        rep
        stosl

        addl    %edx, %ecx
        jne     1b

        popl    %edi
        ret

        .p2align 2,0x90
2:
        movl    $0, (%edi)
        movl    $0, 4(%edi)
        movl    $0, 8(%edi)
        movl    $0, 12(%edi)
        movl    $0, 16(%edi)
        movl    $0, 20(%edi)
        movl    $0, 24(%edi)
        movl    $0, 28(%edi)

        addl    $32, %edi
        subl    $32/4, %ecx
        jne     1b

        popl    %edi
        ret
");

asm("
_zeroD:
        movl    4(%esp), %edx
        movl    8(%esp), %ecx
        shrl    $2, %ecx

        .p2align 2,0x90
1:
        movl    (%edx), %eax
        orl     4(%edx), %eax
        orl     8(%edx), %eax
        orl     12(%edx), %eax
        orl     16(%edx), %eax
        orl     20(%edx), %eax
        orl     24(%edx), %eax
        orl     28(%edx), %eax
        jne     2f

        addl    $32, %edx
        subl    $32/4, %ecx
        jne     1b

        ret

        .p2align 2,0x90
2:
        movl    $0, (%edx)
        movl    $0, 4(%edx)
        movl    $0, 8(%edx)
        movl    $0, 12(%edx)
        movl    $0, 16(%edx)
        movl    $0, 20(%edx)
        movl    $0, 24(%edx)
        movl    $0, 28(%edx)

        addl    $32, %edx
        subl    $32/4, %ecx
        jne     1b

        ret
");

static void usage(void)
{
    fprintf(stderr, "%s: [-5cpq] [-f function] [-l length] [-t tot]\n",
            progname);
    exit(1);
}
---

Bruce

