e6500: hw tablewalk: optimize a bit for tcd lock acquiring codes

Kevin Hao Mon, 17 Aug 2015 04:18:07 -0700

On Fri, Aug 14, 2015 at 09:44:28PM -0500, Scott Wood wrote:
> I tried a couple different benchmarks and didn't find a significant 
> difference, relative to the variability of the results running on the same 
> kernel.  A patch that claims to "optimize a bit" as its main purpose ought to 
> show some results. :-)


I tried to compare the execution time of these two code sequences with the
following test module:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/printk.h>

/*
 * First variant of the sequence under test: the lhz / cmpdi cr1 / addi
 * instructions sit after label 1, i.e. inside the lbarx/stbcx. retry loop,
 * so they are executed again every time the code branches back to 1b.
 * The asm sequence is run 100000 times.
 */
static void test1(void)
{
        int i;
        unsigned char lock, c;
        unsigned short cpu, s;

        for (i = 0; i < 100000; i++) {
                /* Reset the operands so each iteration starts with lock == 0 and cpu == 1. */
                lock = 0;
                cpu = 1;

                /*
                 * Operands: %0 = c (byte loaded from lock), %1 = s (halfword
                 * loaded from cpu, then incremented), %2 = &lock, %3 = &cpu.
                 * The sequence leaves through label 2 when the loaded lock
                 * byte is non-zero; otherwise it does the stbcx. store of %1
                 * and goes back to label 1 when the bne after stbcx. is taken.
                 */
                asm volatile (  
"1:             lbarx   %0,0,%2\n\
                lhz     %1,0(%3)\n\
                cmpdi   %0,0\n\
                cmpdi   cr1,%1,1\n\
                addi    %1,%1,1\n\
                bne     2f\n\
                stbcx.  %1,0,%2\n\
                bne     1b\n\
2:"
                : "=&r" (c), "=&r" (s) : "r" (&lock), "r" (&cpu) : "cr0", 
"cr1", "memory"); 
        }
}

/*
 * Second variant of the sequence under test: the lhz and addi are placed
 * before label 1 together with a crclr of cr1's eq bit, so the
 * lbarx/stbcx. retry loop itself only contains lbarx, cmpdi, bne, stbcx.
 * and bne.  The asm sequence is run 100000 times.
 */
static void test2(void)
{
        int i;
        unsigned char lock, c;
        unsigned short cpu, s;

        for (i = 0; i < 100000; i++) {
                /* Reset the operands so each iteration starts with lock == 0 and cpu == 1. */
                lock = 0;
                cpu = 1;

                /*
                 * Operands: %0 = c (byte loaded from lock), %1 = s (halfword
                 * loaded from cpu, then incremented), %2 = &lock, %3 = &cpu.
                 * The sequence leaves through label 2 when the loaded lock
                 * byte is non-zero; otherwise it does the stbcx. store of %1
                 * and goes back to label 1 when the bne after stbcx. is taken.
                 */
                asm volatile (  
"               lhz     %1,0(%3)\n\
                addi    %1,%1,1\n\
                crclr   cr1*4+eq\n\
1:              lbarx   %0,0,%2\n\
                cmpdi   %0,0\n\
                bne     2f\n\
                stbcx.  %1,0,%2\n\
                bne     1b\n\
2:"
                : "=&r" (c), "=&r" (s) : "r" (&lock), "r" (&cpu) : "cr0", 
"cr1", "memory"); 
        }
}

/*
 * Module init: time one run of test1() and one run of test2() using mftb()
 * and log both durations plus the relative difference in percent.
 *
 * Each test function is called once untimed before its timed call, and all
 * calls happen between __hard_irq_disable() and __hard_irq_enable().
 *
 * Always returns 0.
 */
static int test_init(void)
{
        unsigned long s, e, tm1, tm2;
        long gain = 0;

        __hard_irq_disable();
        /* Just for prefetch */
        test1();
        s = mftb();
        test1();
        e = mftb();
        tm1 = e - s;

        /* Just for prefetch */
        test2();
        s = mftb();
        test2();
        e = mftb();
        tm2 = e - s;
        __hard_irq_enable();

        /*
         * Compute the gain in signed arithmetic so that a slower test2()
         * shows up as a negative percentage instead of wrapping around as
         * an unsigned difference, and skip the division if tm1 is zero.
         */
        if (tm1)
                gain = ((long)tm1 - (long)tm2) * 100 / (long)tm1;

        /* tm1/tm2 are unsigned long, so they are printed with %lu. */
        pr_err("test1: %lu, test2: %lu, %%%ld\n", tm1, tm2, gain);

        return 0;
}

/* Module exit hook; intentionally empty, there is nothing to do on unload. */
static void test_exit(void)
{
}

module_init(test_init);
module_exit(test_exit);
MODULE_LICENSE("GPL");

The results:
test1: 156568, test2: 151675, %3
test1: 156604, test2: 151670, %3
test1: 156567, test2: 151684, %3
test1: 156567, test2: 151678, %3
test1: 156567, test2: 151688, %3
test1: 156570, test2: 151683, %3
test1: 156565, test2: 151675, %3
test1: 156565, test2: 151673, %3

It seems that there is indeed a 3% gain in performance from moving the
3 instructions out of the lbarx/stbcx loop.

Thanks,
Kevin

pgpS6_d8tw8Wv.pgp
Description: PGP signature

_______________________________________________
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 2/3] powerpc/e6500: hw tablewalk: optimize a bit for tcd lock acquiring codes

Reply via email to