On 2013-04-02 07:41, Alexander Graf wrote:
On 2013-04-01 23:34, Alexander Graf wrote:
Is this faster than a load/store with std/ldbrx?

Hmm.  Almost certainly not.  And since we've got stack space
allocated for function calls, we've got scratch space to do it in.

Probably similar for bswap32 too, eh?

Depends - memory load/store doesn't come for free and bswap32 is quite short.


I'll do a tiny bit of benchmarking for power7.

Cool, thanks a bunch :)

Heh.  "Almost certainly not" indeed.  Unless I've made some silly mistake,
going through memory stalls badly.  No store buffer forwarding on power7?

With the following test case, time reports:

f1              2.967s
f2              8.930s
f3              7.071s
f4              7.166s

And note that f4 is a normal store/load pair, trying to determine what the
store buffer forwarding delay might be.


r~
static long __attribute__((noinline)) f1(long x, long *mem)
{
  long r, t;
  asm volatile (
       "rlwinm %0,%1,8,0,31\n\
	rlwimi %0,%1,24,0,7\n\
	rlwimi %0,%1,24,16,23\n\
	rldicl %0,%0,32,0\n\
	rldicl %2,%1,32,0\n\
	rlwimi %0,%2,8,0,31\n\
	rlwimi %0,%2,24,0,7\n\
	rlwimi %0,%2,24,16,23"
	: "=&r"(r), "=r"(t)
	: "r"(x));
  return r;
}

static long __attribute__((noinline)) f2(long x, long *mem)
{
  long r, t;
  asm volatile ("std %1,0(%2); ldbrx %0,0,%2" : "=r"(r) : "r"(x), "b"(mem));
  return r;
}

static long __attribute__((noinline)) f3(long x, long *mem)
{
  long r, t;
  asm volatile ("stdbrx %1,0,%2; ld %0,0(%2)" : "=r"(r) : "r"(x), "b"(mem));
  return r;
}

static long __attribute__((noinline)) f4(long x, long *mem)
{
  long r, t;
  asm volatile ("std %1,0(%2); ld %0,0(%2)" : "=r"(r) : "r"(x), "b"(mem));
  return r;
}

#define D1(x,y) x##y
#define DO(x)   D1(f,x)

int main()
{
    long tmp, i;
    for (i = 0; i < 1000000000; ++i)
      DO(N)(i, &tmp);
    return 0;
}

Reply via email to