On 2013-04-02 07:41, Alexander Graf wrote:
On 2013-04-01 23:34, Alexander Graf wrote:
Is this faster than a load/store with std/ldbrx?
Hmm. Almost certainly not. And since we've got stack space
allocated for function calls, we've got scratch space to do it in.
Probably similar for bswap32 too, eh?
Depends - memory load/store doesn't come for free and bswap32 is quite short.
I'll do a tiny bit of benchmarking for power7.
Cool, thanks a bunch :)
Heh. "Almost certainly not" indeed. Unless I've made some silly mistake,
going through memory stalls badly. No store buffer forwarding on power7?
With the following test case, time reports:
f1 2.967s
f2 8.930s
f3 7.071s
f4 7.166s
And note that f4 is a normal store/load pair, trying to determine what the
store buffer forwarding delay might be.
r~
/*
 * f1: byte-swap the 64-bit value x using only register rotate/insert
 * instructions, i.e. without going through memory.
 *
 * x    value to byte-swap
 * mem  unused; present only so that all variants share one signature
 *
 * Returns x with its eight bytes in reverse order.
 */
static long __attribute__((noinline)) f1(long x, long *mem)
{
    long r, t;

    (void)mem;

    /*
     * Named operands are used on purpose: GCC numbers outputs before
     * inputs, so with positional operands the input x is %2, not %1.
     * Both outputs are early-clobber because they are written while x
     * is still needed by later instructions of the sequence.
     */
    asm volatile (
        /* low word of r = byte-swapped low word of x */
        "rlwinm %[r],%[x],8,0,31\n\t"
        "rlwimi %[r],%[x],24,0,7\n\t"
        "rlwimi %[r],%[x],24,16,23\n\t"
        /* move that swapped word into the high half of r */
        "rldicl %[r],%[r],32,0\n\t"
        /* t = x rotated by 32, so the low word of t is the high word of x */
        "rldicl %[t],%[x],32,0\n\t"
        /* low word of r = byte-swapped low word of t; high word is kept */
        "rlwimi %[r],%[t],8,0,31\n\t"
        "rlwimi %[r],%[t],24,0,7\n\t"
        "rlwimi %[r],%[t],24,16,23"
        : [r] "=&r"(r), [t] "=&r"(t)
        : [x] "r"(x));
    return r;
}
/*
 * f2: byte-swap x by storing it to *mem with a normal doubleword store
 * (std) and loading it back with a byte-reversed load (ldbrx).
 *
 * x    value to byte-swap
 * mem  scratch doubleword; holds x unchanged on return
 *
 * Returns x with its eight bytes in reverse order.
 */
static long __attribute__((noinline)) f2(long x, long *mem)
{
    long r;

    /*
     * "b" keeps mem out of r0, which std would treat as a literal zero
     * base.  The "memory" clobber tells the compiler that the asm writes
     * through mem, which is otherwise invisible to it.
     */
    asm volatile ("std %1,0(%2); ldbrx %0,0,%2"
                  : "=r"(r)
                  : "r"(x), "b"(mem)
                  : "memory");
    return r;
}
/*
 * f3: byte-swap x by storing it to *mem with a byte-reversed store
 * (stdbrx) and loading it back with a normal doubleword load (ld).
 *
 * x    value to byte-swap
 * mem  scratch doubleword; holds the byte-swapped value on return
 *
 * Returns x with its eight bytes in reverse order.
 */
static long __attribute__((noinline)) f3(long x, long *mem)
{
    long r;

    /*
     * "b" keeps mem out of r0, which ld would treat as a literal zero
     * base.  The "memory" clobber tells the compiler that the asm writes
     * through mem, which is otherwise invisible to it.
     */
    asm volatile ("stdbrx %1,0,%2; ld %0,0(%2)"
                  : "=r"(r)
                  : "r"(x), "b"(mem)
                  : "memory");
    return r;
}
/*
 * f4: baseline variant, a plain store (std) followed by a plain load (ld)
 * of the same doubleword.  No byte swap is performed; it exists to time
 * an ordinary store/load round trip for comparison with f2 and f3.
 *
 * x    value to store
 * mem  scratch doubleword; holds x on return
 *
 * Returns x unchanged.
 */
static long __attribute__((noinline)) f4(long x, long *mem)
{
    long r;

    /*
     * "b" keeps mem out of r0, which std/ld would treat as a literal zero
     * base.  The "memory" clobber tells the compiler that the asm writes
     * through mem, which is otherwise invisible to it.
     */
    asm volatile ("std %1,0(%2); ld %0,0(%2)"
                  : "=r"(r)
                  : "r"(x), "b"(mem)
                  : "memory");
    return r;
}
/*
 * Variant selection: build with -DN=1 .. -DN=4 to benchmark f1 .. f4.
 * DO() goes through D1() so that N is macro-expanded before the token
 * paste; DO(N) with N defined as 2 therefore yields f2.
 */
#define D1(x,y) x##y
#define DO(x) D1(f,x)

/* Fall back to f1 when no variant was chosen on the command line. */
#ifndef N
#define N 1
#endif

/*
 * Benchmark driver: calls the selected variant one billion times with the
 * loop counter as input.  Results are discarded; only run time matters.
 */
int main(void)
{
    long tmp = 0, i;

    for (i = 0; i < 1000000000; ++i) {
        DO(N)(i, &tmp);
    }
    return 0;
}