Hello everybody,
(I'm not a member, CC me in replies)
I'm trying to force gcc to trust me that my memory allocation function
returns aligned memory. So far I have tried everything I could find, with no
luck. What I have is:
#define MP_ALIGN (2*sizeof(size_t)<__alignof__(long
double)?__alignof__(long double):2*sizeof(size_t))
#define MP_ALIGN_MINUS (MP_ALIGN-1)
typedef void alignedvoid __attribute__ ((__aligned__(MP_ALIGN)));
#define mp_malloc(pool, size) _mp_malloc(pool,
(((size)+MP_ALIGN_MINUS)/MP_ALIGN)*MP_ALIGN)
#define mp_calloc(pool, size) ({\
size_t asize__=(((size)+MP_ALIGN_MINUS)/MP_ALIGN)*MP_ALIGN;\
alignedvoid *aptr__=_mp_malloc(pool, asize__);\
memset(aptr__, 0, asize__);\
aptr__;\
})
alignedvoid *_mp_malloc(memorypool *pool, size_t size)
__attribute__((malloc));
What I expect is that after those declarations, code like:
fldt=(filedata *)mp_calloc(http->conn->pool, sizeof(filedata));
will produce assembly code that calls _mp_malloc and afterwards assumes
that the returned value is properly aligned for fast stores, and hopefully
makes use of the fact that the size is also rounded up properly, so that no
code for filling the trailing odd bytes is generated.
Neither of those happens with "gcc version 4.7.3 (Ubuntu/Linaro
4.7.3-1ubuntu1)", the produced assembly is:
406eeb: e8 30 fa ff ff callq 406920 <_mp_malloc>
406ef0: a8 01 test $0x1,%al
406ef2: 48 89 44 24 58 mov %rax,0x58(%rsp)
406ef7: 48 89 c7 mov %rax,%rdi
406efa: ba 70 00 00 00 mov $0x70,%edx
406eff: 0f 85 89 0b 00 00 jne 407a8e <serve_file+0xbde>
406f05: 40 f6 c7 02 test $0x2,%dil
406f09: 0f 85 92 0b 00 00 jne 407aa1 <serve_file+0xbf1>
406f0f: 40 f6 c7 04 test $0x4,%dil
406f13: 0f 85 63 0b 00 00 jne 407a7c <serve_file+0xbcc>
406f19: 89 d1 mov %edx,%ecx
406f1b: 31 c0 xor %eax,%eax
406f1d: c1 e9 03 shr $0x3,%ecx
406f20: f6 c2 04 test $0x4,%dl
406f23: f3 48 ab rep stos %rax,%es:(%rdi)
406f26: 74 0a je 406f32 <serve_file+0x82>
406f28: c7 07 00 00 00 00 movl $0x0,(%rdi)
406f2e: 48 83 c7 04 add $0x4,%rdi
406f32: f6 c2 02 test $0x2,%dl
406f35: 74 09 je 406f40 <serve_file+0x90>
406f37: 66 c7 07 00 00 movw $0x0,(%rdi)
406f3c: 48 83 c7 02 add $0x2,%rdi
406f40: 83 e2 01 and $0x1,%edx
406f43: 74 03 je 406f48 <serve_file+0x98>
406f45: c6 07 00 movb $0x0,(%rdi)
As you can see, 3 unneeded conditionals precede the "rep stos
%rax,%es:(%rdi)" and 3 more conditionals follow the filling
instruction. None of these would be necessary if the compiler took the hint
from either
__attribute__((malloc)) or __attribute__ ((__aligned__(MP_ALIGN))) and
the fact that I'm rounding up the size before passing it to memset().
An alternative implementation of the mp_calloc macro works better for me (I
like the code better; no benchmarks performed):
#define mp_calloc(pool, size) ({\
size_t asize__=(((size)+MP_ALIGN_MINUS)/MP_ALIGN)*MP_ALIGN;\
alignedvoid *aptr__=_mp_malloc(pool, asize__);\
size_t i__;\
for (i__=0; i__<asize__/sizeof(size_t); i__++)\
((size_t *)aptr__)[i__]=0;\
aptr__;\
})
For this particular structure size (0x70), with -O3 it produces just 14
instructions like:
movq $0x0,(%rax)
movq $0x0,0x8(%rax)
...
And with -O2:
405574: e8 77 f9 ff ff callq 404ef0 <_mp_malloc>
405579: 48 89 c3 mov %rax,%rbx
40557c: 31 c0 xor %eax,%eax
40557e: 66 90 xchg %ax,%ax
405580: 48 c7 04 c3 00 00 00 movq $0x0,(%rbx,%rax,8)
405587: 00
405588: 48 83 c0 01 add $0x1,%rax
40558c: 48 83 f8 0e cmp $0xe,%rax
405590: 75 ee jne 405580 <serve_file+0x40>
So the question is: is there a proper way to force the compiler to
generate only the "rep stos %rax,%es:(%rdi)" code for memset without the
checks for alignment?
Best,
Anton Titov