On 08/15/2016 05:46 AM, Florian Weimer wrote:
> On 08/14/2016 08:23 AM, Daniel Santos wrote:
>> ms_abi_push_regs:
>>         pop     %rax
>>         push    %rdi
>>         push    %rsi
>>         sub     $0xa8,%rsp
>>         movaps  %xmm6,(%rsp)
>>         movaps  %xmm7,0x10(%rsp)
>>         movaps  %xmm8,0x20(%rsp)
>>         movaps  %xmm9,0x30(%rsp)
>>         movaps  %xmm10,0x40(%rsp)
>>         movaps  %xmm11,0x50(%rsp)
>>         movaps  %xmm12,0x60(%rsp)
>>         movaps  %xmm13,0x70(%rsp)
>>         movaps  %xmm14,0x80(%rsp)
>>         movaps  %xmm15,0x90(%rsp)
>>         jmp     *(%rax)
>
> I think this will be quite slow because it breaks the return stack
> optimization in the CPU. I think you should push the return address
> and use RET.
>
> Florian
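
For context on that suggestion: every CALL pushes a prediction onto the
CPU's internal return-address stack, and only a matching RET pops it.
Ending the stub with an indirect jump through the saved return address
leaves that entry stranded, so later returns mispredict; re-pushing the
address and finishing with RET keeps the pairing balanced. A minimal
illustration with placeholder labels, not the actual stub code:

stub_with_jmp:                  /* the CALL into the stub is never paired with a RET */
        pop     %rax            /* return address pushed by the CALL */
        /* ... save registers ... */
        jmp     *%rax           /* indirect jump back to the caller */

stub_with_ret:                  /* CALL and RET stay paired */
        pop     %rax            /* return address pushed by the CALL */
        /* ... save registers ... */
        push    %rax            /* put the return address back */
        ret                     /* predicted correctly by the return stack */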
Looks like I forgot to reply-all on my last reply, but thanks again for
the advice here. Would there be any performance hit from reshuffling the
pushes and pops to save the 8-byte alignment padding? My assumption is
that the stack is always 16-byte aligned at a call, so at function entry
it holds the caller's 8-byte return address and is therefore offset by
8 bytes. (Also, I'm not sure that I need the .type directive; I was just
copying other code in libgcc. :)
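
For reference, the alignment accounting behind that question, assuming
the function is entered with %rsp offset by 8 (its caller's return
address on top) and calls the save stub as its very first instruction,
so that the stub itself is entered with %rsp back on a 16-byte boundary:

        /* %rsp is 0 (mod 16) on entry to the stub */
        pop     %rax            /* %rsp is 8 (mod 16) */
        push    %rdi            /* %rsp is 0 */
        sub     $0xa0,%rsp      /* 0xa0 = 10 * 16, so %rsp stays 0 */
        movaps  %xmm6,(%rsp)    /* 16-byte aligned, as movaps requires */
        /* ... %xmm7 through %xmm15 at 0x10(%rsp) through 0x90(%rsp) ... */
        push    %rsi            /* %rsp is 8 */
        push    %rax            /* %rsp is 0 */
        ret

With both %rdi and %rsi pushed before the sub, as in the earlier
version, %rsp would sit 8 bytes off a 16-byte boundary at that point, so
8 bytes of padding (0xa8 rather than 0xa0) would be needed to keep the
movaps stores aligned; splitting the pushes around the XMM block is what
saves that padding.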
        .text
        .global __msabi_save
        .hidden __msabi_save
#ifdef __ELF__
        .type   __msabi_save,@function
#endif
/* TODO: implement vmovaps when supported? */
__msabi_save:
#ifdef __x86_64__
        pop     %rax
        push    %rdi
        sub     $0xa0,%rsp
        movaps  %xmm6,(%rsp)
        movaps  %xmm7,0x10(%rsp)
        movaps  %xmm8,0x20(%rsp)
        movaps  %xmm9,0x30(%rsp)
        movaps  %xmm10,0x40(%rsp)
        movaps  %xmm11,0x50(%rsp)
        movaps  %xmm12,0x60(%rsp)
        movaps  %xmm13,0x70(%rsp)
        movaps  %xmm14,0x80(%rsp)
        movaps  %xmm15,0x90(%rsp)
        push    %rsi
        push    %rax
#endif /* __x86_64__ */
        ret

        .text
        .global __msabi_restore
        .hidden __msabi_restore
#ifdef __ELF__
        .type   __msabi_restore,@function
#endif
__msabi_restore:
#ifdef __x86_64__
        pop     %rsi
        movaps  (%rsp),%xmm6
        movaps  0x10(%rsp),%xmm7
        movaps  0x20(%rsp),%xmm8
        movaps  0x30(%rsp),%xmm9
        movaps  0x40(%rsp),%xmm10
        movaps  0x50(%rsp),%xmm11
        movaps  0x60(%rsp),%xmm12
        movaps  0x70(%rsp),%xmm13
        movaps  0x80(%rsp),%xmm14
        movaps  0x90(%rsp),%xmm15
        add     $0xa0,%rsp
        pop     %rdi
#endif /* __x86_64__ */
        ret
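
For reference, the way the two stubs appear intended to fit together,
given that __msabi_restore begins by popping the saved %rsi off the top
of the stack and ends with a bare ret: the prologue calls __msabi_save,
and the epilogue jumps (rather than calls) to __msabi_restore, whose
final ret then returns to the function's original caller. A hypothetical
usage sketch; the function name below is illustrative only:

some_msabi_fn:
        call    __msabi_save    /* leaves %rdi, %xmm6-%xmm15 and %rsi
                                   saved below the return address */
        /* ... function body; %rsp must be back at the post-save
           position before the jump below ... */
        jmp     __msabi_restore /* pops %rsi, reloads %xmm6-%xmm15, pops
                                   %rdi, then returns to some_msabi_fn's
                                   caller */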
Thanks!
Daniel