https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97387
--- Comment #14 from fdlbxtqi <euloanty at live dot com> --- (In reply to fdlbxtqi from comment #13) > https://godbolt.org/z/fqGrz1 > > After this patch, the assembly generated is much better now. However, it > still contains many optimization problems. > > The problem is in code like this. > > Let's just walk through the assembly and see the problems here. > > field_number operator-(field_number const& x,field_number const& y) noexcept > { > using namespace intrinsics; > using unsigned_type = field_number::value_type; > constexpr unsigned_type zero{}; > field_number f; > bool borrow{sub_borrow(false,x[0],y[0],f[0])}; > borrow=sub_borrow(borrow,x[1],y[1],f[1]); > borrow=sub_borrow(borrow,x[2],y[2],f[2]); > borrow=sub_borrow(borrow,x[3],y[3],f[3]); > unsigned_type v{}; > sub_borrow(borrow,v,v,v); > v&=static_cast<unsigned_type>(38); > borrow=sub_borrow(false,f[0],v,f[0]); > borrow=sub_borrow(borrow,f[1],zero,f[1]); > borrow=sub_borrow(borrow,f[2],zero,f[2]); > borrow=sub_borrow(borrow,f[3],zero,f[3]); > sub_borrow(borrow,v,v,v); > v&=static_cast<unsigned_type>(38); > borrow=sub_borrow(false,f[0],v,f[0]); > borrow=sub_borrow(borrow,f[1],zero,f[1]); > borrow=sub_borrow(borrow,f[2],zero,f[2]); > borrow=sub_borrow(borrow,f[2],zero,f[3]); > return f; > } > > > _ZN7fast_io10curve25519miERKNS0_12field_numberES3_: > .LFB5431: > .cfi_startproc > .cfi_personality 0x3,__gxx_personality_v0 > movq %rsi, %rcx > movq %rdx, %rax > movq %rdi, %r8 > movq (%rsi), %rdi > movq 24(%rcx), %r9 > subq (%rdx), %rdi > movq 8(%rsi), %rsi > sbbq 8(%rdx), %rsi > movq 16(%rcx), %rdx > sbbq 16(%rax), %rdx > movq 24(%rax), %rax > sbbq %rax, %r9 > > movl $0, %eax > movq %rax, %rcx > //The value of sbbq to itself does not matter. It should be sbbq %r9, %r9 and > you are done. > > sbbq %rax, %rcx > andl $38, %ecx > subq %rcx, %rdi > > //The 2nd problem is these %rax operands. 
They should be just imm 0 > // sbbq $0,%rsi > sbbq %rax, %rsi > sbbq %rax, %rdx > sbbq %rax, %r9 > sbbq %rcx, %rcx > andl $38, %ecx > subq %rcx, %rdi > sbbq %rax, %rsi > movq %rdi, -40(%rsp) > sbbq %rax, %rdx > movq %rsi, -32(%rsp) > movdqa -40(%rsp), %xmm0 > movq %rdx, -24(%rsp) > sbbq %rax, %rdx > movq %r8, %rax > movq %rdx, -16(%rsp) > movdqa -24(%rsp), %xmm1 > movups %xmm0, (%r8) > movups %xmm1, 16(%r8) > ret > > https://godbolt.org/z/nKPWx3 I think the optimal instruction count should be 26.