Hi, compile the following naive implementation of nextafter() for AMD64:
JFTR: ignore the aliasing casts; they don't matter here!

$ cat repro.c
double nextafter(double from, double to)
{
    if (to != to)
        return to;              // to is NAN
    if (from != from)
        return from;            // from is NAN
    if (from == to)             // neither from nor to can be NAN here!
        return to;
    if (from == 0.0)            // ditto!
        return to < 0.0 ? -0x1.0p-1074 : 0x1.0p-1074;

    unsigned long long ull = *(unsigned long long *) &from;

    if ((from < to) == (from < 0.0))
        ull--;
    else
        ull++;

    return *(double *) &ull;
}
$ gcc -m64 -o- -O3 -S repro.c
...
nextafter:
        ucomisd %xmm1, %xmm1    // sets PF for unordered result, i.e. when at
        jp      .L10            //  least one operand is NAN
        ucomisd %xmm0, %xmm0    // same here
        jp      .L1
        ucomisd %xmm0, %xmm1
        jnp     .L14            // OUCH: PF can't be set here!
                                // OUCH: and if it were, it's MORE LIKELY to be
                                //       clear, so this branch would be taken
                                //       ... against the branch prediction
.L11:
        pxor    %xmm2, %xmm2    // OUCH: switching from FP SSE to integer SSE and
                                //       vice versa incurs a penalty of 1 cycle
                                //       on quite a lot of Intel Core processors!
                                //       Better use XORPD instead (which is even
                                //       1 byte shorter)!
        ucomisd %xmm2, %xmm0
        jnp     .L15            // OUCH: there's still no need to check PF here!
.L4:
        comisd  %xmm0, %xmm1
        movq    %xmm0, %rdx
        leaq    -1(%rdx), %rax
        seta    %r8b
        comisd  %xmm0, %xmm2
        seta    %cl
        addq    $1, %rdx
        cmpb    %cl, %r8b
        cmovne  %rdx, %rax
        movq    %rax, %xmm0
.L1:
        ret
.L14:
        jne     .L11
.L10:
        movapd  %xmm1, %xmm0
        ret
.L15:
        jne     .L4
        movabsq $-9223372036854775808, %rdx
        movq    %xmm1, %rax
        andq    %rdx, %rax
        orq     $1, %rax
        movq    %rax, %xmm0
        ret

Stefan
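
PS: just for reference, a sketch of the same bit twiddling written without
the aliasing casts, going through memcpy() instead; with -O3 GCC should fold
the fixed-size copies into plain MOVQs, so the code generation shown above is
not expected to change. The name nextafter_memcpy is of course made up.

#include <string.h>

double nextafter_memcpy(double from, double to)     // hypothetical name
{
    if (to != to)
        return to;                      // to is NAN
    if (from != from)
        return from;                    // from is NAN
    if (from == to)
        return to;
    if (from == 0.0)
        return to < 0.0 ? -0x1.0p-1074 : 0x1.0p-1074;

    unsigned long long ull;
    memcpy(&ull, &from, sizeof ull);    // well-defined type punning
    if ((from < to) == (from < 0.0))
        ull--;
    else
        ull++;
    memcpy(&from, &ull, sizeof from);   // and back to double
    return from;
}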