Hi, compile the following naive implementation of nextafter() for AMD64:
JFTR: ignore the aliasing casts; they don't matter here!

$ cat repro.c
double nextafter(double from, double to)
{
    if (to != to)
        return to;              // to is NAN
    if (from != from)
        return from;            // from is NAN
    if (from == to)             // neither from nor to can be NAN here!
        return to;
    if (from == 0.0)            // ditto!
        return to < 0.0 ? -0x1.0p-1074 : 0x1.0p-1074;

    unsigned long long ull = *(unsigned long long *) &from;

    if ((from < to) == (from < 0.0))
        ull--;
    else
        ull++;

    return *(double *) &ull;
}
$ gcc -m64 -o- -O3 -S repro.c
...
nextafter:
        ucomisd %xmm1, %xmm1    // sets PF for unordered result, i.e. when at
        jp      .L10            //  least one operand is NAN
        ucomisd %xmm0, %xmm0    // same here
        jp      .L1
        ucomisd %xmm0, %xmm1
        jnp     .L14            // OUCH: PF can't be set here!
                                // OUCH: and if it were, it's MORE LIKELY to be
                                //       clear, so this branch would be taken
                                //       ... against the branch prediction
.L11:
        pxor    %xmm2, %xmm2    // OUCH: switching from FP SSE to integer SSE and
                                //       vice versa incurs a penalty of 1 cycle
                                //       on quite a lot of Intel Core processors!
                                //       Better use XORPD instead (which is even
                                //       1 byte shorter)!
        ucomisd %xmm2, %xmm0
        jnp     .L15            // OUCH: there's still no need to check PF here!
.L4:
        comisd  %xmm0, %xmm1
        movq    %xmm0, %rdx
        leaq    -1(%rdx), %rax
        seta    %r8b
        comisd  %xmm0, %xmm2
        seta    %cl
        addq    $1, %rdx
        cmpb    %cl, %r8b
        cmovne  %rdx, %rax
        movq    %rax, %xmm0
.L1:
        ret
.L14:
        jne     .L11
.L10:
        movapd  %xmm1, %xmm0
        ret
.L15:
        jne     .L4
        movabsq $-9223372036854775808, %rdx
        movq    %xmm1, %rax
        andq    %rdx, %rax
        orq     $1, %rax
        movq    %rax, %xmm0
        ret

Stefan
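
PS: just for reference, a sketch of the same bit twiddling written without
the aliasing casts, going through memcpy() instead; with -O3 GCC should fold
the fixed-size copies into plain MOVQs, so the code generation shown above is
not expected to change. The name nextafter_memcpy is of course made up.

#include <string.h>

double nextafter_memcpy(double from, double to)     // hypothetical name
{
    if (to != to)
        return to;                      // to is NAN
    if (from != from)
        return from;                    // from is NAN
    if (from == to)
        return to;
    if (from == 0.0)
        return to < 0.0 ? -0x1.0p-1074 : 0x1.0p-1074;

    unsigned long long ull;
    memcpy(&ull, &from, sizeof ull);    // well-defined type punning
    if ((from < to) == (from < 0.0))
        ull--;
    else
        ull++;
    memcpy(&from, &ull, sizeof from);   // and back to double
    return from;
}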