Hi, compile the following routine for the i386 processor, with optimisation:
--- double.c --- // IEEE-754 binary64 double-precision floating point // binary64 != ±0.0 -> 0 // binary64 == +0.0 -> +1 // binary64 == -0.0 -> -1 int plusminus0(unsigned long long binary64) { if (binary64 != -binary64) // neither +0.0 nor -0.0 return 0; if (binary64 == 0) return 1; return -1; } --- EOF --- GCC 12.2 gcc -m32 -O2 double.c # https://godbolt.org/z/17as1M1xM plusminus0(unsigned long long): push esi push ebx mov ecx, DWORD PTR [esp+12] mov ebx, DWORD PTR [esp+16] mov eax, ecx neg eax mov edx, ebx adc edx, 0 xor eax, ecx neg edx xor edx, ebx or eax, edx jne .L5 or ecx, ebx pop ebx cmp ecx, 1 sbb esi, esi and esi, 2 sub esi, 1 mov eax, esi pop esi ret .L5: xor esi, esi pop ebx mov eax, esi pop esi ret OUCH: these 27 instructions in 56 bytes are as BAD^WHORRIBLE as code could get! EVERY optimising^Wcompiler writer should be aware that if (binary64 == -binary64) is just a shorthand for if (binary64 == 0 - binary64) and thus equivalent to if (binary64 + binary64 == 0) which SHOULD lead to the following (optionally branch-free) code: mov ecx, dword ptr [esp+4] mov edx, dword ptr [esp+8] # edx:ecx = binary64 add ecx, ecx adc edx, edx sbb eax, eax # eax = (binary64 < 0) ? -1 : 0 .ifnotdef BRANCHFREE or ecx, edx jz .L0 # binary64 == -binary64? stc # CF = 1 adc eax, eax # eax = (binary64 < 0) ? -1 : 1 .L0: .else stc # CF = 1 adc eax, eax # eax = (binary64 < 0) ? -1 : 1 or ecx, edx # ecx = (binary64 == -binary64) ? 0 : * neg ecx # CF = (binary64 != -binary64) sbb ecx, ecx # ecx = (binary64 != -binary64) ? -1 : 0 not ecx # ecx = (binary64 == -binary64) ? -1 : 0 and eax, ecx .endif ret Either 10 instructions in 22 bytes or 13 instructions in 28 bytes, i.e. less than half the instructions and bytes! [Reviewer note: as listed, the branching (non-BRANCHFREE) variant jumps to .L0 exactly when binary64 == -binary64 and therefore skips "stc" / "adc eax, eax" for ±0.0, so it returns 0 for +0.0, -1 for -0.0, and +1 or -1 for every other value; to match double.c, "stc" / "adc eax, eax" must be executed before "or ecx, edx", followed by "jz .L0" and "xor eax, eax" on the not-equal path, which gives 11 instructions in 24 bytes.] Since the lower half of the binary64 only needs to be tested against 0, a TRUE optimising compiler would instead come up with the following code: mov eax, dword ptr [esp+8] # upper half of binary64 cdq # edx = (binary64 < 0) ?
-1 : 0 stc # CF = 1 adc edx, edx # edx = (binary64 < 0) ? -1 : 1 add eax, eax or eax, dword ptr [esp+4] neg eax # CF = (binary64 != -binary64) sbb eax, eax # eax = (binary64 != -binary64) ? -1 : 0 not eax # eax = (binary64 == -binary64) ? -1 : 0 and eax, edx ret 11 instructions in 23 bytes. --- single.c --- // IEEE-754 binary32 single-precision floating point int plusminus0(unsigned long binary32) { if (binary32 != -binary32) // neither +0.0 nor -0.0 return 0; if (binary32 == 0) return 1; return -1; } --- EOF --- GCC 12.2 gcc -m32 -O2 single.c # https://godbolt.org/z/djT748e81 plusminus0(unsigned int): mov edx, DWORD PTR [esp+4] xor eax, eax mov ecx, edx neg ecx cmp ecx, edx jne .L1 cmp ecx, 1 sbb eax, eax and eax, 2 sub eax, 1 .L1: ret OOPS (11 instructions in 26 bytes)! An optimising compiler SHOULD instead generate 8 instructions in 16 bytes: xor eax, eax mov ecx, DWORD PTR [esp+4] add ecx, ecx jnz .L1 # binary32 != -binary32? sbb eax, eax # eax = (binary32 < 0) ? -1 : 0 stc # CF = 1 adc eax, eax # eax = (binary32 < 0) ? -1 : 1 .L1: ret A TRUE optimising compiler would instead generate the following branch-free code, using 7 or 8 instructions in 19 or 18 bytes: .if 0 mov eax, DWORD PTR [esp+4] neg eax # OF = (binary32 == -0.0), # ZF = (binary32 == +0.0) .else xor eax, eax sub eax, DWORD PTR [esp+4] .endif seto ah setz al sub al, ah # al = ZF - OF .if 0 cbw cwde .else movsx eax, al .endif ret Stefan Kanthak