EPIC optimiser failures (i386)

Stefan Kanthak Mon, 09 Jan 2023 03:51:27 -0800

Hi,

compile the following routine for the i386 processor,
with optimisation:


--- double.c  ---
// IEEE-754 binary64 double-precision floating point

// binary64 != ±0.0 ->  0
// binary64 == +0.0 -> +1
// binary64 == -0.0 -> -1

// Classifies a 64-bit pattern: returns +1 when it is 0 (+0.0), -1 when it
// is the only other value equal to its own negation (-0.0), and 0 otherwise.
int plusminus0(unsigned long long binary64)
{
    // Unsigned negation wraps modulo 2^64, so x == -x holds only for
    // 0 and 0x8000000000000000 (sign bit alone).
    if (binary64 != -binary64) // neither +0.0 nor -0.0
        return 0;
    if (binary64 == 0) // +0.0
        return 1;
    return -1; // -0.0
}
--- EOF ---

GCC 12.2    gcc -m32 -O2 double.c

# https://godbolt.org/z/17as1M1xM
# GCC's code: builds -binary64 in edx:eax, XORs it with binary64 and ORs both
# halves to test binary64 == -binary64, then derives +1/-1 with cmp/sbb/and/sub.
plusminus0(unsigned long long):
        push    esi
        push    ebx
        mov     ecx, DWORD PTR [esp+12]  # ecx = lower half of binary64 (two pushes above)
        mov     ebx, DWORD PTR [esp+16]  # ebx = upper half of binary64
        mov     eax, ecx
        neg     eax                      # eax = -lower half, CF = (lower half != 0)
        mov     edx, ebx
        adc     edx, 0                   # edx = upper half + borrow
        xor     eax, ecx                 # eax = lower half of (-binary64 ^ binary64)
        neg     edx                      # edx = upper half of -binary64
        xor     edx, ebx                 # edx = upper half of (-binary64 ^ binary64)
        or      eax, edx                 # ZF = (binary64 == -binary64)
        jne     .L5                      # neither +0.0 nor -0.0: return 0
        or      ecx, ebx                 # ecx = 0 only if binary64 == 0
        pop     ebx
        cmp     ecx, 1                   # CF = (binary64 == 0)
        sbb     esi, esi                 # esi = (binary64 == 0) ? -1 : 0
        and     esi, 2                   # esi = (binary64 == 0) ? 2 : 0
        sub     esi, 1                   # esi = (binary64 == 0) ? 1 : -1
        mov     eax, esi
        pop     esi
        ret
.L5:
        xor     esi, esi                 # result 0
        pop     ebx
        mov     eax, esi
        pop     esi
        ret

OUCH: these 27 instructions in 56 bytes are as BAD^WHORRIBLE as code
      could get!

EVERY optimising^Wcompiler writer should be aware that

    if (binary64 == -binary64)

is just a shorthand for

    if (binary64 == 0 - binary64)

and thus equivalent to

    if (binary64 + binary64 == 0)

which SHOULD lead to the following (optionally branch-free) code:

        # Returns +1 for +0.0, -1 for -0.0 and 0 for every other bit pattern.
        # Define BRANCHFREE to assemble the branch-free tail instead of the jz.
        mov     ecx, dword ptr [esp+4]
        mov     edx, dword ptr [esp+8]  # edx:ecx = binary64
        add     ecx, ecx
        adc     edx, edx                # edx:ecx = binary64 + binary64, CF = sign bit
        sbb     eax, eax                # eax = (binary64 < 0) ? -1 : 0
        stc                             # CF = 1
        adc     eax, eax                # eax = (binary64 < 0) ? -1 : 1
        or      ecx, edx                # ecx = (binary64 == -binary64) ? 0 : *
.ifnotdef BRANCHFREE
        jz      .L0                     # binary64 == -binary64? keep -1 or 1
        xor     eax, eax                # neither +0.0 nor -0.0: eax = 0
.L0:
.else
        neg     ecx                     # CF = (binary64 != -binary64)
        sbb     ecx, ecx                # ecx = (binary64 != -binary64) ? -1 : 0
        not     ecx                     # ecx = (binary64 == -binary64) ? -1 : 0
        and     eax, ecx
.endif
        ret

Either 11 instructions in 24 bytes or 13 instructions in 28 bytes,
i.e. less than half the instructions and at most half the bytes!

Since the lower half of the binary64 only needs to be tested against 0,
a TRUE optimising compiler would but come up with the following code:

        # Branch-free: edx becomes -1 or 1 from the sign, eax becomes an all-ones
        # mask only when every bit except the sign is clear; the AND combines them.
        mov     eax, dword ptr [esp+8]  # upper half of binary64
        cdq                             # edx = (binary64 < 0) ? -1 : 0
        stc                             # CF = 1
        adc     edx, edx                # edx = (binary64 < 0) ? -1 : 1
        add     eax, eax                # shift the sign bit out of the upper half
        or      eax, dword ptr [esp+4]  # eax = 0 only if all bits but the sign are clear
        neg     eax                     # CF = (binary64 != -binary64)
        sbb     eax, eax                # eax = (binary64 != -binary64) ? -1 : 0
        not     eax                     # eax = (binary64 == -binary64) ? -1 : 0
        and     eax, edx                # eax = -1 or 1 for -0.0 or +0.0, else 0
        ret

11 instructions in 23 bytes.


--- single.c  ---
// IEEE-754 binary32 single-precision floating point

// Classifies a bit pattern: returns +1 when it is 0 (+0.0), -1 when it is
// the only other value equal to its own negation (-0.0), and 0 otherwise.
// NOTE(review): the GCC listing for this file is labelled
// plusminus0(unsigned int); unsigned long is presumably 32 bits wide here
// only because of -m32 -- confirm which type was actually compiled.
int plusminus0(unsigned long binary32)
{
    // Unsigned negation wraps, so x == -x holds only for 0 and the value
    // with just the top bit set.
    if (binary32 != -binary32) // neither +0.0 nor -0.0
        return 0;
    if (binary32 == 0) // +0.0
        return 1;
    return -1; // -0.0
}
--- EOF ---

GCC 12.2    gcc -m32 -O2 single.c

# https://godbolt.org/z/djT748e81
# GCC's code: compares -binary32 with binary32, returns 0 on mismatch,
# otherwise derives +1/-1 with cmp/sbb/and/sub.
plusminus0(unsigned int):
        mov     edx, DWORD PTR [esp+4]  # edx = binary32
        xor     eax, eax                # eax = 0, the result when the jne is taken
        mov     ecx, edx
        neg     ecx                     # ecx = -binary32
        cmp     ecx, edx
        jne     .L1                     # binary32 != -binary32: return 0
        cmp     ecx, 1                  # CF = (-binary32 == 0)
        sbb     eax, eax                # eax = (binary32 == 0) ? -1 : 0
        and     eax, 2                  # eax = (binary32 == 0) ? 2 : 0
        sub     eax, 1                  # eax = (binary32 == 0) ? 1 : -1
.L1:
        ret

OOPS (11 instructions in 26 bytes)!
An optimising compiler SHOULD but generate 8 instructions in 16 bytes:

        # One add yields both facts needed: ZF says whether binary32 is +0.0 or
        # -0.0, CF carries the sign bit that selects between +1 and -1.
        xor     eax, eax                # eax = 0, the result when the jnz is taken
        mov     ecx, DWORD PTR [esp+4]  # ecx = binary32
        add     ecx, ecx                # CF = sign bit, ZF = (binary32 == -binary32)
        jnz     .L1                     # binary32 != -binary32?
        sbb     eax, eax                # eax = (binary32 < 0) ? -1 : 0
        stc                             # CF = 1
        adc     eax, eax                # eax = (binary32 < 0) ? -1 : 1
.L1:
        ret


A TRUE optimising compiler would but generate the following branch-free
code, using 7 or 8 instructions in 18 bytes:

# Branch-free: computes 0 - binary32, then returns ZF - OF sign-extended to
# 32 bits. Each .if 0 / .else pair holds two alternative encodings of a step.
.if 0
        mov     eax, DWORD PTR [esp+4]
        neg     eax                     # OF = (binary32 == -0.0),
                                        # ZF = (binary32 == +0.0)
.else
        xor     eax, eax
        sub     eax, DWORD PTR [esp+4]  # eax = 0 - binary32, OF and ZF as for the neg
.endif
        seto    ah                      # ah = OF
        setz    al                      # al = ZF
        sub     al, ah                  # al = ZF - OF
.if 0
        cbw                             # ax = sign-extended al
        cwde                            # eax = sign-extended ax
.else
        movsx   eax, al                 # eax = sign-extended al
.endif
        ret

Stefan Kanthak

EPIC optimiser failures (i386)

Reply via email to