The following source implements the __absv?i2() functions (see
<https://gcc.gnu.org/onlinedocs/gccint/Integer-library-routines.html>)
for 32-bit, 64-bit and 128-bit integers in 3 different ways:

--- ub_or_!ub.c ---
// Copyleft 2014-2020, Stefan Kanthak

#ifdef __amd64__
// Absolute value of a 128-bit signed integer by conditional negation.
// The second test is meant to trap when the value is still negative after
// the negation, which can only happen for the most negative input.
// Negating that input is signed overflow, i.e. undefined behaviour in C,
// so a compiler may assume the second test is never true.
__int128_t __absuti2(__int128_t argument) {
    if (argument < 0)
        argument = -argument;
    if (argument < 0)           // intended overflow check
        __builtin_trap();
    return argument;
}

// Absolute value of a 128-bit signed integer, branch-free "add, then xor"
// form: with sign == -1 for negative input, (x + sign) ^ sign == ~(x - 1)
// == -x; with sign == 0 the value is unchanged.
// Traps if the result is still negative.
// NOTE(review): for the most negative input, `argument += sign` computes
// MIN + (-1), which is itself signed overflow -- confirm whether this
// variant is really free of undefined behaviour.
__int128_t __absvti2(__int128_t argument) {
    const __int128_t sign = 0 - (argument < 0);    // 0 or -1 (all bits set)
    argument += sign;
    argument ^= sign;
    if (argument < 0)
        __builtin_trap();
    return argument;
}

// Absolute value of a 128-bit signed integer, branch-free "xor, then
// subtract" form: with sign == -1 for negative input, (x ^ sign) - sign
// == ~x + 1 == -x; with sign == 0 the value is unchanged.
// Traps if the result is still negative.
// NOTE(review): for the most negative input, `argument -= sign` computes
// MAX - (-1), which is itself signed overflow -- confirm whether this
// variant is really free of undefined behaviour.
__int128_t __abswti2(__int128_t argument) {
    // Relies on >> of a negative value being an arithmetic shift
    // (implementation-defined in C): sign becomes 0 or -1.
    const __int128_t sign = argument >> 127;
    argument ^= sign;
    argument -= sign;
    if (argument < 0)
        __builtin_trap();
    return argument;
}
#endif // __amd64__

// Absolute value of a long long by conditional negation.
// The second test is meant to trap when the value is still negative after
// the negation (most negative input only). Negating that input is signed
// overflow, i.e. undefined behaviour in C, so a compiler may assume the
// second test is never true.
long long __absudi2(long long argument) {
    if (argument < 0)
        argument = -argument;
    if (argument < 0)           // intended overflow check
        __builtin_trap();
    return argument;
}

// Absolute value of a long long, branch-free "xor, then subtract" form
// with the sign mask derived from a comparison: (x ^ sign) - sign is -x
// when sign == -1 and x when sign == 0.
// Traps if the result is still negative.
// NOTE(review): for the most negative input, `argument -= sign` computes
// MAX - (-1), which is itself signed overflow -- confirm whether this
// variant is really free of undefined behaviour.
long long __absvdi2(long long argument) {
    const long long sign = 0 - (argument < 0);     // 0 or -1 (all bits set)
    argument ^= sign;
    argument -= sign;
    if (argument < 0)
        __builtin_trap();
    return argument;
}

// Absolute value of a long long, branch-free "add, then xor" form with
// the sign mask derived from a shift: (x + sign) ^ sign is -x when
// sign == -1 and x when sign == 0.
// Traps if the result is still negative.
// NOTE(review): for the most negative input, `argument += sign` computes
// MIN + (-1), which is itself signed overflow -- confirm whether this
// variant is really free of undefined behaviour.
long long __abswdi2(long long argument) {
    // Relies on >> of a negative value being an arithmetic shift
    // (implementation-defined in C) and on long long being 64 bits wide.
    const long long sign = argument >> 63;
    argument += sign;
    argument ^= sign;
    if (argument < 0)
        __builtin_trap();
    return argument;
}

// Absolute value of an int by conditional negation.
// The second test is meant to trap when the value is still negative after
// the negation (INT_MIN only). Negating INT_MIN is signed overflow, i.e.
// undefined behaviour in C, so a compiler may assume the second test is
// never true.
int __absusi2(int argument) {
    if (argument < 0)
        argument = -argument;
    if (argument < 0)           // intended overflow check
        __builtin_trap();
    return argument;
}

// Absolute value of an int, branch-free "xor, then subtract" form with
// the sign mask derived from a comparison: (x ^ sign) - sign is -x when
// sign == -1 and x when sign == 0.
// Traps if the result is still negative.
// NOTE(review): for INT_MIN, `argument -= sign` computes INT_MAX - (-1),
// which is itself signed overflow -- confirm whether this variant is
// really free of undefined behaviour.
int __absvsi2(int argument) {
    const int sign = 0 - (argument < 0);            // 0 or -1 (all bits set)
    argument ^= sign;
    argument -= sign;
    if (argument < 0)
        __builtin_trap();
    return argument;
}

// Absolute value of an int, branch-free "add, then xor" form with the
// sign mask derived from a shift: (x + sign) ^ sign is -x when sign == -1
// and x when sign == 0.
// Traps if the result is still negative.
// NOTE(review): for INT_MIN, `argument += sign` computes INT_MIN + (-1),
// which is itself signed overflow -- confirm whether this variant is
// really free of undefined behaviour.
int __abswsi2(int argument) {
    // Relies on >> of a negative value being an arithmetic shift
    // (implementation-defined in C) and on int being 32 bits wide.
    const int sign = argument >> 31;
    argument += sign;
    argument ^= sign;
    if (argument < 0)
        __builtin_trap();
    return argument;
}
--- EOF ---

Compile it with GCC 10.2, printing the assembly:

gcc -o- -O3 -S -Wall -Wextra ub_or_!ub.c

NOTE: older versions of GCC generate BAD code for the expression
      0 - (argument < 0)

(Output rearranged in 3 columns to ease comparison)

__absuti2:                   __absvti2:                   __abswti2:
        movq    %rsi, %rax           movq    %rdi, %rax           movq    %rsi, %rax
        movq    %rdi, %r8            movq    %rsi, %rdi           movq    %rdi, %r8
        movq    %rsi, %rcx           movq    %rsi, %rdx           movq    %rsi, %rcx
        sarq    $63, %rax            sarq    $63, %rdi            sarq    $63, %rax
        movq    %rax, %rsi           movslq  %edi, %rcx           movq    %rax, %rsi
        xorq    %rax, %r8            movq    %rcx, %rsi           xorq    %rax, %r8
        xorq    %rsi, %rcx           sarq    $63, %rcx            xorq    %rsi, %rcx
        movq    %r8, %rax            addq    %rsi, %rax           movq    %r8, %rax
        movq    %rcx, %rdx           movq    %rcx, %rdi           movq    %rcx, %rdx
        subq    %rsi, %rax           adcq    %rcx, %rdx           subq    %rsi, %rax
        sbbq    %rsi, %rdx           xorq    %rsi, %rax           sbbq    %rsi, %rdx
        ret                          xorq    %rdi, %rdx           testq   %rdx, %rdx
                                     jns     .L2                  jns     .L5
                                     ud2                          ud2
                             .L2:                         .L5:
                                     ret                          ret


__absudi2:                   __absvdi2:                   __abswdi2:
        movq    %rdi, %rax           movq    %rdi, %rax           movq    %rdi, %rdx
        cqto                         cqto                         sarq    $63, %rdx
        xorq    %rdx, %rax           movslq  %edx, %rdx           leaq    (%rdi,%rdx), %rax
        subq    %rdx, %rax           xorq    %rdx, %rax           xorq    %rdx, %rax
        ret                          subq    %rdx, %rax           jns     .L10
                                     jns     .L8                  ud2
                                     ud2                  .L10:
                             .L8:                                 ret
                                     ret


__absusi2:                   __absvsi2:                   __abswsi2:
        movl    %edi, %eax           movl    %edi, %eax           movl    %edi, %edx
        cltd                         movl    %edi, %edx           sarl    $31, %edx
        xorl    %edx, %eax           shrl    $31, %eax            leal    (%rdi,%rdx), %eax
        subl    %edx, %eax           movl    %eax, %edi           xorl    %edx, %eax
        ret                          negl    %edi                 jns     .L15
                                     xorl    %edx, %edi           ud2
                                     addl    %edi, %eax   .L15:
                                     jns     .L13                 ret
                                     ud2
                             .L13:
                                     ret


1. The 3 absu?i2() functions demonstrate that GCC (ab)uses the undefined
   behaviour of unary minus/negation for INT_MIN to "optimise" the test
   for overflow detection following the negation away -- WITHOUT warning
   the user, despite the -Wall and -Wextra options!

2. The 3 absv?i2() and the 3 absw?i2() functions demonstrate that GCC
   FAILS to recognise the two common and well-known patterns for abs()
   -- although it uses this pattern itself -- and does NOT remove the
   test for overflow detection.

Is this inconsistent behaviour intended?

3. The MOVSLQ instruction in the __absvdi2() function is nonsense: the
   preceding CQTO instruction just extended the (sign of the) argument
   from RAX into RDX.

4. The code generated for the __absvsi2() function is clumsy and quite
   BAD: GCC should generate the same code as for the __absvdi2() function,
   using the 32-bit registers instead of the 64-bit registers.

5. The register allocation in the __abswsi2() and __abswdi2() functions
   is BAD; the LEA instruction should be replaced with a shorter ADD,
   and the SAR with a shorter CLTD/CQTO:

__abswsi2:                   __abswsi2:                   __abswdi2:
        movl    %edi, %eax           movl    %edi, %eax           movq    %rdi, %rax
        sarl    $31, %edi            cltd                         cqto
        addl    %edi, %eax           addl    %edx, %eax           addq    %rdx, %rax
        xorl    %edi, %eax           xorl    %edx, %eax           xorq    %rdx, %rax
        jns     .L15                 jns     .L15                 jns     .L10
        ud2                          ud2                          ud2
.L15:                        .L15:                        .L10:
        ret                          ret                          ret

6. The register allocation in the __abs?ti2() functions is also REALLY
   bad, leading to 4 superfluous MOV instructions; additionally the
   MOVSLQ and TESTQ as well as the second SARQ are superfluous:

# Proposed code for __absvti2 (AT&T syntax; System V AMD64: 128-bit
# argument in %rsi:%rdi, 128-bit result in %rdx:%rax).
__absvti2:
        movq    %rsi, %rax              # rax = high half of the argument
        cqto                            # rdx = sign mask (0 or -1) from bit 63 of rax
        movq    %rdx, %rax              # rax = sign mask as well
        addq    %rdx, %rdi              # low half  += sign
        adcq    %rdx, %rsi              # high half += sign + carry
        xorq    %rdi, %rax              # result low  = (low  + sign) ^ sign
        xorq    %rsi, %rdx              # result high = (high + sign) ^ sign; sets SF
        jns     .L2                     # result non-negative: return it
        ud2                             # otherwise trap
.L2:
        ret

Conclusion: there's MUCH room for improvement in the code generator and
the (peephole) optimiser! 

Stefan

Reply via email to