(I'll cross-post this to gcc and keep it on gcc-help after that.)

On Thu, Oct 6, 2011 at 4:46 PM, Andrew Haley <a...@redhat.com> wrote:
>
> inline int8_t as_signed_8 (unsigned int a) {
>  a &= 0xff;
>  return a & 0x80 ? (int)a - 0x100 : a;
> }
>
> int overflow(unsigned int a, unsigned int b) {
>  int sum = as_signed_8(a) + as_signed_8(b);
>  return as_signed_8(sum) != sum;
> }
>
> Andrew.
>

That's a really neat trick, and seems to generate identical code. Thanks!

It would be interesting to know whether this version produces equally
efficient code with MSVC.

To summarize what we have so far, here are four different methods along
with the code generated for x86 and ARM (GCC 4.5.2):

#include <inttypes.h>

// Portably reinterpret the low 8 bits of a as a signed value in
// [-128, 127], without relying on implementation-defined
// unsigned-to-signed conversion.
inline int8_t as_signed_8(unsigned int a) {
    a &= 0xff;
    return a & 0x80 ? (int)a - 0x100 : a;
}

// Overflow iff the true sum falls outside the int8_t range.
bool overflow_range(unsigned int a, unsigned int b) {
    const int sum = as_signed_8(a) + as_signed_8(b);
    return sum < -128 || sum > 127;
}

// Sign-bit trick: signed overflow occurred iff the operands have the
// same sign bit but the sum's sign bit differs from theirs.
bool overflow_bit(unsigned int a, unsigned int b) {
    const unsigned int sum = a + b;
    return ~(a ^ b) & (a ^ sum) & 0x80;
}

// "Unsafe" because converting an out-of-range value to int8_t is
// implementation-defined (it does the expected thing on GCC).
bool overflow_unsafe(unsigned int a, unsigned int b) {
    const unsigned int sum = (int8_t)a + (int8_t)b;
    return (int8_t)sum != sum;
}

// Same idea as overflow_unsafe, but using the portable helper above.
bool overflow_safe(unsigned int a, unsigned int b) {
    const int sum = as_signed_8(a) + as_signed_8(b);
    return as_signed_8(sum) != sum;
}
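
For what it's worth, since there are only 256*256 interesting input
pairs, the four variants can be cross-checked exhaustively. Something
like this (untested sketch, just dropped on top of the functions above)
should do:

#include <cstdio>

// Exhaustive cross-check of the four variants over all 8-bit pairs;
// prints any inputs on which they disagree.
int main() {
    for (unsigned int a = 0; a < 256; ++a) {
        for (unsigned int b = 0; b < 256; ++b) {
            const bool r = overflow_range(a, b);
            if (r != overflow_bit(a, b) || r != overflow_unsafe(a, b)
                    || r != overflow_safe(a, b))
                std::printf("mismatch at a=%u b=%u\n", a, b);
        }
    }
    return 0;
}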



Output for x86 with -O3 -fomit-frame-pointer:

00000000 <_Z14overflow_rangejj>:
   0:   0f be 54 24 04          movsbl 0x4(%esp),%edx
   5:   0f be 44 24 08          movsbl 0x8(%esp),%eax
   a:   8d 84 02 80 00 00 00    lea    0x80(%edx,%eax,1),%eax
  11:   3d ff 00 00 00          cmp    $0xff,%eax
  16:   0f 97 c0                seta   %al
  19:   c3                      ret
  1a:   8d b6 00 00 00 00       lea    0x0(%esi),%esi

00000020 <_Z12overflow_bitjj>:
  20:   8b 54 24 08             mov    0x8(%esp),%edx
  24:   8b 4c 24 04             mov    0x4(%esp),%ecx
  28:   89 d0                   mov    %edx,%eax
  2a:   31 c8                   xor    %ecx,%eax
  2c:   01 ca                   add    %ecx,%edx
  2e:   31 ca                   xor    %ecx,%edx
  30:   f7 d0                   not    %eax
  32:   21 d0                   and    %edx,%eax
  34:   a8 80                   test   $0x80,%al
  36:   0f 95 c0                setne  %al
  39:   c3                      ret
  3a:   8d b6 00 00 00 00       lea    0x0(%esi),%esi

00000040 <_Z15overflow_unsafejj>:
  40:   0f be 54 24 08          movsbl 0x8(%esp),%edx
  45:   0f be 44 24 04          movsbl 0x4(%esp),%eax
  4a:   8d 04 02                lea    (%edx,%eax,1),%eax
  4d:   0f be d0                movsbl %al,%edx
  50:   39 c2                   cmp    %eax,%edx
  52:   0f 95 c0                setne  %al
  55:   c3                      ret
  56:   8d 76 00                lea    0x0(%esi),%esi
  59:   8d bc 27 00 00 00 00    lea    0x0(%edi,%eiz,1),%edi

00000060 <_Z13overflow_safejj>:
  60:   0f be 54 24 08          movsbl 0x8(%esp),%edx
  65:   0f be 44 24 04          movsbl 0x4(%esp),%eax
  6a:   8d 04 02                lea    (%edx,%eax,1),%eax
  6d:   0f be d0                movsbl %al,%edx
  70:   39 c2                   cmp    %eax,%edx
  72:   0f 95 c0                setne  %al
  75:   c3                      ret



Output for ARM with -O3 -fomit-frame-pointer -mthumb -march=armv7:

00000000 <_Z14overflow_rangejj>:
   0:   b249            sxtb    r1, r1
   2:   b240            sxtb    r0, r0
   4:   1808            adds    r0, r1, r0
   6:   3080            adds    r0, #128        ; 0x80
   8:   28ff            cmp     r0, #255        ; 0xff
   a:   bf94            ite     ls
   c:   2000            movls   r0, #0
   e:   2001            movhi   r0, #1
  10:   4770            bx      lr
  12:   bf00            nop
  14:   f3af 8000       nop.w
  18:   f3af 8000       nop.w
  1c:   f3af 8000       nop.w

00000020 <_Z12overflow_bitjj>:
  20:   180b            adds    r3, r1, r0
  22:   4041            eors    r1, r0
  24:   ea83 0200       eor.w   r2, r3, r0
  28:   ea22 0001       bic.w   r0, r2, r1
  2c:   f3c0 10c0       ubfx    r0, r0, #7, #1
  30:   4770            bx      lr
  32:   bf00            nop
  34:   f3af 8000       nop.w
  38:   f3af 8000       nop.w
  3c:   f3af 8000       nop.w

00000040 <_Z15overflow_unsafejj>:
  40:   b242            sxtb    r2, r0
  42:   b249            sxtb    r1, r1
  44:   1888            adds    r0, r1, r2
  46:   b243            sxtb    r3, r0
  48:   1a18            subs    r0, r3, r0
  4a:   bf18            it      ne
  4c:   2001            movne   r0, #1
  4e:   4770            bx      lr

00000050 <_Z13overflow_safejj>:
  50:   b242            sxtb    r2, r0
  52:   b249            sxtb    r1, r1
  54:   1888            adds    r0, r1, r2
  56:   b243            sxtb    r3, r0
  58:   1a18            subs    r0, r3, r0
  5a:   bf18            it      ne
  5c:   2001            movne   r0, #1
  5e:   4770            bx      lr


I'm not sure which version would be fastest on ARM (I have no device
handy to benchmark on).

By the way, what's a nice way to benchmark snippets like this with
optimization on? If you call each function in a loop from a different
compilation unit, the call overhead tends to dominate. If you instead
put it in the same compilation unit and let it inline, the compiler
might do things you don't expect, which renders the benchmark useless.
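
One approach that seems to work (sketch only; the opaque() helper is
just something I made up for this post) is to keep everything in one
compilation unit but hide the inputs and the result from the optimizer
with empty asm barriers, so the function can inline but can't be
hoisted out of the loop or folded away:

// Optimization barrier: GCC must assume v may have changed.
static inline unsigned int opaque(unsigned int v) {
    asm volatile("" : "+r"(v));
    return v;
}

void bench_overflow_bit(unsigned long iterations) {
    for (unsigned long i = 0; i < iterations; ++i) {
        const unsigned int a = opaque(i & 0xff);
        const unsigned int b = opaque((i >> 8) & 0xff);
        const bool r = overflow_bit(a, b);
        asm volatile("" :: "r"(r));  // keep the result alive
    }
}

Timing that loop and subtracting an empty-loop baseline with the same
barriers should then give a rough per-call cost.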

/Ulf
