", div, mod and mul where operands are all positive

Explorer09 at gmail dot com via Gcc-bugs Wed, 20 Mar 2024 05:16:18 -0700

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114406


            Bug ID: 114406
           Summary: Optimizations with ">>", div, mod and mul where
                    operands are all positive
           Product: gcc
           Version: 13.2.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: Explorer09 at gmail dot com
  Target Milestone: ---

GCC missed some optimizations on the right shift (>>), division (/)
and modulo (%) operations where the signedness of the operations,
and the bit width of the operations, do not matter.

When the operands can be found at compile time to be all positive,
and the results can't overflow, then it won't matter whether a signed
(arithmetic) right shift or an unsigned (logical) shift is to be used.
The same applies to division, modulo and multiplication.

My main complaint was on the right shift, where I discovered that
(value = 0xFFFF >> count) and (value = 0xFFFFU >> count) can produce
different code size. Technically it shouldn't matter whether I used
that unsigned cast ('U' suffix) on the constant literal.

Here I mention two cases where the operands can be known to be
positive:
1. The operand is a constant, evaluated to be positive.
2. The operand originates from an unsigned type, up-cast to a longer,
   signed type.

(bug 102580 mentioned the third case where the operand can only be
nonnegative after a conditional, which is not part of my cases.)

Here is the test code where I think can help GCC developers.
The expected outcome would be all functions optimized to simply
{ return true; } statements. The actual results between GCC and Clang
are noted in comments.

```c
// Tested in Compiler Explorer (godbolt.org)
// GCC 13.2 x86-64 with '-Os' option
// Clang 18.1.0 x86-64 with '-Os' option

#include <stdbool.h>
#include <stdint.h>

// Right shift tests
// -----------------

bool test_rshift_1(unsigned char count) {
  // Positive constants in 'int' and 'unsigned int' types
  int a = 12345 >> count;
  int b = 12345U >> count;
  int32_t c = 0x7FFFFFFF >> count;
  uint32_t d = 0x7FFFFFFFU >> count;

  return a == b && (int32_t)c == d;
}
// test_rshift_1
// GCC fails; Clang succeeds
// (GCC can recognize 0x7FFFFFFF and 0x7FFFFFFFU are identical and
// emits one load instruction, but emits separate SAR and SHR (x86)
// instructions not knowing they make the same results in this case.)

bool test_rshift_2a(unsigned char count) {
  // Positive constants with different bit width
  long long a = 0x7FFFFFFF >> count;
  long long b = 0x7FFFFFFFLL >> count;
  unsigned long long c = 0x7FFFFFFFU >> count;
  unsigned long long d = 0x7FFFFFFFULL >> count;

  return a == b && c == d && (unsigned long long)b == d;
}
// test_rshift_2a
// Both GCC and Clang fail
// (Both do not recognize 0x7FFFFFFF and 0x7FFFFFFFLL are identical
// and that the load instructions can be merged.)

bool test_rshift_2b(unsigned char count) {
  // By the C standard, 0x80000000 should be interpreted as unsigned
  // but ultimately the width and signedness do not matter
  long long a = 0x80000000LL >> count;
  long long b = 0x80000000U >> count;
  long long c = 0x80000000 >> count;

  return a == b && b == c;
}
// test_rshift_2b
// Both GCC and Clang fail (details same as above)

bool test_rshift_3(uint16_t x, unsigned char count) {
  // 'x' is known to be positive despite the signed type up-cast
  int32_t a = (int32_t)x >> count;
  uint32_t b = (uint32_t)x >> count;

  return (uint32_t)a == b;
}
// test_rshift_3
// GCC fails (SAR and SHR not merged); Clang succeeds

bool test_rshift_4(uint32_t x, unsigned char count) {
  // 'x' is known to be positive despite the signed type up-cast
  uint32_t a = x >> count;
  int64_t b = (int64_t)x >> count;

  return a == (uint32_t)b && (int64_t)a == b;
}
// test_rshift_4
// Both GCC and Clang fail
// (Both missed that type width do not matter here; GCC also missed
// on SAR and SHR not merged.
// Interesting note: GCC can omit the (a == (uint32_t)b) comparison,
// but only when the ((int64_t)a == b) comparison is presented
// first. In this particular example, GCC missed as well.)

// Division and modulo tests
// -------------------------

bool test_div_1(uint8_t divisor) {
  // The signedness of division do not matter when the dividend and
  // divisor are both positive.
  // Here the dividend is a constant, and divisor is promoted from
  // an unsigned type.
  // (The C standard requires 'int' to have at least 16 bits)
  int a = 12345 / divisor;
  int b = 12345U / divisor;
  return a == b;
}
// test_div_1
// GCC fails; Clang succeeds
// (GCC can recognize identical constants and emits one load
// instruction, but emits separate IDIV and DIV (x86) instructions
// not knowing they can be merged.)

bool test_div_2(uint8_t divisor) {
  // Positive constants with different bit width
  int16_t a = (int16_t)(32767 / divisor);
  int32_t b = (int32_t)32767 / divisor;
  long long c = 32767LL / divisor;

  uint16_t d = (uint16_t)(32767 / divisor);
  uint32_t e = (uint32_t)32767U / divisor;
  unsigned long long f = 32767ULL / divisor;

  return a == b && b == c && d == e && e == f \
    && (unsigned long long)c == f;
}
// test_div_2
// GCC fails; Clang succeeds
// (GCC fails in two ways: Not recognizing 32767 and 32767LL can
// merge to one load instruction; and not merging the IDIV and DIV
// instructions when the signedness do not matter. It emits 4
// division instructions in total: IDIV (r/m32), IDIV (r/m64),
// DIV (r/m32) and DIV (r/m64).)

bool test_div_3(uint16_t dividend, uint8_t divisor) {
  int32_t a = (int32_t)dividend / divisor;
  uint32_t b = (uint32_t)dividend / divisor;

  return (uint32_t)a == b;
}
// test_div_3
// GCC fails (IDIV and DIV not merged); Clang succeeds

bool test_div_4a(uint32_t dividend, uint32_t divisor) {
  uint32_t a = dividend / divisor;
  int64_t b = (int64_t)dividend / (int64_t)divisor;
  uint64_t c = (uint64_t)dividend / (uint64_t)divisor;

  return a == (uint32_t)b && (int64_t)a == b && a == (uint32_t)c;
}
bool test_div_4b(uint16_t dividend, uint16_t divisor) {
  uint32_t a = dividend / divisor;
  int64_t b = (int64_t)dividend / (int64_t)divisor;
  uint64_t c = (uint64_t)dividend / (uint64_t)divisor;

  return a == (uint32_t)b && (int64_t)a == b && a == (uint32_t)c;
}
bool test_div_4c(uint32_t dividend, uint16_t divisor) {
  uint32_t a = dividend / divisor;
  int64_t b = (int64_t)dividend / (int64_t)divisor;
  uint64_t c = (uint64_t)dividend / (uint64_t)divisor;

  return a == (uint32_t)b && (int64_t)a == b && a == (uint32_t)c;
}
// test_div_4*
// GCC succeeds in optimizing test_div_4a and test_div_4b, but
// failed in test_div_4c.
// Clang succeeds in optimizing all three.

bool test_mod_1(uint8_t divisor) {
  int a = 12345 % divisor;
  int b = 12345U % divisor;
  return a == b;
}

bool test_mod_2(uint8_t divisor) {
  int16_t a = (int16_t)(32767 % divisor);
  int32_t b = (int32_t)32767 % divisor;
  long long c = 32767LL % divisor;

  uint16_t d = (uint16_t)(32767 % divisor);
  uint32_t e = (uint32_t)32767U % divisor;
  unsigned long long f = 32767ULL % divisor;

  return a == b && b == c && d == e && e == f \
    && (unsigned long long)c == f;
}

bool test_mod_3(uint16_t dividend, uint8_t divisor) {
  int32_t a = (int32_t)dividend % divisor;
  uint32_t b = (uint32_t)dividend % divisor;

  return (uint32_t)a == b;
}

bool test_mod_4a(uint32_t dividend, uint32_t divisor) {
  uint32_t a = dividend % divisor;
  int64_t b = (int64_t)dividend % (int64_t)divisor;
  uint64_t c = (uint64_t)dividend % (uint64_t)divisor;

  return a == (uint32_t)b && (int64_t)a == b && a == (uint32_t)c;
}
bool test_mod_4b(uint16_t dividend, uint16_t divisor) {
  uint32_t a = dividend % divisor;
  int64_t b = (int64_t)dividend % (int64_t)divisor;
  uint64_t c = (uint64_t)dividend % (uint64_t)divisor;

  return a == (uint32_t)b && (int64_t)a == b && a == (uint32_t)c;
}
bool test_mod_4c(uint32_t dividend, uint16_t divisor) {
  uint32_t a = dividend % divisor;
  int64_t b = (int64_t)dividend % (int64_t)divisor;
  uint64_t c = (uint64_t)dividend % (uint64_t)divisor;

  return a == (uint32_t)b && (int64_t)a == b && a == (uint32_t)c;
}

// Multiplication tests
// --------------------

bool test_mul_2(uint16_t multiplier) {
  // "32765 * 65535" cannot overflow a signed 32-bit integer, and
  // both operands are positive, thus the "signedness" of the
  // multiplication should not matter.
  int32_t a = (int32_t)32765 * multiplier;
  long long b = 32765LL * multiplier;
  uint32_t c = (uint32_t)32765U * multiplier;
  unsigned long long d = 32765ULL * multiplier;

  return a == b && c == d && (unsigned long long)b == d;
}
// test_mul_2
// GCC fails; Clang succeeds
// (GCC misses that 32765 and 32765LL can merge to one load
// instruction, but otherwise understands (a == c) and (b == d).)

bool test_mul_4a(uint16_t x, uint8_t y) {
  int32_t a = (int32_t)x * y;
  uint32_t b = (uint32_t)x * y;
  long long c = (long long)x * y;
  unsigned long long d = (unsigned long long)x * y;

  return (uint32_t)a == b && (unsigned long long)c == d && b == d;
}
bool test_mul_4b(uint8_t x, uint8_t y) {
  int32_t a = (int32_t)x * y;
  uint32_t b = (uint32_t)x * y;
  long long c = (long long)x * y;
  unsigned long long d = (unsigned long long)x * y;

  return (uint32_t)a == b && (unsigned long long)c == d && b == d;
}
bool test_mul_4c(uint16_t x, uint16_t y) {
  // (x * y) could overflow a signed 'int32_t' so it is not included
  // in the test.
  uint32_t b = (uint32_t)x * y;
  long long c = (long long)x * y;
  unsigned long long d = (unsigned long long)x * y;

  return (unsigned long long)c == d && b == d;
}
// test_mul_4*
// GCC fails; Clang succeeds
// (GCC misses that ((int32_t)x * y) and ((int64_t)x * y) are
// identical and can merge the IMUL (x86) operations.
// Clang succeeds in optimizing all three.)
```

[Bug middle-end/114406] New: Optimizations with ">>", div, mod and mul where operands are all positive

Reply via email to