https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114406
Bug ID: 114406 Summary: Optimizations with ">>", div, mod and mul where operands are all positive Product: gcc Version: 13.2.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: middle-end Assignee: unassigned at gcc dot gnu.org Reporter: Explorer09 at gmail dot com Target Milestone: --- GCC missed some optimizations on the right shift (>>), division (/) and modulo (%) operations where the signedness of the operations, and the bit width of the operations, do not matter. When the operands can be found at compile time to be all positive, and the results can't overflow, then it won't matter whether a signed (arithmetic) right shift or an unsigned (logical) shift is to be used. The same applies to division, modulo and multiplication. My main complaint was on the right shift, where I discovered that (value = 0xFFFF >> count) and (value = 0xFFFFU >> count) can produce different code size. Technically it shouldn't matter whether I used that unsigned cast ('U' suffix) on the constant literal. Here I mention two cases where the operands can be known to be positive: 1. The operand is a constant, evaluated to be positive. 2. The operand originates from an unsigned type, up-cast to a longer, signed type. (bug 102580 mentioned the third case where the operand can only be nonnegative after a conditional, which is not part of my cases.) Here is the test code where I think can help GCC developers. The expected outcome would be all functions optimized to simply { return true; } statements. The actual results between GCC and Clang are noted in comments. ```c // Tested in Compiler Explorer (godbolt.org) // GCC 13.2 x86-64 with '-Os' option // Clang 18.1.0 x86-64 with '-Os' option #include <stdbool.h> #include <stdint.h> // Right shift tests // ----------------- bool test_rshift_1(unsigned char count) { // Positive constants in 'int' and 'unsigned int' types int a = 12345 >> count; int b = 12345U >> count; int32_t c = 0x7FFFFFFF >> count; uint32_t d = 0x7FFFFFFFU >> count; return a == b && (int32_t)c == d; } // test_rshift_1 // GCC fails; Clang succeeds // (GCC can recognize 0x7FFFFFFF and 0x7FFFFFFFU are identical and // emits one load instruction, but emits separate SAR and SHR (x86) // instructions not knowing they make the same results in this case.) bool test_rshift_2a(unsigned char count) { // Positive constants with different bit width long long a = 0x7FFFFFFF >> count; long long b = 0x7FFFFFFFLL >> count; unsigned long long c = 0x7FFFFFFFU >> count; unsigned long long d = 0x7FFFFFFFULL >> count; return a == b && c == d && (unsigned long long)b == d; } // test_rshift_2a // Both GCC and Clang fail // (Both do not recognize 0x7FFFFFFF and 0x7FFFFFFFLL are identical // and that the load instructions can be merged.) bool test_rshift_2b(unsigned char count) { // By the C standard, 0x80000000 should be interpreted as unsigned // but ultimately the width and signedness do not matter long long a = 0x80000000LL >> count; long long b = 0x80000000U >> count; long long c = 0x80000000 >> count; return a == b && b == c; } // test_rshift_2b // Both GCC and Clang fail (details same as above) bool test_rshift_3(uint16_t x, unsigned char count) { // 'x' is known to be positive despite the signed type up-cast int32_t a = (int32_t)x >> count; uint32_t b = (uint32_t)x >> count; return (uint32_t)a == b; } // test_rshift_3 // GCC fails (SAR and SHR not merged); Clang succeeds bool test_rshift_4(uint32_t x, unsigned char count) { // 'x' is known to be positive despite the signed type up-cast uint32_t a = x >> count; int64_t b = (int64_t)x >> count; return a == (uint32_t)b && (int64_t)a == b; } // test_rshift_4 // Both GCC and Clang fail // (Both missed that type width do not matter here; GCC also missed // on SAR and SHR not merged. // Interesting note: GCC can omit the (a == (uint32_t)b) comparison, // but only when the ((int64_t)a == b) comparison is presented // first. In this particular example, GCC missed as well.) // Division and modulo tests // ------------------------- bool test_div_1(uint8_t divisor) { // The signedness of division do not matter when the dividend and // divisor are both positive. // Here the dividend is a constant, and divisor is promoted from // an unsigned type. // (The C standard requires 'int' to have at least 16 bits) int a = 12345 / divisor; int b = 12345U / divisor; return a == b; } // test_div_1 // GCC fails; Clang succeeds // (GCC can recognize identical constants and emits one load // instruction, but emits separate IDIV and DIV (x86) instructions // not knowing they can be merged.) bool test_div_2(uint8_t divisor) { // Positive constants with different bit width int16_t a = (int16_t)(32767 / divisor); int32_t b = (int32_t)32767 / divisor; long long c = 32767LL / divisor; uint16_t d = (uint16_t)(32767 / divisor); uint32_t e = (uint32_t)32767U / divisor; unsigned long long f = 32767ULL / divisor; return a == b && b == c && d == e && e == f \ && (unsigned long long)c == f; } // test_div_2 // GCC fails; Clang succeeds // (GCC fails in two ways: Not recognizing 32767 and 32767LL can // merge to one load instruction; and not merging the IDIV and DIV // instructions when the signedness do not matter. It emits 4 // division instructions in total: IDIV (r/m32), IDIV (r/m64), // DIV (r/m32) and DIV (r/m64).) bool test_div_3(uint16_t dividend, uint8_t divisor) { int32_t a = (int32_t)dividend / divisor; uint32_t b = (uint32_t)dividend / divisor; return (uint32_t)a == b; } // test_div_3 // GCC fails (IDIV and DIV not merged); Clang succeeds bool test_div_4a(uint32_t dividend, uint32_t divisor) { uint32_t a = dividend / divisor; int64_t b = (int64_t)dividend / (int64_t)divisor; uint64_t c = (uint64_t)dividend / (uint64_t)divisor; return a == (uint32_t)b && (int64_t)a == b && a == (uint32_t)c; } bool test_div_4b(uint16_t dividend, uint16_t divisor) { uint32_t a = dividend / divisor; int64_t b = (int64_t)dividend / (int64_t)divisor; uint64_t c = (uint64_t)dividend / (uint64_t)divisor; return a == (uint32_t)b && (int64_t)a == b && a == (uint32_t)c; } bool test_div_4c(uint32_t dividend, uint16_t divisor) { uint32_t a = dividend / divisor; int64_t b = (int64_t)dividend / (int64_t)divisor; uint64_t c = (uint64_t)dividend / (uint64_t)divisor; return a == (uint32_t)b && (int64_t)a == b && a == (uint32_t)c; } // test_div_4* // GCC succeeds in optimizing test_div_4a and test_div_4b, but // failed in test_div_4c. // Clang succeeds in optimizing all three. bool test_mod_1(uint8_t divisor) { int a = 12345 % divisor; int b = 12345U % divisor; return a == b; } bool test_mod_2(uint8_t divisor) { int16_t a = (int16_t)(32767 % divisor); int32_t b = (int32_t)32767 % divisor; long long c = 32767LL % divisor; uint16_t d = (uint16_t)(32767 % divisor); uint32_t e = (uint32_t)32767U % divisor; unsigned long long f = 32767ULL % divisor; return a == b && b == c && d == e && e == f \ && (unsigned long long)c == f; } bool test_mod_3(uint16_t dividend, uint8_t divisor) { int32_t a = (int32_t)dividend % divisor; uint32_t b = (uint32_t)dividend % divisor; return (uint32_t)a == b; } bool test_mod_4a(uint32_t dividend, uint32_t divisor) { uint32_t a = dividend % divisor; int64_t b = (int64_t)dividend % (int64_t)divisor; uint64_t c = (uint64_t)dividend % (uint64_t)divisor; return a == (uint32_t)b && (int64_t)a == b && a == (uint32_t)c; } bool test_mod_4b(uint16_t dividend, uint16_t divisor) { uint32_t a = dividend % divisor; int64_t b = (int64_t)dividend % (int64_t)divisor; uint64_t c = (uint64_t)dividend % (uint64_t)divisor; return a == (uint32_t)b && (int64_t)a == b && a == (uint32_t)c; } bool test_mod_4c(uint32_t dividend, uint16_t divisor) { uint32_t a = dividend % divisor; int64_t b = (int64_t)dividend % (int64_t)divisor; uint64_t c = (uint64_t)dividend % (uint64_t)divisor; return a == (uint32_t)b && (int64_t)a == b && a == (uint32_t)c; } // Multiplication tests // -------------------- bool test_mul_2(uint16_t multiplier) { // "32765 * 65535" cannot overflow a signed 32-bit integer, and // both operands are positive, thus the "signedness" of the // multiplication should not matter. int32_t a = (int32_t)32765 * multiplier; long long b = 32765LL * multiplier; uint32_t c = (uint32_t)32765U * multiplier; unsigned long long d = 32765ULL * multiplier; return a == b && c == d && (unsigned long long)b == d; } // test_mul_2 // GCC fails; Clang succeeds // (GCC misses that 32765 and 32765LL can merge to one load // instruction, but otherwise understands (a == c) and (b == d).) bool test_mul_4a(uint16_t x, uint8_t y) { int32_t a = (int32_t)x * y; uint32_t b = (uint32_t)x * y; long long c = (long long)x * y; unsigned long long d = (unsigned long long)x * y; return (uint32_t)a == b && (unsigned long long)c == d && b == d; } bool test_mul_4b(uint8_t x, uint8_t y) { int32_t a = (int32_t)x * y; uint32_t b = (uint32_t)x * y; long long c = (long long)x * y; unsigned long long d = (unsigned long long)x * y; return (uint32_t)a == b && (unsigned long long)c == d && b == d; } bool test_mul_4c(uint16_t x, uint16_t y) { // (x * y) could overflow a signed 'int32_t' so it is not included // in the test. uint32_t b = (uint32_t)x * y; long long c = (long long)x * y; unsigned long long d = (unsigned long long)x * y; return (unsigned long long)c == d && b == d; } // test_mul_4* // GCC fails; Clang succeeds // (GCC misses that ((int32_t)x * y) and ((int64_t)x * y) are // identical and can merge the IMUL (x86) operations. // Clang succeeds in optimizing all three.) ```