https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97282
            Bug ID: 97282
           Summary: division done twice for modulo and division for
                    128-bit integers
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: tkoenig at gcc dot gnu.org
  Target Milestone: ---

Currently, gcc calls the (long and slow) library division routines for
128-bit integers twice when both the quotient and the remainder are
needed.  For the other integer types, this is optimized.

Take this test case:

$ cat digsum.c
#include <stdio.h>
#include <x86intrin.h>

typedef __uint128_t myint;

unsigned long digsum1 (myint x)
{
  unsigned long ret;

  if (x == 0)
    return 0;

  ret = 0;
  while (x > 0)
    {
      ret = ret + x % 10;
      x = x / 10;
    }
  return ret;
}

unsigned long digsum2 (myint x)
{
  unsigned long ret;
  myint tmp;

  if (x == 0)
    return 0;

  ret = 0;
  while (x > 0)
    {
      tmp = x / 10;
      ret = ret + (x - tmp * 10);
      x = tmp;
    }
  return ret;
}

#define NUM 1000000

int main()
{
  myint x;
  unsigned long sum;
  long int t1, t2;
  __uint128_t from, to;

  from = 1;
  from = (from << 93) - NUM/2;
  to = from + NUM;

  sum = 0;
  t1 = __rdtsc();
  for (x=from; x<to; x++)
    sum = sum + digsum1(x);
  t2 = __rdtsc();
  printf ("digsum1:\nsum = %lu\n", sum);
  printf ("cycles per sum = %.2f\n\n", (double) (t2-t1)/NUM);

  sum = 0;
  t1 = __rdtsc();
  for (x=from; x<to; x++)
    sum = sum + digsum2(x);
  t2 = __rdtsc();
  printf ("digsum2:\nsum = %lu\n", sum);
  printf ("cycles per sum = %.2f\n", (double) (t2-t1)/NUM);

  return 0;
}

"As is", this gives on my machine:

$ gcc -O3 digsum.c
$ ./a.out
digsum1:
sum = 113493792
cycles per sum = 2021.68

digsum2:
sum = 113493792
cycles per sum = 1025.47

(similar timings if a signed type is used).

This also affects Fortran I/O.
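For reference, a minimal sketch (not part of the original report) of the
pattern in question and a possible source-level workaround.  The sketch
assumes libgcc provides a combined quotient/remainder helper named
__udivmodti4 with the prototype shown below; both the name and the
prototype are assumptions about a libgcc-internal routine, not a
documented API.

/* reduced.c - hypothetical reduced example.  Both the quotient and the
   remainder of the same 128-bit division are needed; ideally this should
   cost one library division, not two.  */

unsigned __int128 q, r;

void
div_and_mod (unsigned __int128 x, unsigned __int128 y)
{
  q = x / y;  /* one library call for the 128-bit division */
  r = x % y;  /* a second library call for the remainder */
}

/* Possible workaround until the compiler combines the two operations:
   call the (assumed) libgcc helper directly, which returns the quotient
   and stores the remainder through a pointer.  */

extern unsigned __int128 __udivmodti4 (unsigned __int128 n,
                                       unsigned __int128 d,
                                       unsigned __int128 *rem);

void
div_and_mod_once (unsigned __int128 x, unsigned __int128 y)
{
  unsigned __int128 rem;

  q = __udivmodti4 (x, y, &rem);  /* single division yields both results */
  r = rem;
}

A workaround that does not rely on libgcc internals is the one digsum2
above already uses: compute only the quotient and reconstruct the
remainder with one multiply and subtract.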