https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97282
Bug ID: 97282
Summary: division done twice for modulo and division for 128-bit
integers
Product: gcc
Version: 11.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: rtl-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: tkoenig at gcc dot gnu.org
Target Milestone: ---
Currently, gcc calls the (long and slow) 128-bit division routine twice
when both the quotient and the remainder of the same division are needed.
For the other integer types, this is optimized into a single division.
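To make the pattern explicit (divmod10 below is just an illustration, not
part of the test case): whenever the quotient and the remainder of the same
128-bit division are both used, one would expect a single call into a
combined divide-and-remainder helper (e.g. something like libgcc's
__udivmodti4), but at present each operation ends up as its own division
call.

typedef __uint128_t myint;

/* Illustration only: both results come from the same division, so a
   single combined library call should suffice, yet gcc currently
   emits two separate 128-bit division calls for this function.  */
void
divmod10 (myint x, myint *q, myint *r)
{
  *q = x / 10;
  *r = x % 10;
}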
Take this test case:
$ cat digsum.c
#include <stdio.h>
#include <x86intrin.h>

typedef __uint128_t myint;

/* Uses / and % directly; each currently expands to its own 128-bit
   division call.  */
unsigned long digsum1 (myint x)
{
  unsigned long ret;

  if (x == 0)
    return 0;

  ret = 0;
  while (x > 0)
    {
      ret = ret + x % 10;
      x = x / 10;
    }
  return ret;
}
/* Computes the quotient once and reconstructs the remainder by
   multiply-and-subtract, so only one division is done per iteration.  */
unsigned long digsum2 (myint x)
{
  unsigned long ret;
  myint tmp;

  if (x == 0)
    return 0;

  ret = 0;
  while (x > 0)
    {
      tmp = x / 10;
      ret = ret + (x - tmp * 10);
      x = tmp;
    }
  return ret;
}
#define NUM 1000000

int main()
{
  myint x;
  unsigned long sum;
  long int t1, t2;
  __uint128_t from, to;

  from = 1;
  from = (from << 93) - NUM/2;
  to = from + NUM;

  sum = 0;
  t1 = __rdtsc();
  for (x=from; x<to; x++)
    sum = sum + digsum1(x);
  t2 = __rdtsc();
  printf ("digsum1:\nsum = %lu\n", sum);
  printf ("cycles per sum = %.2f\n\n", (double) (t2-t1)/NUM);

  sum = 0;
  t1 = __rdtsc();
  for (x=from; x<to; x++)
    sum = sum + digsum2(x);
  t2 = __rdtsc();
  printf ("digsum2:\nsum = %lu\n", sum);
  printf ("cycles per sum = %.2f\n", (double) (t2-t1)/NUM);
  return 0;
}
"As is", this gives on my machine
$ gcc -O3 digsum.c
$ ./a.out
digsum1:
sum = 113493792
cycles per sum = 2021.68
digsum2:
sum = 113493792
cycles per sum = 1025.47
(similar timings if a signed type is used).
This also affects Fortran I/O.
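For what it's worth, the duplicated call should also be visible directly in
the generated assembly; on x86-64 the 128-bit divisions go through libgcc
helpers (__udivti3 / __umodti3, assuming those libcall names for this
target), so something like

$ gcc -O3 -S digsum.c -o - | grep -E 'call.*(udiv|umod)ti3'

should list two calls coming from digsum1's loop but only one division call
from digsum2's.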