https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87608
Bug ID: 87608
Summary: Very slow swap operations
Product: gcc
Version: 9.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: rtl-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: tkoenig at gcc dot gnu.org
Target Milestone: ---
The following test program, which I received from somebody else (reproduced
with permission), takes roughly 2.6 times as many cycles when compiled with
gcc as it does when compiled with clang: 1428 cycles vs. 544 cycles, both
including measurement overhead.
#include <stdio.h>
extern "C" long rdtsc(void);
// cond_swap5(a, b): compare-exchange of the two values that the pointers a
// and b refer to. After the expansion has run, *(a) holds the smaller and
// *(b) the larger of the two original values (compared with operator<).
//
// Caveats for anyone reusing this macro:
//  - It assigns to a variable named `t`, which must already be declared in
//    the enclosing scope with a type that can hold *(a).
//  - The ';' directly after the parameter list is part of the replacement
//    text, so every expansion begins with an empty statement.
//  - The expansion is several statements that are not wrapped in a block, so
//    it must not be used as the body of an unbraced if/else/loop.
//  - Both arguments are evaluated more than once; do not pass expressions
//    with side effects.
#define cond_swap5(a,b);\
t = *(a);\
*(a) = (t<*(b))?t:*(b);\
*(b) = (t<*(b))?*(b):t;
// Primary template of static_sort1: used for every size n that has no
// explicit specialization. It performs no work and leaves the array that `a`
// points to untouched.
template<int n>
void static_sort1(int *a){
    (void)a;  // nothing to do for unspecialized sizes
}
// Explicit specialization of static_sort1 for 32 ints starting at `first`.
//
// It first invokes static_sort1<16> on each 16-element half and then applies
// a fixed, straight-line sequence of 65 cond_swap5 compare-exchange steps to
// fixed index pairs. The order of the steps matters: later steps read values
// written by earlier ones.
//
// NOTE(review): the comparator pattern appears to be the merge stage of
// Batcher's odd-even merge sort for two 16-element halves - not verified
// beyond the comparator count and index pattern.
template<>
void static_sort1<32>(int* first){
// Scratch variable that every cond_swap5 expansion assigns to.
int t;
// Handle the two halves. NOTE(review): no <16> specialization is visible in
// this test case, so these calls resolve to the primary template, which
// returns without touching the data; the halves are therefore not sorted
// before the steps below run.
static_sort1<16>(first);
static_sort1<16>(first+16);
// Indices that are multiples of 4 (0, 4, ..., 28).
cond_swap5(first + 0u, first + 16u);
cond_swap5(first + 8u, first + 24u);
cond_swap5(first + 8u, first + 16u);
cond_swap5(first + 4u, first + 20u);
cond_swap5(first + 12u, first + 28u);
cond_swap5(first + 12u, first + 20u);
cond_swap5(first + 4u, first + 8u);
cond_swap5(first + 12u, first + 16u);
cond_swap5(first + 20u, first + 24u);
// Indices congruent to 2 modulo 4 (2, 6, ..., 30).
cond_swap5(first + 2u, first + 18u);
cond_swap5(first + 10u, first + 26u);
cond_swap5(first + 10u, first + 18u);
cond_swap5(first + 6u, first + 22u);
cond_swap5(first + 14u, first + 30u);
cond_swap5(first + 14u, first + 22u);
cond_swap5(first + 6u, first + 10u);
cond_swap5(first + 14u, first + 18u);
cond_swap5(first + 22u, first + 26u);
// Neighbouring even indices: pairs (4k+2, 4k+4) for k = 0..6.
cond_swap5(first + 2u, first + 4u);
cond_swap5(first + 6u, first + 8u);
cond_swap5(first + 10u, first + 12u);
cond_swap5(first + 14u, first + 16u);
cond_swap5(first + 18u, first + 20u);
cond_swap5(first + 22u, first + 24u);
cond_swap5(first + 26u, first + 28u);
// Indices congruent to 1 modulo 4 (1, 5, ..., 29).
cond_swap5(first + 1u, first + 17u);
cond_swap5(first + 9u, first + 25u);
cond_swap5(first + 9u, first + 17u);
cond_swap5(first + 5u, first + 21u);
cond_swap5(first + 13u, first + 29u);
cond_swap5(first + 13u, first + 21u);
cond_swap5(first + 5u, first + 9u);
cond_swap5(first + 13u, first + 17u);
cond_swap5(first + 21u, first + 25u);
// Indices congruent to 3 modulo 4 (3, 7, ..., 31).
cond_swap5(first + 3u, first + 19u);
cond_swap5(first + 11u, first + 27u);
cond_swap5(first + 11u, first + 19u);
cond_swap5(first + 7u, first + 23u);
cond_swap5(first + 15u, first + 31u);
cond_swap5(first + 15u, first + 23u);
cond_swap5(first + 7u, first + 11u);
cond_swap5(first + 15u, first + 19u);
cond_swap5(first + 23u, first + 27u);
// Neighbouring odd indices: pairs (4k+3, 4k+5) for k = 0..6.
cond_swap5(first + 3u, first + 5u);
cond_swap5(first + 7u, first + 9u);
cond_swap5(first + 11u, first + 13u);
cond_swap5(first + 15u, first + 17u);
cond_swap5(first + 19u, first + 21u);
cond_swap5(first + 23u, first + 25u);
cond_swap5(first + 27u, first + 29u);
// Final pass over adjacent elements: pairs (2k+1, 2k+2) for k = 0..14.
// Elements 0 and 31 are not touched by this pass.
cond_swap5(first + 1u, first + 2u);
cond_swap5(first + 3u, first + 4u);
cond_swap5(first + 5u, first + 6u);
cond_swap5(first + 7u, first + 8u);
cond_swap5(first + 9u, first + 10u);
cond_swap5(first + 11u, first + 12u);
cond_swap5(first + 13u, first + 14u);
cond_swap5(first + 15u, first + 16u);
cond_swap5(first + 17u, first + 18u);
cond_swap5(first + 19u, first + 20u);
cond_swap5(first + 21u, first + 22u);
cond_swap5(first + 23u, first + 24u);
cond_swap5(first + 25u, first + 26u);
cond_swap5(first + 27u, first + 28u);
cond_swap5(first + 29u, first + 30u);
// The ';' after the closing brace is an empty declaration and has no effect.
};
// Benchmark driver: fills a 32-element array, times one call to
// static_sort1<32> with rdtsc(), then prints the resulting array followed by
// the difference between the two counter readings.
int main(){
    int values[32];

    // Input data: values[k] = 20*k - 32*k*k for k = 0..31.
    for (int k = 0; k != 32; ++k) {
        values[k] = 20 * k - 32 * k * k;
    }

    // Timed region: only the call under test sits between the two readings.
    const long start = rdtsc();
    static_sort1<32>(values);
    const long stop = rdtsc();

    // Print every element, then the elapsed counter ticks on its own line.
    for (const int *cur = values; cur != values + 32; ++cur) {
        printf("%d ", *cur);
    }
    printf("\n %ld\n", stop - start);

    return 0;
}
$ cat rdtsc.s
# rdtsc.s - x86-64 (AT&T syntax) helper that returns the processor's
# time-stamp counter. The C++ test program declares it as
#     extern "C" long rdtsc(void);
.file "rdtsc.s"
.text
.globl rdtsc
.type rdtsc, @function
rdtsc:
.LFB0:
# Read the time-stamp counter: low 32 bits go to EAX, high 32 bits to EDX.
rdtsc
# Shift the high half into bits 63..32 of RDX ...
shl $32, %rdx
# ... and merge it with the low half, leaving the 64-bit result in RAX.
or %rdx, %rax
ret
.LFE0:
.size rdtsc, .-rdtsc
# Empty .note.GNU-stack section.
.section .note.GNU-stack,"",@progbits
$ g++ -march=native -mtune=native -O3 j2.c rdtsc.s
$ ./a.out
-7872 -17952 -8908 -19500 -10008 -21112 -11172 -22788 -12400 -24528 -13692
-26332 -15048 -28200 -16468 -30132 0 -1888 -12 -2412 -88 -3000 -228 -3652 -432
-4368 -700 -5148 -1032 -5992 -1428 -6900
1428
$ clang++ -O3 -stdlib=libc++ j2.c rdtsc.s
clang-3.8: warning: treating 'c' input as 'c++' when in C++ mode, this behavior
is deprecated
$ ./a.out
-7872 -17952 -8908 -19500 -10008 -21112 -11172 -22788 -12400 -24528 -13692
-26332 -15048 -28200 -16468 -30132 0 -1888 -12 -2412 -88 -3000 -228 -3652 -432
-4368 -700 -5148 -1032 -5992 -1428 -6900
544
This is on x86_64-pc-linux-gnu with an AMD Ryzen 7.