https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87608
Bug ID: 87608 Summary: Very slow swap operations Product: gcc Version: 9.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: rtl-optimization Assignee: unassigned at gcc dot gnu.org Reporter: tkoenig at gcc dot gnu.org Target Milestone: --- The following test program I received from somebody else (reproduced with permission) takes about three times as many cycles using gcc as it does with clang - 1428 cycles vs. 544 cycles including measurement overhead. #include <stdio.h> extern "C" long rdtsc(void); #define cond_swap5(a,b);\ t = *(a);\ *(a) = (t<*(b))?t:*(b);\ *(b) = (t<*(b))?*(b):t; template<int n> void static_sort1(int *a){ return; } template<> void static_sort1<32>(int* first){ int t; static_sort1<16>(first); static_sort1<16>(first+16); cond_swap5(first + 0u, first + 16u); cond_swap5(first + 8u, first + 24u); cond_swap5(first + 8u, first + 16u); cond_swap5(first + 4u, first + 20u); cond_swap5(first + 12u, first + 28u); cond_swap5(first + 12u, first + 20u); cond_swap5(first + 4u, first + 8u); cond_swap5(first + 12u, first + 16u); cond_swap5(first + 20u, first + 24u); cond_swap5(first + 2u, first + 18u); cond_swap5(first + 10u, first + 26u); cond_swap5(first + 10u, first + 18u); cond_swap5(first + 6u, first + 22u); cond_swap5(first + 14u, first + 30u); cond_swap5(first + 14u, first + 22u); cond_swap5(first + 6u, first + 10u); cond_swap5(first + 14u, first + 18u); cond_swap5(first + 22u, first + 26u); cond_swap5(first + 2u, first + 4u); cond_swap5(first + 6u, first + 8u); cond_swap5(first + 10u, first + 12u); cond_swap5(first + 14u, first + 16u); cond_swap5(first + 18u, first + 20u); cond_swap5(first + 22u, first + 24u); cond_swap5(first + 26u, first + 28u); cond_swap5(first + 1u, first + 17u); cond_swap5(first + 9u, first + 25u); cond_swap5(first + 9u, first + 17u); cond_swap5(first + 5u, first + 21u); cond_swap5(first + 13u, first + 29u); cond_swap5(first + 13u, first + 21u); cond_swap5(first + 5u, first + 9u); cond_swap5(first + 13u, first + 17u); 
cond_swap5(first + 21u, first + 25u); cond_swap5(first + 3u, first + 19u); cond_swap5(first + 11u, first + 27u); cond_swap5(first + 11u, first + 19u); cond_swap5(first + 7u, first + 23u); cond_swap5(first + 15u, first + 31u); cond_swap5(first + 15u, first + 23u); cond_swap5(first + 7u, first + 11u); cond_swap5(first + 15u, first + 19u); cond_swap5(first + 23u, first + 27u); cond_swap5(first + 3u, first + 5u); cond_swap5(first + 7u, first + 9u); cond_swap5(first + 11u, first + 13u); cond_swap5(first + 15u, first + 17u); cond_swap5(first + 19u, first + 21u); cond_swap5(first + 23u, first + 25u); cond_swap5(first + 27u, first + 29u); cond_swap5(first + 1u, first + 2u); cond_swap5(first + 3u, first + 4u); cond_swap5(first + 5u, first + 6u); cond_swap5(first + 7u, first + 8u); cond_swap5(first + 9u, first + 10u); cond_swap5(first + 11u, first + 12u); cond_swap5(first + 13u, first + 14u); cond_swap5(first + 15u, first + 16u); cond_swap5(first + 17u, first + 18u); cond_swap5(first + 19u, first + 20u); cond_swap5(first + 21u, first + 22u); cond_swap5(first + 23u, first + 24u); cond_swap5(first + 25u, first + 26u); cond_swap5(first + 27u, first + 28u); cond_swap5(first + 29u, first + 30u); }; int main(){ int a[32]; long t1, t2; for (int i=0; i<32; i++) a[i] = 20*i - 32*i*i; t1 = rdtsc(); static_sort1<32>(a); t2 = rdtsc(); for (int i=0; i<32; i++) printf("%d ",a[i]); printf("\n %ld\n", t2-t1); return 0; } $ cat rdtsc.s .file "rdtsc.s" .text .globl rdtsc .type rdtsc, @function rdtsc: .LFB0: rdtsc shl $32, %rdx or %rdx, %rax ret .LFE0: .size rdtsc, .-rdtsc .section .note.GNU-stack,"",@progbits $ g++ -march=native -mtune=native -O3 j2.c rdtsc.s $ ./a.out -7872 -17952 -8908 -19500 -10008 -21112 -11172 -22788 -12400 -24528 -13692 -26332 -15048 -28200 -16468 -30132 0 -1888 -12 -2412 -88 -3000 -228 -3652 -432 -4368 -700 -5148 -1032 -5992 -1428 -6900 1428 $ clang++ -O3 -stdlib=libc++ j2.c rdtsc.s clang-3.8: warning: treating 'c' input as 'c++' when in C++ mode, this behavior is 
deprecated $ ./a.out -7872 -17952 -8908 -19500 -10008 -21112 -11172 -22788 -12400 -24528 -13692 -26332 -15048 -28200 -16468 -30132 0 -1888 -12 -2412 -88 -3000 -228 -3652 -432 -4368 -700 -5148 -1032 -5992 -1428 -6900 544 All of the above was run on x86_64-pc-linux-gnu with an AMD Ryzen 7 CPU.