https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82793

--- Comment #1 from Vorfeed Canal <vorfeed.canal at gmail dot com> ---
Example closer to real code:

#include "xmmintrin.h"

// Version of the multiversioned foo() compiled for target("avx").
// Returns _mm_add_ps(a, b); declared inline with hidden visibility.
// The body is the same in every version of foo() in this example; only the
// target attribute differs.
inline __attribute__((target("avx"),visibility("hidden"))) __m128 foo(__m128 a,
__m128 b) {
  return _mm_add_ps(a, b);
}

// Version of the multiversioned foo() compiled for target("sse4.2").
// Returns _mm_add_ps(a, b); declared inline with hidden visibility.
inline __attribute__((target("sse4.2"),visibility("hidden"))) __m128 foo(__m128
a, __m128 b) {
  return _mm_add_ps(a, b);
}

// Version of the multiversioned foo() compiled for target("default").
// Returns _mm_add_ps(a, b); declared inline with hidden visibility.
inline __attribute__((target("default"),visibility("hidden"))) __m128
foo(__m128 a, __m128 b) {
  return _mm_add_ps(a, b);
}

// Version of bar() compiled for target("sse4.2"); simply forwards to foo(a, b).
// In the compilation result quoted in this report, this version is emitted as
// a jmp to foo's .ifunc symbol rather than as an inlined add.
__attribute__((target("sse4.2"))) __m128 bar(__m128 a, __m128 b) {
  return foo(a, b);
}

// Version of bar() compiled for target("avx"); simply forwards to foo(a, b).
// In the compilation result quoted in this report, this version is emitted as
// a single vaddps followed by ret, with no call or jump to foo.
__attribute__((target("avx"))) __m128 bar(__m128 a, __m128 b) {
  return foo(a, b);
}

// Version of bar() compiled for target("default"); simply forwards to
// foo(a, b). The generated code for this version is not shown in the report.
__attribute__((target("default"))) __m128 bar(__m128 a, __m128 b) {
  return foo(a, b);
}

Result of compilation:

        # Generated code for the ".avx" version of bar: the addition is emitted
        # directly as vaddps and the function returns; there is no call or jump
        # to any foo symbol.
        .type   _Z3barDv4_fS_.avx, @function
_Z3barDv4_fS_.avx:
.LFB529:
        .cfi_startproc
        vaddps  %xmm1, %xmm0, %xmm0
        ret
        .cfi_endproc

        # Generated code for the ".sse4.2" version of bar: the whole body is a
        # jmp to the foo ".ifunc" symbol; no add instruction is emitted here.
        .type   _Z3barDv4_fS_.sse4.2, @function
_Z3barDv4_fS_.sse4.2:
.LFB542:
        .cfi_startproc
        jmp     _Z19_Z3fooDv4_fS_.ifuncDv4_fS_
        .cfi_endproc


Needless to say: when a simple, short, INLINE function is called via the
resolver, performance takes an unacceptable hit.

Reply via email to