https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107563
--- Comment #7 from cqwrteur <unlvsur at live dot com> ---
(In reply to Hongtao.liu from comment #6)
> Shufd only handles
>
> void foo1(temp_vec_type& v) noexcept
> {
>     v=__builtin_shufflevector(v,v,12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3);
> }
>
> Not the case in #c0.

I am actually using it for byte swap; clang has a solution:

using x86_64_v4si [[__gnu__::__vector_size__ (16)]] = int;
using x86_64_v16qi [[__gnu__::__vector_size__ (16)]] = char;
using x86_64_v8hi [[__gnu__::__vector_size__ (16)]] = short;
constexpr x86_64_v16qi zero{};
if constexpr(sizeof(T)==8)
{
    auto res0{__builtin_ia32_punpcklbw128(temp_vec,zero)};
    auto res1{__builtin_ia32_pshufd((x86_64_v4si)res0,78)};
    auto res2{__builtin_ia32_pshuflw((x86_64_v8hi)res1,27)};
    auto res3{__builtin_ia32_pshufhw(res2,27)};
    auto res4{__builtin_ia32_punpckhbw128(temp_vec,zero)};
    auto res5{__builtin_ia32_pshufd((x86_64_v4si)res4,78)};
    auto res6{__builtin_ia32_pshuflw((x86_64_v8hi)res5,27)};
    auto res7{__builtin_ia32_pshufhw(res6,27)};
    temp_vec=__builtin_ia32_packuswb128(res3,res7);
}
else if constexpr(sizeof(T)==4)
{
    auto res0{__builtin_ia32_punpcklbw128(temp_vec,zero)};
    auto res2{__builtin_ia32_pshuflw((x86_64_v8hi)res0,27)};
    auto res3{__builtin_ia32_pshufhw(res2,27)};
    auto res4{__builtin_ia32_punpckhbw128(temp_vec,zero)};
    auto res6{__builtin_ia32_pshuflw((x86_64_v8hi)res4,27)};
    auto res7{__builtin_ia32_pshufhw(res6,27)};
    temp_vec=__builtin_ia32_packuswb128(res3,res7);
}
else if constexpr(sizeof(T)==2)
{
    using x86_64_v8hu [[__gnu__::__vector_size__ (16)]] = unsigned short;
    auto res0{(x86_64_v8hu)temp_vec};
    temp_vec=(x86_64_v16qi)((res0>>8)|(res0<<8));
}