Without SSSE3/SSE4, GCC falls back to converting the elements one at a time as scalars, which is inefficient.
Signed-off-by: Matthias Kretz <[email protected]>

libstdc++-v3/ChangeLog:

	* include/bits/simd_vec.h (basic_vec::basic_vec): Improve
	code-gen of conversions from 4-Byte 2/3/4 register vec to
	single-register 1/2-Byte integer.
	* include/bits/vec_ops.h (__vec_interleave_lo)
	(__vec_interleave_hi): New.
	(__vec_cast): Improve code-gen of conversions without SSE4
	instructions.
	* testsuite/std/simd/sse2_efficient_cvt_x4.cc: New test.
---
 libstdc++-v3/include/bits/simd_vec.h          |  84 ++++++++++++++
 libstdc++-v3/include/bits/vec_ops.h           |  70 ++++++++++++
 .../std/simd/sse2_efficient_cvt_x4.cc         | 105 ++++++++++++++++++
 3 files changed, 259 insertions(+)
 create mode 100644 libstdc++-v3/testsuite/std/simd/sse2_efficient_cvt_x4.cc

--
──────────────────────────────────────────────────────────────────────────
 Dr. Matthias Kretz                           https://mattkretz.github.io
 GSI Helmholtz Center for Heavy Ion Research               https://gsi.de
 std::simd
──────────────────────────────────────────────────────────────────────────
diff --git a/libstdc++-v3/include/bits/simd_vec.h b/libstdc++-v3/include/bits/simd_vec.h
index ee6d78776bd9..82b7c3bba02d 100644
--- a/libstdc++-v3/include/bits/simd_vec.h
+++ b/libstdc++-v3/include/bits/simd_vec.h
@@ -1143,6 +1143,90 @@ basic_vec(const basic_vec<_Up, _UAbi>& __x) noexcept
: _M_data([&] [[__gnu__::__always_inline__]] {
if constexpr (_S_is_scalar)
return static_cast<value_type>(__x[0]);
+#if _GLIBCXX_X86
+ // Without the pshufb instruction (SSSE3) the following conversions need some help for
+ // good code-gen.
+ else if constexpr (!_Traits._M_have_ssse3()
+ && _UAbi::_S_nreg == 2 && sizeof(__x) == 32
+ && sizeof(value_type) == 2 && sizeof(_M_data) == 16)
+ {
+ if constexpr (is_floating_point_v<_Up>)
+ return basic_vec(rebind_t<int, basic_vec>(__x))._M_data;
+ else
+ {
+ auto __a = reinterpret_cast<_DataType>( // a.b.c.d.
+ __x._M_data0._M_data);
+ auto __b = reinterpret_cast<_DataType>( // e.f.g.h.
+ __vec_zero_pad_to<16>(__x._M_data1._M_concat_data()));
+ auto __c = __vec_interleave_lo(__a, __b); // ae..bf..
+ auto __d = __vec_interleave_hi(__a, __b); // cg..dh..
+ auto __e = __vec_interleave_lo(__c, __d); // aceg....
+ auto __f = __vec_interleave_hi(__c, __d); // bdfh....
+ return __vec_interleave_lo(__e, __f); // abcdefgh
+ }
+ }
+ else if constexpr (!_Traits._M_have_ssse3()
+ && _UAbi::_S_nreg == 2 && (sizeof(__x) == 32)
+ && sizeof(value_type) == 1 && sizeof(_M_data) == 8)
+ {
+ if constexpr (is_floating_point_v<_Up>)
+ return basic_vec(rebind_t<int, basic_vec>(__x))._M_data;
+ else
+ {
+ using _TV = __vec_builtin_type_bytes<__canon_value_type, 16>;
+ _TV __a = reinterpret_cast<_TV>(__x._M_data0._M_data);
+ _TV __b = reinterpret_cast<_TV>(
+ __vec_zero_pad_to<16>(__x._M_data1._M_concat_data()));
+ auto __c = __vec_interleave_lo(__a, __b); // aeim....bfjn....
+ auto __d = __vec_interleave_hi(__a, __b); // cgko....dhlp....
+ auto __e = __vec_interleave_lo(__c, __d); // acegikmo........
+ auto __f = __vec_interleave_hi(__c, __d); // bdfhjlnp........
+ auto __g = __vec_interleave_lo(__e, __f); // abcdefghijklmnop
+ return __vec_split_lo(__g);
+ }
+ }
+ else if constexpr (!_Traits._M_have_ssse3()
+ && _UAbi::_S_nreg <= 4 && (sizeof(__x) == 48 || sizeof(__x) == 64)
+ && sizeof(value_type) == 1 && sizeof(_M_data) == 16)
+ {
+ if constexpr (is_floating_point_v<_Up>)
+ return basic_vec(rebind_t<int, basic_vec>(__x))._M_data;
+ else
+ {
+ // a...b...c...d...
+ // .i...j...k...l..
+ _DataType __a;
+ _DataType __b;
+ // e...f...g...h...
+ // .m...n...o...p..
+ if constexpr (_UAbi::_S_nreg == 3)
+ {
+ __a = reinterpret_cast<_DataType>( // ai..bj..ck..dl..
+ (__x._M_data0._M_data0 & 0xff)._M_data
+ | __vec_zero_pad_to<16>((__x._M_data1 << 8)._M_concat_data()));
+ __b = reinterpret_cast<_DataType>(__x._M_data0._M_data1._M_data);
+ }
+ else if constexpr (_UAbi::_S_nreg == 4)
+ {
+ __a = reinterpret_cast<_DataType>( // ai..bj..ck..dl..
+ (__x._M_data0._M_data0 & 0xff)._M_data
+ | (__x._M_data1._M_data0 << 8)._M_data);
+ __b = reinterpret_cast<_DataType>( // em..fn..go..hp..
+ (__x._M_data0._M_data1 & 0xff)._M_data
+ | __vec_zero_pad_to<16>(
+ (__x._M_data1._M_data1 << 8)._M_concat_data()));
+ }
+ else
+ static_assert(false);
+ auto __c = __vec_interleave_lo(__a, __b); // aeim....bfjn....
+ auto __d = __vec_interleave_hi(__a, __b); // cgko....dhlp....
+ auto __e = __vec_interleave_lo(__c, __d); // acegikmo........
+ auto __f = __vec_interleave_hi(__c, __d); // bdfhjlnp........
+ auto __g = __vec_interleave_lo(__e, __f); // abcdefghijklmnop
+ return __g;
+ }
+ }
+#endif
else if constexpr (_UAbi::_S_nreg >= 2)
// __builtin_convertvector (__vec_cast) is inefficient for over-sized inputs.
// Also e.g. vec<float, 12> -> vec<char, 12> (with SSE2) would otherwise emit 4
diff --git a/libstdc++-v3/include/bits/vec_ops.h b/libstdc++-v3/include/bits/vec_ops.h
index e5bf2f1497cd..8cb77bf10969 100644
--- a/libstdc++-v3/include/bits/vec_ops.h
+++ b/libstdc++-v3/include/bits/vec_ops.h
@@ -327,6 +327,33 @@ __is_const_known_equal_to(const auto& __x, const auto& __expect)
__x86_cvt_f16c(_TV __v);
#endif
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr _TV
+ __vec_interleave_lo(_TV __x, _TV __y)
+ {
+ constexpr int __n = __width_of<_TV>;
+ auto __rotr = [](int __i) consteval {
+ int __b = __i & 1;
+ return (__i >> 1) | (__b * __n);
+ };
+ constexpr auto [...__is] = _IotaArray<__n>;
+ return __builtin_shufflevector(__x, __y, __rotr(__is)...);
+ }
+
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr _TV
+ __vec_interleave_hi(_TV __x, _TV __y)
+ {
+ constexpr int __n = __width_of<_TV>;
+ auto __rotr = [](int __i) consteval {
+ int __b = __i & 1;
+ return (__i >> 1) | (__b * __n);
+ };
+ constexpr auto [...__is] = _IotaArray<__n>;
+ return __builtin_shufflevector(__x, __y, __n / 2 + __rotr(__is)...);
+ }
/** \internal
* Simple wrapper around __builtin_convertvector to provide static_cast-like syntax.
@@ -358,6 +385,49 @@ __vec_cast(_TV __v)
using _IV = __vec_builtin_type<_Ip, __width_of<_TV>>;
return __vec_cast<_UV>(__vec_cast<_IV>(__v));
}
+ if constexpr (!_Traits._M_have_sse4_1() && is_integral_v<_Tp>
+ && sizeof(_Up) == sizeof(_Tp) * 4)
+ { // GCC uses scalar conversions unless it can use SSE4.1 instructions
+ if constexpr (!is_integral_v<_Up>)
+ {
+ using _Ip = __integer_from<std::min(sizeof(int), sizeof(_Up))>;
+ using _IV = __vec_builtin_type<_Ip, __width_of<_TV>>;
+ return __vec_cast<_UV>(__vec_cast<_IV>(__v));
+ }
+ else if constexpr (sizeof(_TV) == 2)
+ return __vec_split_lo(__vec_cast<__vec_builtin_type<_Up, __width_of<_UV> * 2>>(
+ __vec_concat(__v, _TV())));
+ else if constexpr (sizeof(_TV) == 4)
+ {
+ auto __v1 = __vec_zero_pad_to_16(__v);
+ using _V1 = decltype(__v1);
+ if constexpr (is_signed_v<_Tp>)
+ {
+ if constexpr (sizeof(_Tp) == 1)
+ {
+ auto __v2 = __vec_interleave_lo(__v1, __v1);
+ auto __v4 = __vec_interleave_lo(__v2, __v2);
+ return reinterpret_cast<_UV>(__vec_bit_cast<__integer_from<sizeof(_Up)>>(__v4)
+ >> __CHAR_BIT__ * (sizeof(_Up) - 1));
+ }
+ else
+ {
+ _V1 __sign = __v1 < 0;
+ using _V2 = __vec_builtin_type_bytes<__integer_from<sizeof(_Tp) * 2>,
+ sizeof(__v1)>;
+ _V2 __v2 = reinterpret_cast<_V2>(__vec_interleave_lo(__v1, __sign));
+ _V2 __s2 = reinterpret_cast<_V2>(__vec_interleave_lo(__sign, __sign));
+ return reinterpret_cast<_UV>(__vec_interleave_lo(__v2, __s2));
+ }
+ }
+ else
+ {
+ auto __v2 = __vec_interleave_lo(__v1, _V1());
+ auto __v4 = __vec_interleave_lo(__v2, _V1());
+ return reinterpret_cast<_UV>(__v4);
+ }
+ }
+ }
#endif
return __builtin_convertvector(__v, _UV);
}
diff --git a/libstdc++-v3/testsuite/std/simd/sse2_efficient_cvt_x4.cc b/libstdc++-v3/testsuite/std/simd/sse2_efficient_cvt_x4.cc
new file mode 100644
index 000000000000..aa2d02211aa4
--- /dev/null
+++ b/libstdc++-v3/testsuite/std/simd/sse2_efficient_cvt_x4.cc
@@ -0,0 +1,105 @@
+// { dg-do compile { target { x86_64-*-* && c++26 } } }
+// { dg-require-effective-target c++26 }
+// { dg-options "-march=x86-64 -O2" }
+// { dg-final { check-function-bodies "**" "" } }
+
+#include <simd>
+
+namespace simd = std::simd;
+
+extern "C"
+{
+/*
+** cvt0:
+** .*
+** punpcklbw.*
+** punpcklbw.*
+** psrad.*
+** cvtdq2ps.*
+*/
+simd::vec<float, 4>
+cvt0(simd::vec<signed char, 4> x)
+{ return x; }
+
+/*
+** cvt1:
+** .*
+** punpcklbw.*
+** punpcklbw.*
+** cvtdq2ps.*
+*/
+simd::vec<float, 4>
+cvt1(simd::vec<unsigned char, 4> x)
+{ return x; }
+
+/*
+** cvt2:
+** .*
+** psraw.*
+** punpcklwd.*
+** punpcklwd.*
+** punpckldq.*
+*/
+simd::vec<long long, 2>
+cvt2(simd::vec<short, 2> x)
+{ return x; }
+
+/*
+** cvt3:
+** .*
+** cvttps2dq.*
+** cvttps2dq.*
+** cvttps2dq.*
+** cvttps2dq.*
+** punpck[lh]bw.*
+** punpck[lh]bw.*
+** punpck[lh]bw.*
+** punpck[lh]bw.*
+** punpcklbw.*
+*/
+auto
+cvt3(simd::vec<float, 16> x)
+{ return static_cast<simd::vec<signed char, 16>>(x); }
+
+/*
+** cvt4:
+** .*
+** cvttps2dq.*
+** cvttps2dq.*
+** cvttps2dq.*
+** punpck[lh]bw.*
+** punpck[lh]bw.*
+** punpck[lh]bw.*
+** punpck[lh]bw.*
+** punpcklbw.*
+*/
+auto
+cvt4(simd::vec<float, 12> x)
+{ return static_cast<simd::vec<signed char, 12>>(x); }
+
+/*
+** cvt5:
+** .*
+** punpck[lh]bw.*
+** punpck[lh]bw.*
+** punpck[lh]bw.*
+** punpck[lh]bw.*
+** punpcklbw.*
+*/
+auto
+cvt5(simd::vec<int, 8> x)
+{ return static_cast<simd::vec<signed char, 8>>(x); }
+
+/*
+** cvt6:
+** .*
+** punpck[lh]wd.*
+** punpck[lh]wd.*
+** punpck[lh]wd.*
+** punpck[lh]wd.*
+** punpcklwd.*
+*/
+auto
+cvt6(simd::vec<int, 8> x)
+{ return static_cast<simd::vec<signed short, 8>>(x); }
+}
signature.asc
Description: This is a digitally signed message part.
