Without SSSE3/SSE4, GCC falls back to converting the elements one at a time as scalars, which is inefficient.
Signed-off-by: Matthias Kretz <[email protected]>

libstdc++-v3/ChangeLog:

	* include/bits/simd_vec.h (basic_vec::basic_vec): Improve
	code-gen of conversions from 4-Byte 2/3/4 register vec to
	single-register 1/2-Byte integer.
	* include/bits/vec_ops.h (__vec_interleave_lo)
	(__vec_interleave_hi): New.
	(__vec_cast): Improve code-gen of conversions without SSE4
	instructions.
	* testsuite/std/simd/sse2_efficient_cvt_x4.cc: New test.
---
 libstdc++-v3/include/bits/simd_vec.h          |  84 ++++++++++++++
 libstdc++-v3/include/bits/vec_ops.h           |  70 ++++++++++++
 .../std/simd/sse2_efficient_cvt_x4.cc         | 105 ++++++++++++++++++
 3 files changed, 259 insertions(+)
 create mode 100644 libstdc++-v3/testsuite/std/simd/sse2_efficient_cvt_x4.cc

--
──────────────────────────────────────────────────────────────────────────
 Dr. Matthias Kretz                           https://mattkretz.github.io
 GSI Helmholtz Center for Heavy Ion Research               https://gsi.de
 std::simd
──────────────────────────────────────────────────────────────────────────
diff --git a/libstdc++-v3/include/bits/simd_vec.h b/libstdc++-v3/include/bits/simd_vec.h
index ee6d78776bd9..82b7c3bba02d 100644
--- a/libstdc++-v3/include/bits/simd_vec.h
+++ b/libstdc++-v3/include/bits/simd_vec.h
@@ -1143,6 +1143,90 @@ basic_vec(const basic_vec<_Up, _UAbi>& __x) noexcept
: _M_data([&] [[__gnu__::__always_inline__]] {
if constexpr (_S_is_scalar)
return static_cast<value_type>(__x[0]);
+#if _GLIBCXX_X86
+ // Without the pshufb instruction (SSSE3) the following conversions need some help for
+ // good code-gen.
+ else if constexpr (!_Traits._M_have_ssse3()
+ && _UAbi::_S_nreg == 2 && sizeof(__x) == 32
+ && sizeof(value_type) == 2 && sizeof(_M_data) == 16)
+ {
+ if constexpr (is_floating_point_v<_Up>)
+ return basic_vec(rebind_t<int, basic_vec>(__x))._M_data;
+ else
+ {
+ auto __a = reinterpret_cast<_DataType>( // a.b.c.d.
+ __x._M_data0._M_data);
+ auto __b = reinterpret_cast<_DataType>( // e.f.g.h.
+ __vec_zero_pad_to<16>(__x._M_data1._M_concat_data()));
+ auto __c = __vec_interleave_lo(__a, __b); // ae..bf..
+ auto __d = __vec_interleave_hi(__a, __b); // cg..dh..
+ auto __e = __vec_interleave_lo(__c, __d); // aceg....
+ auto __f = __vec_interleave_hi(__c, __d); // bdfh....
+ return __vec_interleave_lo(__e, __f); // abcdefgh
+ }
+ }
+ else if constexpr (!_Traits._M_have_ssse3()
+ && _UAbi::_S_nreg == 2 && (sizeof(__x) == 32)
+ && sizeof(value_type) == 1 && sizeof(_M_data) == 8)
+ {
+ if constexpr (is_floating_point_v<_Up>)
+ return basic_vec(rebind_t<int, basic_vec>(__x))._M_data;
+ else
+ {
+ using _TV = __vec_builtin_type_bytes<__canon_value_type, 16>;
+ _TV __a = reinterpret_cast<_TV>(__x._M_data0._M_data);
+ _TV __b = reinterpret_cast<_TV>(
+ __vec_zero_pad_to<16>(__x._M_data1._M_concat_data()));
+ auto __c = __vec_interleave_lo(__a, __b); // aeim....bfjn....
+ auto __d = __vec_interleave_hi(__a, __b); // cgko....dhlp....
+ auto __e = __vec_interleave_lo(__c, __d); // acegikmo........
+ auto __f = __vec_interleave_hi(__c, __d); // bdfhjlnp........
+ auto __g = __vec_interleave_lo(__e, __f); // abcdefghijklmnop
+ return __vec_split_lo(__g);
+ }
+ }
+ else if constexpr (!_Traits._M_have_ssse3()
+ && _UAbi::_S_nreg <= 4 && (sizeof(__x) == 48 || sizeof(__x) == 64)
+ && sizeof(value_type) == 1 && sizeof(_M_data) == 16)
+ {
+ if constexpr (is_floating_point_v<_Up>)
+ return basic_vec(rebind_t<int, basic_vec>(__x))._M_data;
+ else
+ {
+ // a...b...c...d...
+ // .i...j...k...l..
+ _DataType __a;
+ _DataType __b;
+ // e...f...g...h...
+ // .m...n...o...p..
+ if constexpr (_UAbi::_S_nreg == 3)
+ {
+ __a = reinterpret_cast<_DataType>( // ai..bj..ck..dl..
+ (__x._M_data0._M_data0 & 0xff)._M_data
+ | __vec_zero_pad_to<16>((__x._M_data1 << 8)._M_concat_data()));
+ __b = reinterpret_cast<_DataType>(__x._M_data0._M_data1._M_data);
+ }
+ else if constexpr (_UAbi::_S_nreg == 4)
+ {
+ __a = reinterpret_cast<_DataType>( // ai..bj..ck..dl..
+ (__x._M_data0._M_data0 & 0xff)._M_data
+ | (__x._M_data1._M_data0 << 8)._M_data);
+ __b = reinterpret_cast<_DataType>( // em..fn..go..hp..
+ (__x._M_data0._M_data1 & 0xff)._M_data
+ | __vec_zero_pad_to<16>(
+ (__x._M_data1._M_data1 << 8)._M_concat_data()));
+ }
+ else
+ static_assert(false);
+ auto __c = __vec_interleave_lo(__a, __b); // aeim....bfjn....
+ auto __d = __vec_interleave_hi(__a, __b); // cgko....dhlp....
+ auto __e = __vec_interleave_lo(__c, __d); // acegikmo........
+ auto __f = __vec_interleave_hi(__c, __d); // bdfhjlnp........
+ auto __g = __vec_interleave_lo(__e, __f); // abcdefghijklmnop
+ return __g;
+ }
+ }
+#endif
else if constexpr (_UAbi::_S_nreg >= 2)
// __builtin_convertvector (__vec_cast) is inefficient for over-sized inputs.
// Also e.g. vec<float, 12> -> vec<char, 12> (with SSE2) would otherwise emit 4
diff --git a/libstdc++-v3/include/bits/vec_ops.h b/libstdc++-v3/include/bits/vec_ops.h
index e5bf2f1497cd..8cb77bf10969 100644
--- a/libstdc++-v3/include/bits/vec_ops.h
+++ b/libstdc++-v3/include/bits/vec_ops.h
@@ -327,6 +327,33 @@ __is_const_known_equal_to(const auto& __x, const auto& __expect)
__x86_cvt_f16c(_TV __v);
#endif
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr _TV
+ __vec_interleave_lo(_TV __x, _TV __y)
+ {
+ constexpr int __n = __width_of<_TV>;
+ auto __rotr = [](int __i) consteval {
+ int __b = __i & 1;
+ return (__i >> 1) | (__b * __n);
+ };
+ constexpr auto [...__is] = _IotaArray<__n>;
+ return __builtin_shufflevector(__x, __y, __rotr(__is)...);
+ }
+
+ template <__vec_builtin _TV>
+ [[__gnu__::__always_inline__]]
+ constexpr _TV
+ __vec_interleave_hi(_TV __x, _TV __y)
+ {
+ constexpr int __n = __width_of<_TV>;
+ auto __rotr = [](int __i) consteval {
+ int __b = __i & 1;
+ return (__i >> 1) | (__b * __n);
+ };
+ constexpr auto [...__is] = _IotaArray<__n>;
+ return __builtin_shufflevector(__x, __y, __n / 2 + __rotr(__is)...);
+ }
/** \internal
* Simple wrapper around __builtin_convertvector to provide static_cast-like syntax.
@@ -358,6 +385,49 @@ __vec_cast(_TV __v)
using _IV = __vec_builtin_type<_Ip, __width_of<_TV>>;
return __vec_cast<_UV>(__vec_cast<_IV>(__v));
}
+ if constexpr (!_Traits._M_have_sse4_1() && is_integral_v<_Tp>
+ && sizeof(_Up) == sizeof(_Tp) * 4)
+ { // GCC uses scalar conversions unless it can use SSE4.1 instructions
+ if constexpr (!is_integral_v<_Up>)
+ {
+ using _Ip = __integer_from<std::min(sizeof(int), sizeof(_Up))>;
+ using _IV = __vec_builtin_type<_Ip, __width_of<_TV>>;
+ return __vec_cast<_UV>(__vec_cast<_IV>(__v));
+ }
+ else if constexpr (sizeof(_TV) == 2)
+ return __vec_split_lo(__vec_cast<__vec_builtin_type<_Up, __width_of<_UV> * 2>>(
+ __vec_concat(__v, _TV())));
+ else if constexpr (sizeof(_TV) == 4)
+ {
+ auto __v1 = __vec_zero_pad_to_16(__v);
+ using _V1 = decltype(__v1);
+ if constexpr (is_signed_v<_Tp>)
+ {
+ if constexpr (sizeof(_Tp) == 1)
+ {
+ auto __v2 = __vec_interleave_lo(__v1, __v1);
+ auto __v4 = __vec_interleave_lo(__v2, __v2);
+ return reinterpret_cast<_UV>(__vec_bit_cast<__integer_from<sizeof(_Up)>>(__v4)
+ >> __CHAR_BIT__ * (sizeof(_Up) - 1));
+ }
+ else
+ {
+ _V1 __sign = __v1 < 0;
+ using _V2 = __vec_builtin_type_bytes<__integer_from<sizeof(_Tp) * 2>,
+ sizeof(__v1)>;
+ _V2 __v2 = reinterpret_cast<_V2>(__vec_interleave_lo(__v1, __sign));
+ _V2 __s2 = reinterpret_cast<_V2>(__vec_interleave_lo(__sign, __sign));
+ return reinterpret_cast<_UV>(__vec_interleave_lo(__v2, __s2));
+ }
+ }
+ else
+ {
+ auto __v2 = __vec_interleave_lo(__v1, _V1());
+ auto __v4 = __vec_interleave_lo(__v2, _V1());
+ return reinterpret_cast<_UV>(__v4);
+ }
+ }
+ }
#endif
return __builtin_convertvector(__v, _UV);
}
diff --git a/libstdc++-v3/testsuite/std/simd/sse2_efficient_cvt_x4.cc b/libstdc++-v3/testsuite/std/simd/sse2_efficient_cvt_x4.cc
new file mode 100644
index 000000000000..aa2d02211aa4
--- /dev/null
+++ b/libstdc++-v3/testsuite/std/simd/sse2_efficient_cvt_x4.cc
@@ -0,0 +1,105 @@
+// { dg-do compile { target { x86_64-*-* && c++26 } } }
+// { dg-require-effective-target c++26 }
+// { dg-options "-march=x86-64 -O2" }
+// { dg-final { check-function-bodies "**" "" } }
+
+#include <simd>
+
+namespace simd = std::simd;
+
+extern "C"
+{
+/*
+** cvt0:
+** .*
+** punpcklbw.*
+** punpcklbw.*
+** psrad.*
+** cvtdq2ps.*
+*/
+simd::vec<float, 4>
+cvt0(simd::vec<signed char, 4> x)
+{ return x; }
+
+/*
+** cvt1:
+** .*
+** punpcklbw.*
+** punpcklbw.*
+** cvtdq2ps.*
+*/
+simd::vec<float, 4>
+cvt1(simd::vec<unsigned char, 4> x)
+{ return x; }
+
+/*
+** cvt2:
+** .*
+** psraw.*
+** punpcklwd.*
+** punpcklwd.*
+** punpckldq.*
+*/
+simd::vec<long long, 2>
+cvt2(simd::vec<short, 2> x)
+{ return x; }
+
+/*
+** cvt3:
+** .*
+** cvttps2dq.*
+** cvttps2dq.*
+** cvttps2dq.*
+** cvttps2dq.*
+** punpck[lh]bw.*
+** punpck[lh]bw.*
+** punpck[lh]bw.*
+** punpck[lh]bw.*
+** punpcklbw.*
+*/
+auto
+cvt3(simd::vec<float, 16> x)
+{ return static_cast<simd::vec<signed char, 16>>(x); }
+
+/*
+** cvt4:
+** .*
+** cvttps2dq.*
+** cvttps2dq.*
+** cvttps2dq.*
+** punpck[lh]bw.*
+** punpck[lh]bw.*
+** punpck[lh]bw.*
+** punpck[lh]bw.*
+** punpcklbw.*
+*/
+auto
+cvt4(simd::vec<float, 12> x)
+{ return static_cast<simd::vec<signed char, 12>>(x); }
+
+/*
+** cvt5:
+** .*
+** punpck[lh]bw.*
+** punpck[lh]bw.*
+** punpck[lh]bw.*
+** punpck[lh]bw.*
+** punpcklbw.*
+*/
+auto
+cvt5(simd::vec<int, 8> x)
+{ return static_cast<simd::vec<signed char, 8>>(x); }
+
+/*
+** cvt6:
+** .*
+** punpck[lh]wd.*
+** punpck[lh]wd.*
+** punpck[lh]wd.*
+** punpck[lh]wd.*
+** punpcklwd.*
+*/
+auto
+cvt6(simd::vec<int, 8> x)
+{ return static_cast<simd::vec<signed short, 8>>(x); }
+}
signature.asc
Description: This is a digitally signed message part.
