https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112824
--- Comment #3 from Chris Elrod <elrodc at gmail dot com> ---
> I thought I hit the important cases, but my non-minimal example still gets
> unnecessary register splits and stack spills, so maybe I missed places, or
> perhaps there's another issue.

Adding the unroll pragma to the `Vector`'s operator + and *:

template<typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator+(Vector<T,N> x, Vector<T,N> y) -> Vector<T,N> {
  Vector<T,N> z;
#pragma GCC unroll 16
  for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x.data[n] + y.data[n];
  return z;
}

template<typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator*(Vector<T,N> x, Vector<T,N> y) -> Vector<T,N> {
  Vector<T,N> z;
#pragma GCC unroll 16
  for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x.data[n] * y.data[n];
  return z;
}

template<typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator+(T x, Vector<T,N> y) -> Vector<T,N> {
  Vector<T,N> z;
#pragma GCC unroll 16
  for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x + y.data[n];
  return z;
}

template<typename T, ptrdiff_t N>
[[gnu::always_inline]] constexpr auto operator*(T x, Vector<T,N> y) -> Vector<T,N> {
  Vector<T,N> z;
#pragma GCC unroll 16
  for (ptrdiff_t n = 0; n < Vector<T,N>::L; ++n) z.data[n] = x * y.data[n];
  return z;
}

does not improve code generation (still get the same problem), so that's a reproducer for such an issue.