On Sun, Aug 3, 2025 at 10:59 PM Luc Grosheintz <luc.groshei...@gmail.com> wrote:
> Let E denote an multi-dimensional extent; n the rank of E; r = 0, ..., > n; E[i] the i-th extent; and D[k] be the (possibly empty) array of > dynamic extents. > > The two partial products for r = 0, ..., n: > > \prod_{i = 0}^r E[i] (fwd) > \prod_{i = r+1}^n E[i] (rev) > > can be computed as the product of static and dynamic extents. The static > fwd and rev product can be computed at compile time for all values of r. > > Three methods are directly affected by this optimization: > > layout_left::mapping::stride > layout_right::mapping::stride > mdspan::size > > We'll check the generated code (-O2) for all three methods for a generic > (artificially) high-dimensional multi-dimensional extents. > > Consider a generic case: > > using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>; > > int stride_left(const std::layout_left::mapping<Extents>& m, size_t r) > { return m.stride(r); } > > The code generated prior to this commit: > > 4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # > 4f8 > 4f7: 00 > 4f8: 48 83 c6 01 add rsi,0x1 > 4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR > [rsp-0x18],0xffffffffffffffff > 503: ff ff > 505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0] > 50c: 00 > 50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 > 512: 66 0f 76 c0 pcmpeqd xmm0,xmm0 > 516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 > 51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # > 523 > 522: 00 > 523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 > 528: 48 83 f8 38 cmp rax,0x38 > 52c: 74 72 je 5a0 <stride_right_E1+0xb0> > 52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48] > 533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10] > 538: b8 01 00 00 00 mov eax,0x1 > 53d: 0f 1f 00 nop DWORD PTR [rax] > 540: 48 8b 0a mov rcx,QWORD PTR [rdx] > 543: 49 89 c0 mov r8,rax > 546: 4c 0f af c1 imul r8,rcx > 54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff > 54e: 49 0f 45 c0 cmovne rax,r8 > 552: 48 83 c2 08 add rdx,0x8 > 556: 49 39 d1 cmp r9,rdx > 559: 75 e5 jne 540 <stride_right_E1+0x50> > 55b: 48 85 c0 test rax,rax > 55e: 74 38 je 598 <stride_right_E1+0xa8> > 560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] > 567: 00 > 568: 48 c1 e2 02 shl rdx,0x2 > 56c: 48 83 fa 10 cmp rdx,0x10 > 570: 74 1e je 590 <stride_right_E1+0xa0> > 572: 48 8d 4f 10 lea rcx,[rdi+0x10] > 576: 48 01 d7 add rdi,rdx > 579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] > 580: 48 63 17 movsxd rdx,DWORD PTR [rdi] > 583: 48 83 c7 04 add rdi,0x4 > 587: 48 0f af c2 imul rax,rdx > 58b: 48 39 f9 cmp rcx,rdi > 58e: 75 f0 jne 580 <stride_right_E1+0x90> > 590: c3 ret > 591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] > 598: c3 ret > 599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] > 5a0: b8 01 00 00 00 mov eax,0x1 > 5a5: eb b9 jmp 560 <stride_right_E1+0x70> > 5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0] > 5ae: 00 00 > > which seems to be performing: > > preparatory_work(); > ret = 1 > for(i = 0; i < rank; ++i) > tmp = ret * E[i] > if E[i] != -1 > ret = tmp > for(i = 0; i < rank_dynamic; ++i) > ret *= D[i] > > This commit reduces it down to: > > 270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] > 277: 00 > 278: 31 d2 xor edx,edx > 27a: 48 85 c0 test rax,rax > 27d: 74 33 je 2b2 <stride_right_E1+0x42> > 27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] > 286: 00 > 287: 48 c1 e2 02 shl rdx,0x2 > 28b: 48 83 fa 10 cmp rdx,0x10 > 28f: 74 1f je 2b0 <stride_right_E1+0x40> > 291: 48 8d 4f 10 lea rcx,[rdi+0x10] > 295: 48 01 d7 add rdi,rdx > 298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0] > 29f: 00 > 2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] > 2a3: 48 83 c7 04 add rdi,0x4 > 2a7: 48 0f af c2 imul rax,rdx > 2ab: 48 39 f9 cmp rcx,rdi > 2ae: 75 f0 jne 2a0 <stride_right_E1+0x30> > 2b0: 89 c2 mov edx,eax > 2b2: 89 d0 mov eax,edx > 2b4: c3 ret > > Loosely speaking this does the following: > > 1. Load the starting position k in the array of dynamic extents; and > return if possible. > 2. Load the partial product of static extents. > 3. Computes the \prod_{i = k}^d D[i] where d is the number of > dynamic extents in a loop. > > It shows that the span used for passing in the dynamic extents is > completely eliminated; and the fact that the product always runs to the > end of the array of dynamic extents is used by the compiler to eliminate > one indirection to determine the end position in the array of dynamic > extents. > > The analogous code is generated for layout_left. > > Next, consider > > using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>; > int size2(const std::mdspan<double, E2>& md) > { return md.size(); } > > on immediately preceding commit the generated code is > > 10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18 > 17: 00 > 18: 49 89 f8 mov r8,rdi > 1b: 48 8d 44 24 b8 lea rax,[rsp-0x48] > 20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb > 27: 00 00 > 29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10] > 2e: ba 01 00 00 00 mov edx,0x1 > 33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 > 38: 66 0f 76 c0 pcmpeqd xmm0,xmm0 > 3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 > 41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49 > 48: 00 > 49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 > 4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] > 55: 00 00 00 00 > 59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] > 60: 48 8b 08 mov rcx,QWORD PTR [rax] > 63: 48 89 d6 mov rsi,rdx > 66: 48 0f af f1 imul rsi,rcx > 6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff > 6e: 48 0f 45 d6 cmovne rdx,rsi > 72: 48 83 c0 08 add rax,0x8 > 76: 48 39 c7 cmp rdi,rax > 79: 75 e5 jne 60 <size2+0x50> > 7b: 48 85 d2 test rdx,rdx > 7e: 74 18 je 98 <size2+0x88> > 80: 49 63 00 movsxd rax,DWORD PTR [r8] > 83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4] > 87: 48 0f af c1 imul rax,rcx > 8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8] > 90: 0f af c2 imul eax,edx > 93: c3 ret > 94: 0f 1f 40 00 nop DWORD PTR [rax+0x0] > 98: 31 c0 xor eax,eax > 9a: c3 ret > > which is needlessly long. The current commit reduces it down to: > > 10: 48 63 07 movsxd rax,DWORD PTR [rdi] > 13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4] > 17: 48 0f af c2 imul rax,rdx > 1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8] > 1f: 69 c0 83 04 00 00 imul eax,eax,0x483 > 25: c3 ret > > Which simply computes the product: > > D[0] * D[1] * D[2] * const > > where const is the product of all static extents. Meaning the loop to > compute the product of dynamic extents has been fully unrolled and > all constants are perfectly precomputed. > > The size of the object file described in the previous commit reduces > by 17% from 55.8kB to 46.0kB. > > libstdc++-v3/ChangeLog: > > * include/std/mdspan (__mdspan::__static_prod): New function. > (__mdspan::__fwd_partial_prods): Constexpr array of partial > forward products. > (__mdspan::__fwd_partial_prods): Same for reverse partial > products. > (__mdspan::__static_extents_prod): Delete function. > (__mdspan::__extents_prod): Renamed from __exts_prod and > refactored. > include/std/mdspan (__mdspan::__fwd_prod): Compute as the > product of pre-computed static static and the product of dynamic > extents. > (__mdspan::__rev_prod): Ditto. > > Signed-off-by: Luc Grosheintz <luc.groshei...@gmail.com> > --- > LGTM. > libstdc++-v3/include/std/mdspan | 77 ++++++++++++++++++++++----------- > 1 file changed, 52 insertions(+), 25 deletions(-) > > diff --git a/libstdc++-v3/include/std/mdspan > b/libstdc++-v3/include/std/mdspan > index 7b73df8550e..dc1b44ee35c 100644 > --- a/libstdc++-v3/include/std/mdspan > +++ b/libstdc++-v3/include/std/mdspan > @@ -202,6 +202,41 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION > __static_extents() noexcept > { return _Extents::_S_storage::_S_static_extents(); } > > + template<array _Extents> > + consteval size_t > + __static_prod(size_t __begin, size_t __end) > + { > + size_t __prod = 1; > + for(size_t __i = __begin; __i < __end; ++__i) > + { > + auto __ext = _Extents[__i]; > + __prod *= __ext == dynamic_extent ? size_t(1) : __ext; > + } > + return __prod; > + } > + > + // Pre-compute: \prod_{i = 0}^r _Extents[i], for r = 0,..., n > (exclusive) > + template<array _Extents> > + constexpr auto __fwd_partial_prods = [] consteval > + { > + constexpr size_t __rank = _Extents.size(); > + std::array<size_t, __rank + 1> __ret; > + for(size_t __r = 0; __r < __rank + 1; ++__r) > + __ret[__r] = __static_prod<_Extents>(0, __r); > + return __ret; > + }(); > + > + // Pre-compute: \prod_{i = r+1}^{n-1} _Extents[i] > + template<array _Extents> > + constexpr auto __rev_partial_prods = [] consteval > + { > + constexpr size_t __rank = _Extents.size(); > + std::array<size_t, __rank> __ret; > + for(size_t __r = 0; __r < __rank; ++__r) > + __ret[__r] = __static_prod<_Extents>(__r + 1, __rank); > + return __ret; > + }(); > + > template<typename _Extents> > constexpr span<const typename _Extents::index_type> > __dynamic_extents(const _Extents& __exts, size_t __begin = 0, > @@ -369,47 +404,39 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION > return false; > } > > - constexpr size_t > - __static_extents_prod(const auto& __sta_exts) noexcept > - { > - size_t __ret = 1; > - for (auto __factor : __sta_exts) > - if (__factor != dynamic_extent) > - __ret *= __factor; > - return __ret; > - } > - > template<typename _Extents> > constexpr typename _Extents::index_type > - __exts_prod(const _Extents& __exts, size_t __begin, size_t __end) > noexcept > + __extents_prod(const _Extents& __exts, size_t __sta_prod, size_t > __begin, > + size_t __end) noexcept > { > - using _IndexType = typename _Extents::index_type; > - > - size_t __ret = 1; > - if constexpr (_Extents::rank_dynamic() != _Extents::rank()) > - { > - auto __sta_exts = __static_extents<_Extents>(); > - __ret = __static_extents_prod(span{__sta_exts.data() + __begin, > - __sta_exts.data() + __end}); > - if (__ret == 0) > - return 0; > - } > + if (__sta_prod == 0) > + return 0; > > + size_t __ret = __sta_prod; > if constexpr (_Extents::rank_dynamic() > 0) > for (auto __factor : __dynamic_extents(__exts, __begin, __end)) > __ret *= size_t(__factor); > - return _IndexType(__ret); > + return static_cast<typename _Extents::index_type>(__ret); > } > > template<typename _Extents> > constexpr typename _Extents::index_type > __fwd_prod(const _Extents& __exts, size_t __r) noexcept > - { return __exts_prod(__exts, 0, __r); } > + { > + constexpr auto __sta_exts = __static_extents<_Extents>(); > + size_t __sta_prod = __fwd_partial_prods<__sta_exts>[__r]; > + return __extents_prod(__exts, __sta_prod, 0, __r); > + } > > template<typename _Extents> > constexpr typename _Extents::index_type > __rev_prod(const _Extents& __exts, size_t __r) noexcept > - { return __exts_prod(__exts, __r + 1, __exts.rank()); } > + { > + constexpr auto __sta_exts = __static_extents<_Extents>(); > + constexpr size_t __rank = _Extents::rank(); > + size_t __sta_prod = __rev_partial_prods<__sta_exts>[__r]; > + return __extents_prod(__exts, __sta_prod, __r + 1, __rank); > + } > > template<typename _Extents> > constexpr typename _Extents::index_type > -- > 2.50.0 > >