From: Luc Grosheintz <luc.groshei...@gmail.com> Let E denote an multi-dimensional extent; n the rank of E; r = 0, ..., n; E[i] the i-th extent; and D[k] be the (possibly empty) array of dynamic extents.
The two partial products for r = 0, ..., n: \prod_{i = 0}^r E[i] (fwd) \prod_{i = r+1}^n E[i] (rev) can be computed as the product of static and dynamic extents. The static fwd and rev product can be computed at compile time for all values of r. Three methods are directly affected by this optimization: layout_left::mapping::stride layout_right::mapping::stride mdspan::size We'll check the generated code (-O2) for all three methods for a generic (artificially) high-dimensional multi-dimensional extents. Consider a generic case: using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>; int stride_left(const std::layout_left::mapping<Extents>& m, size_t r) { return m.stride(r); } The code generated prior to this commit: 4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4f8 4f7: 00 4f8: 48 83 c6 01 add rsi,0x1 4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR [rsp-0x18],0xffffffffffffffff 503: ff ff 505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0] 50c: 00 50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 512: 66 0f 76 c0 pcmpeqd xmm0,xmm0 516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 523 522: 00 523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 528: 48 83 f8 38 cmp rax,0x38 52c: 74 72 je 5a0 <stride_right_E1+0xb0> 52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48] 533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10] 538: b8 01 00 00 00 mov eax,0x1 53d: 0f 1f 00 nop DWORD PTR [rax] 540: 48 8b 0a mov rcx,QWORD PTR [rdx] 543: 49 89 c0 mov r8,rax 546: 4c 0f af c1 imul r8,rcx 54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 54e: 49 0f 45 c0 cmovne rax,r8 552: 48 83 c2 08 add rdx,0x8 556: 49 39 d1 cmp r9,rdx 559: 75 e5 jne 540 <stride_right_E1+0x50> 55b: 48 85 c0 test rax,rax 55e: 74 38 je 598 <stride_right_E1+0xa8> 560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 567: 00 568: 48 c1 e2 02 shl rdx,0x2 56c: 48 83 fa 10 cmp rdx,0x10 570: 74 1e je 590 <stride_right_E1+0xa0> 572: 48 8d 4f 10 lea rcx,[rdi+0x10] 576: 48 01 d7 add rdi,rdx 579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 580: 48 63 17 movsxd rdx,DWORD PTR [rdi] 583: 48 83 c7 04 add rdi,0x4 587: 48 0f af c2 imul rax,rdx 58b: 48 39 f9 cmp rcx,rdi 58e: 75 f0 jne 580 <stride_right_E1+0x90> 590: c3 ret 591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 598: c3 ret 599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 5a0: b8 01 00 00 00 mov eax,0x1 5a5: eb b9 jmp 560 <stride_right_E1+0x70> 5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0] 5ae: 00 00 which seems to be performing: preparatory_work(); ret = 1 for(i = 0; i < rank; ++i) tmp = ret * E[i] if E[i] != -1 ret = tmp for(i = 0; i < rank_dynamic; ++i) ret *= D[i] This commit reduces it down to: 270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0] 277: 00 278: 31 d2 xor edx,edx 27a: 48 85 c0 test rax,rax 27d: 74 33 je 2b2 <stride_right_E1+0x42> 27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0] 286: 00 287: 48 c1 e2 02 shl rdx,0x2 28b: 48 83 fa 10 cmp rdx,0x10 28f: 74 1f je 2b0 <stride_right_E1+0x40> 291: 48 8d 4f 10 lea rcx,[rdi+0x10] 295: 48 01 d7 add rdi,rdx 298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0] 29f: 00 2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi] 2a3: 48 83 c7 04 add rdi,0x4 2a7: 48 0f af c2 imul rax,rdx 2ab: 48 39 f9 cmp rcx,rdi 2ae: 75 f0 jne 2a0 <stride_right_E1+0x30> 2b0: 89 c2 mov edx,eax 2b2: 89 d0 mov eax,edx 2b4: c3 ret Loosely speaking this does the following: 1. Load the starting position k in the array of dynamic extents; and return if possible. 2. Load the partial product of static extents. 3. Computes the \prod_{i = k}^d D[i] where d is the number of dynamic extents in a loop. It shows that the span used for passing in the dynamic extents is completely eliminated; and the fact that the product always runs to the end of the array of dynamic extents is used by the compiler to eliminate one indirection to determine the end position in the array of dynamic extents. The analogous code is generated for layout_left. Next, consider using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>; int size2(const std::mdspan<double, E2>& md) { return md.size(); } on immediately preceding commit the generated code is 10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18 17: 00 18: 49 89 f8 mov r8,rdi 1b: 48 8d 44 24 b8 lea rax,[rsp-0x48] 20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb 27: 00 00 29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10] 2e: ba 01 00 00 00 mov edx,0x1 33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0 38: 66 0f 76 c0 pcmpeqd xmm0,xmm0 3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0 41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49 48: 00 49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0 4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0] 55: 00 00 00 00 59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0] 60: 48 8b 08 mov rcx,QWORD PTR [rax] 63: 48 89 d6 mov rsi,rdx 66: 48 0f af f1 imul rsi,rcx 6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff 6e: 48 0f 45 d6 cmovne rdx,rsi 72: 48 83 c0 08 add rax,0x8 76: 48 39 c7 cmp rdi,rax 79: 75 e5 jne 60 <size2+0x50> 7b: 48 85 d2 test rdx,rdx 7e: 74 18 je 98 <size2+0x88> 80: 49 63 00 movsxd rax,DWORD PTR [r8] 83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4] 87: 48 0f af c1 imul rax,rcx 8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8] 90: 0f af c2 imul eax,edx 93: c3 ret 94: 0f 1f 40 00 nop DWORD PTR [rax+0x0] 98: 31 c0 xor eax,eax 9a: c3 ret which is needlessly long. The current commit reduces it down to: 10: 48 63 07 movsxd rax,DWORD PTR [rdi] 13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4] 17: 48 0f af c2 imul rax,rdx 1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8] 1f: 69 c0 83 04 00 00 imul eax,eax,0x483 25: c3 ret Which simply computes the product: D[0] * D[1] * D[2] * const where const is the product of all static extents. Meaning the loop to compute the product of dynamic extents has been fully unrolled and all constants are perfectly precomputed. The size of the object file described in the previous commit reduces by 17% from 55.8kB to 46.0kB. libstdc++-v3/ChangeLog: * include/std/mdspan (__mdspan::__static_prod): New function. (__mdspan::__fwd_partial_prods): Constexpr array of partial forward products. (__mdspan::__fwd_partial_prods): Same for reverse partial products. (__mdspan::__static_extents_prod): Delete function. (__mdspan::__extents_prod): Renamed from __exts_prod and refactored. include/std/mdspan (__mdspan::__fwd_prod): Compute as the product of pre-computed static static and the product of dynamic extents. (__mdspan::__rev_prod): Ditto. Reviewed-by: Tomasz Kamiński <tkami...@redhat.com> Signed-off-by: Luc Grosheintz <luc.groshei...@gmail.com> --- v3 stores reference to __sta_exts instead of making copy. libstdc++-v3/include/std/mdspan | 77 ++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 25 deletions(-) diff --git a/libstdc++-v3/include/std/mdspan b/libstdc++-v3/include/std/mdspan index 08e6b8ff518..dd53e60237b 100644 --- a/libstdc++-v3/include/std/mdspan +++ b/libstdc++-v3/include/std/mdspan @@ -202,6 +202,41 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION __static_extents() noexcept { return _Extents::_S_storage::_S_static_extents(); } + template<array _Extents> + consteval size_t + __static_prod(size_t __begin, size_t __end) + { + size_t __prod = 1; + for(size_t __i = __begin; __i < __end; ++__i) + { + auto __ext = _Extents[__i]; + __prod *= __ext == dynamic_extent ? size_t(1) : __ext; + } + return __prod; + } + + // Pre-compute: \prod_{i = 0}^r _Extents[i], for r = 0,..., n (exclusive) + template<array _Extents> + constexpr auto __fwd_partial_prods = [] consteval + { + constexpr size_t __rank = _Extents.size(); + std::array<size_t, __rank + 1> __ret; + for(size_t __r = 0; __r < __rank + 1; ++__r) + __ret[__r] = __static_prod<_Extents>(0, __r); + return __ret; + }(); + + // Pre-compute: \prod_{i = r+1}^{n-1} _Extents[i] + template<array _Extents> + constexpr auto __rev_partial_prods = [] consteval + { + constexpr size_t __rank = _Extents.size(); + std::array<size_t, __rank> __ret; + for(size_t __r = 0; __r < __rank; ++__r) + __ret[__r] = __static_prod<_Extents>(__r + 1, __rank); + return __ret; + }(); + template<typename _Extents> constexpr span<const typename _Extents::index_type> __dynamic_extents(const _Extents& __exts, size_t __begin = 0, @@ -369,47 +404,39 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return false; } - constexpr size_t - __static_extents_prod(const auto& __sta_exts) noexcept - { - size_t __ret = 1; - for (auto __factor : __sta_exts) - if (__factor != dynamic_extent) - __ret *= __factor; - return __ret; - } - template<typename _Extents> constexpr typename _Extents::index_type - __exts_prod(const _Extents& __exts, size_t __begin, size_t __end) noexcept + __extents_prod(const _Extents& __exts, size_t __sta_prod, size_t __begin, + size_t __end) noexcept { - using _IndexType = typename _Extents::index_type; - - size_t __ret = 1; - if constexpr (_Extents::rank_dynamic() != _Extents::rank()) - { - std::span<const size_t> __sta_exts = __static_extents<_Extents>(); - __ret = __static_extents_prod(__sta_exts.subspan(__begin, - __end - __begin)); - if (__ret == 0) - return 0; - } + if (__sta_prod == 0) + return 0; + size_t __ret = __sta_prod; if constexpr (_Extents::rank_dynamic() > 0) for (auto __factor : __dynamic_extents(__exts, __begin, __end)) __ret *= size_t(__factor); - return _IndexType(__ret); + return static_cast<typename _Extents::index_type>(__ret); } template<typename _Extents> constexpr typename _Extents::index_type __fwd_prod(const _Extents& __exts, size_t __r) noexcept - { return __exts_prod(__exts, 0, __r); } + { + constexpr auto& __sta_exts = __static_extents<_Extents>(); + size_t __sta_prod = __fwd_partial_prods<__sta_exts>[__r]; + return __extents_prod(__exts, __sta_prod, 0, __r); + } template<typename _Extents> constexpr typename _Extents::index_type __rev_prod(const _Extents& __exts, size_t __r) noexcept - { return __exts_prod(__exts, __r + 1, __exts.rank()); } + { + constexpr auto& __sta_exts = __static_extents<_Extents>(); + constexpr size_t __rank = _Extents::rank(); + size_t __sta_prod = __rev_partial_prods<__sta_exts>[__r]; + return __extents_prod(__exts, __sta_prod, __r + 1, __rank); + } template<typename _Extents> constexpr typename _Extents::index_type -- 2.49.0