On Sun, Aug 3, 2025 at 10:59 PM Luc Grosheintz <luc.groshei...@gmail.com>
wrote:

> Let E denote an multi-dimensional extent; n the rank of E; r = 0, ...,
> n; E[i] the i-th extent; and D[k] be the (possibly empty) array of
> dynamic extents.
>
> The two partial products for r = 0, ..., n:
>
>   \prod_{i = 0}^r E[i]     (fwd)
>   \prod_{i = r+1}^n E[i]   (rev)
>
> can be computed as the product of static and dynamic extents. The static
> fwd and rev product can be computed at compile time for all values of r.
>
> Three methods are directly affected by this optimization:
>
>   layout_left::mapping::stride
>   layout_right::mapping::stride
>   mdspan::size
>
> We'll check the generated code (-O2) for all three methods for a generic
> (artificially) high-dimensional multi-dimensional extents.
>
> Consider a generic case:
>
>   using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>;
>
>   int stride_left(const std::layout_left::mapping<Extents>& m, size_t r)
>   { return m.stride(r); }
>
> The code generated prior to this commit:
>
>   4f0:  66 0f 6f 05 00 00 00   movdqa xmm0,XMMWORD PTR [rip+0x0]        #
> 4f8
>   4f7:  00
>   4f8:  48 83 c6 01            add    rsi,0x1
>   4fc:  48 c7 44 24 e8 ff ff   mov    QWORD PTR
> [rsp-0x18],0xffffffffffffffff
>   503:  ff ff
>   505:  48 8d 04 f5 00 00 00   lea    rax,[rsi*8+0x0]
>   50c:  00
>   50d:  0f 29 44 24 b8         movaps XMMWORD PTR [rsp-0x48],xmm0
>   512:  66 0f 76 c0            pcmpeqd xmm0,xmm0
>   516:  0f 29 44 24 c8         movaps XMMWORD PTR [rsp-0x38],xmm0
>   51b:  66 0f 6f 05 00 00 00   movdqa xmm0,XMMWORD PTR [rip+0x0]        #
> 523
>   522:  00
>   523:  0f 29 44 24 d8         movaps XMMWORD PTR [rsp-0x28],xmm0
>   528:  48 83 f8 38            cmp    rax,0x38
>   52c:  74 72                  je     5a0 <stride_right_E1+0xb0>
>   52e:  48 8d 54 04 b8         lea    rdx,[rsp+rax*1-0x48]
>   533:  4c 8d 4c 24 f0         lea    r9,[rsp-0x10]
>   538:  b8 01 00 00 00         mov    eax,0x1
>   53d:  0f 1f 00               nop    DWORD PTR [rax]
>   540:  48 8b 0a               mov    rcx,QWORD PTR [rdx]
>   543:  49 89 c0               mov    r8,rax
>   546:  4c 0f af c1            imul   r8,rcx
>   54a:  48 83 f9 ff            cmp    rcx,0xffffffffffffffff
>   54e:  49 0f 45 c0            cmovne rax,r8
>   552:  48 83 c2 08            add    rdx,0x8
>   556:  49 39 d1               cmp    r9,rdx
>   559:  75 e5                  jne    540 <stride_right_E1+0x50>
>   55b:  48 85 c0               test   rax,rax
>   55e:  74 38                  je     598 <stride_right_E1+0xa8>
>   560:  48 8b 14 f5 00 00 00   mov    rdx,QWORD PTR [rsi*8+0x0]
>   567:  00
>   568:  48 c1 e2 02            shl    rdx,0x2
>   56c:  48 83 fa 10            cmp    rdx,0x10
>   570:  74 1e                  je     590 <stride_right_E1+0xa0>
>   572:  48 8d 4f 10            lea    rcx,[rdi+0x10]
>   576:  48 01 d7               add    rdi,rdx
>   579:  0f 1f 80 00 00 00 00   nop    DWORD PTR [rax+0x0]
>   580:  48 63 17               movsxd rdx,DWORD PTR [rdi]
>   583:  48 83 c7 04            add    rdi,0x4
>   587:  48 0f af c2            imul   rax,rdx
>   58b:  48 39 f9               cmp    rcx,rdi
>   58e:  75 f0                  jne    580 <stride_right_E1+0x90>
>   590:  c3                     ret
>   591:  0f 1f 80 00 00 00 00   nop    DWORD PTR [rax+0x0]
>   598:  c3                     ret
>   599:  0f 1f 80 00 00 00 00   nop    DWORD PTR [rax+0x0]
>   5a0:  b8 01 00 00 00         mov    eax,0x1
>   5a5:  eb b9                  jmp    560 <stride_right_E1+0x70>
>   5a7:  66 0f 1f 84 00 00 00   nop    WORD PTR [rax+rax*1+0x0]
>   5ae:  00 00
>
> which seems to be performing:
>
>   preparatory_work();
>   ret = 1
>   for(i = 0; i < rank; ++i)
>     tmp = ret * E[i]
>     if E[i] != -1
>       ret = tmp
>   for(i = 0; i < rank_dynamic; ++i)
>     ret *= D[i]
>
> This commit reduces it down to:
>
>   270:  48 8b 04 f5 00 00 00   mov    rax,QWORD PTR [rsi*8+0x0]
>   277:  00
>   278:  31 d2                  xor    edx,edx
>   27a:  48 85 c0               test   rax,rax
>   27d:  74 33                  je     2b2 <stride_right_E1+0x42>
>   27f:  48 8b 14 f5 00 00 00   mov    rdx,QWORD PTR [rsi*8+0x0]
>   286:  00
>   287:  48 c1 e2 02            shl    rdx,0x2
>   28b:  48 83 fa 10            cmp    rdx,0x10
>   28f:  74 1f                  je     2b0 <stride_right_E1+0x40>
>   291:  48 8d 4f 10            lea    rcx,[rdi+0x10]
>   295:  48 01 d7               add    rdi,rdx
>   298:  0f 1f 84 00 00 00 00   nop    DWORD PTR [rax+rax*1+0x0]
>   29f:  00
>   2a0:  48 63 17               movsxd rdx,DWORD PTR [rdi]
>   2a3:  48 83 c7 04            add    rdi,0x4
>   2a7:  48 0f af c2            imul   rax,rdx
>   2ab:  48 39 f9               cmp    rcx,rdi
>   2ae:  75 f0                  jne    2a0 <stride_right_E1+0x30>
>   2b0:  89 c2                  mov    edx,eax
>   2b2:  89 d0                  mov    eax,edx
>   2b4:  c3                     ret
>
> Loosely speaking this does the following:
>
>   1. Load the starting position k in the array of dynamic extents; and
>      return if possible.
>   2. Load the partial product of static extents.
>   3. Computes the \prod_{i = k}^d D[i] where d is the number of
>   dynamic extents in a loop.
>
> It shows that the span used for passing in the dynamic extents is
> completely eliminated; and the fact that the product always runs to the
> end of the array of dynamic extents is used by the compiler to eliminate
> one indirection to determine the end position in the array of dynamic
> extents.
>
> The analogous code is generated for layout_left.
>
> Next, consider
>
>   using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>;
>   int size2(const std::mdspan<double, E2>& md)
>   { return md.size(); }
>
> on immediately preceding commit the generated code is
>
>   10:  66 0f 6f 05 00 00 00   movdqa xmm0,XMMWORD PTR [rip+0x0]        # 18
>   17:  00
>   18:  49 89 f8               mov    r8,rdi
>   1b:  48 8d 44 24 b8         lea    rax,[rsp-0x48]
>   20:  48 c7 44 24 e8 0b 00   mov    QWORD PTR [rsp-0x18],0xb
>   27:  00 00
>   29:  48 8d 7c 24 f0         lea    rdi,[rsp-0x10]
>   2e:  ba 01 00 00 00         mov    edx,0x1
>   33:  0f 29 44 24 b8         movaps XMMWORD PTR [rsp-0x48],xmm0
>   38:  66 0f 76 c0            pcmpeqd xmm0,xmm0
>   3c:  0f 29 44 24 c8         movaps XMMWORD PTR [rsp-0x38],xmm0
>   41:  66 0f 6f 05 00 00 00   movdqa xmm0,XMMWORD PTR [rip+0x0]        # 49
>   48:  00
>   49:  0f 29 44 24 d8         movaps XMMWORD PTR [rsp-0x28],xmm0
>   4e:  66 66 2e 0f 1f 84 00   data16 cs nop WORD PTR [rax+rax*1+0x0]
>   55:  00 00 00 00
>   59:  0f 1f 80 00 00 00 00   nop    DWORD PTR [rax+0x0]
>   60:  48 8b 08               mov    rcx,QWORD PTR [rax]
>   63:  48 89 d6               mov    rsi,rdx
>   66:  48 0f af f1            imul   rsi,rcx
>   6a:  48 83 f9 ff            cmp    rcx,0xffffffffffffffff
>   6e:  48 0f 45 d6            cmovne rdx,rsi
>   72:  48 83 c0 08            add    rax,0x8
>   76:  48 39 c7               cmp    rdi,rax
>   79:  75 e5                  jne    60 <size2+0x50>
>   7b:  48 85 d2               test   rdx,rdx
>   7e:  74 18                  je     98 <size2+0x88>
>   80:  49 63 00               movsxd rax,DWORD PTR [r8]
>   83:  49 63 48 04            movsxd rcx,DWORD PTR [r8+0x4]
>   87:  48 0f af c1            imul   rax,rcx
>   8b:  41 0f af 40 08         imul   eax,DWORD PTR [r8+0x8]
>   90:  0f af c2               imul   eax,edx
>   93:  c3                     ret
>   94:  0f 1f 40 00            nop    DWORD PTR [rax+0x0]
>   98:  31 c0                  xor    eax,eax
>   9a:  c3                     ret
>
> which is needlessly long. The current commit reduces it down to:
>
>   10:  48 63 07               movsxd rax,DWORD PTR [rdi]
>   13:  48 63 57 04            movsxd rdx,DWORD PTR [rdi+0x4]
>   17:  48 0f af c2            imul   rax,rdx
>   1b:  0f af 47 08            imul   eax,DWORD PTR [rdi+0x8]
>   1f:  69 c0 83 04 00 00      imul   eax,eax,0x483
>   25:  c3                     ret
>
> Which simply computes the product:
>
>   D[0] * D[1] * D[2] * const
>
> where const is the product of all static extents. Meaning the loop to
> compute the product of dynamic extents has been fully unrolled and
> all constants are perfectly precomputed.
>
> The size of the object file described in the previous commit reduces
> by 17% from 55.8kB to 46.0kB.
>
> libstdc++-v3/ChangeLog:
>
>         * include/std/mdspan (__mdspan::__static_prod): New function.
>         (__mdspan::__fwd_partial_prods): Constexpr array of partial
>         forward products.
>         (__mdspan::__fwd_partial_prods): Same for reverse partial
>         products.
>         (__mdspan::__static_extents_prod): Delete function.
>         (__mdspan::__extents_prod): Renamed from __exts_prod and
> refactored.
>         include/std/mdspan (__mdspan::__fwd_prod): Compute as the
>         product of pre-computed static static and the product of dynamic
>         extents.
>         (__mdspan::__rev_prod): Ditto.
>
> Signed-off-by: Luc Grosheintz <luc.groshei...@gmail.com>
> ---
>
LGTM.

>  libstdc++-v3/include/std/mdspan | 77 ++++++++++++++++++++++-----------
>  1 file changed, 52 insertions(+), 25 deletions(-)
>
> diff --git a/libstdc++-v3/include/std/mdspan
> b/libstdc++-v3/include/std/mdspan
> index 7b73df8550e..dc1b44ee35c 100644
> --- a/libstdc++-v3/include/std/mdspan
> +++ b/libstdc++-v3/include/std/mdspan
> @@ -202,6 +202,41 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>        __static_extents() noexcept
>        { return _Extents::_S_storage::_S_static_extents(); }
>
> +    template<array _Extents>
> +      consteval size_t
> +      __static_prod(size_t __begin, size_t __end)
> +      {
> +       size_t __prod = 1;
> +       for(size_t __i = __begin; __i < __end; ++__i)
> +         {
> +           auto __ext = _Extents[__i];
> +           __prod *= __ext == dynamic_extent ? size_t(1) : __ext;
> +         }
> +       return __prod;
> +      }
> +
> +    // Pre-compute: \prod_{i = 0}^r _Extents[i], for r = 0,..., n
> (exclusive)
> +    template<array _Extents>
> +      constexpr auto __fwd_partial_prods = [] consteval
> +       {
> +         constexpr size_t __rank = _Extents.size();
> +         std::array<size_t, __rank + 1> __ret;
> +         for(size_t __r = 0; __r < __rank + 1; ++__r)
> +           __ret[__r] = __static_prod<_Extents>(0, __r);
> +         return __ret;
> +       }();
> +
> +    // Pre-compute: \prod_{i = r+1}^{n-1} _Extents[i]
> +    template<array _Extents>
> +      constexpr auto __rev_partial_prods = [] consteval
> +       {
> +         constexpr size_t __rank = _Extents.size();
> +         std::array<size_t, __rank> __ret;
> +         for(size_t __r = 0; __r < __rank; ++__r)
> +           __ret[__r] = __static_prod<_Extents>(__r + 1, __rank);
> +         return __ret;
> +       }();
> +
>      template<typename _Extents>
>        constexpr span<const typename _Extents::index_type>
>        __dynamic_extents(const _Extents& __exts, size_t __begin = 0,
> @@ -369,47 +404,39 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>           return false;
>        }
>
> -    constexpr size_t
> -    __static_extents_prod(const auto& __sta_exts) noexcept
> -    {
> -      size_t __ret = 1;
> -      for (auto __factor : __sta_exts)
> -       if (__factor != dynamic_extent)
> -         __ret *= __factor;
> -      return __ret;
> -    }
> -
>      template<typename _Extents>
>        constexpr typename _Extents::index_type
> -      __exts_prod(const _Extents& __exts, size_t __begin, size_t __end)
> noexcept
> +      __extents_prod(const _Extents& __exts, size_t __sta_prod, size_t
> __begin,
> +                    size_t __end) noexcept
>        {
> -       using _IndexType = typename _Extents::index_type;
> -
> -       size_t __ret = 1;
> -       if constexpr (_Extents::rank_dynamic() != _Extents::rank())
> -         {
> -           auto __sta_exts = __static_extents<_Extents>();
> -           __ret = __static_extents_prod(span{__sta_exts.data() + __begin,
> -                                              __sta_exts.data() + __end});
> -           if (__ret == 0)
> -             return 0;
> -         }
> +       if (__sta_prod == 0)
> +         return 0;
>
> +       size_t __ret = __sta_prod;
>         if constexpr (_Extents::rank_dynamic() > 0)
>           for (auto __factor : __dynamic_extents(__exts, __begin, __end))
>             __ret *= size_t(__factor);
> -       return _IndexType(__ret);
> +       return static_cast<typename _Extents::index_type>(__ret);
>        }
>
>      template<typename _Extents>
>        constexpr typename _Extents::index_type
>        __fwd_prod(const _Extents& __exts, size_t __r) noexcept
> -      { return __exts_prod(__exts, 0, __r); }
> +      {
> +       constexpr auto __sta_exts = __static_extents<_Extents>();
> +       size_t __sta_prod = __fwd_partial_prods<__sta_exts>[__r];
> +       return __extents_prod(__exts, __sta_prod, 0, __r);
> +      }
>
>      template<typename _Extents>
>        constexpr typename _Extents::index_type
>        __rev_prod(const _Extents& __exts, size_t __r) noexcept
> -      { return __exts_prod(__exts, __r + 1, __exts.rank()); }
> +      {
> +       constexpr auto __sta_exts = __static_extents<_Extents>();
> +       constexpr size_t __rank = _Extents::rank();
> +       size_t __sta_prod = __rev_partial_prods<__sta_exts>[__r];
> +       return __extents_prod(__exts, __sta_prod, __r + 1, __rank);
> +      }
>
>      template<typename _Extents>
>        constexpr typename _Extents::index_type
> --
> 2.50.0
>
>

Reply via email to