From: Luc Grosheintz <luc.groshei...@gmail.com>

Let E denote an multi-dimensional extent; n the rank of E; r = 0, ...,
n; E[i] the i-th extent; and D[k] be the (possibly empty) array of
dynamic extents.

The two partial products for r = 0, ..., n:

  \prod_{i = 0}^r E[i]     (fwd)
  \prod_{i = r+1}^n E[i]   (rev)

can be computed as the product of static and dynamic extents. The static
fwd and rev product can be computed at compile time for all values of r.

Three methods are directly affected by this optimization:

  layout_left::mapping::stride
  layout_right::mapping::stride
  mdspan::size

We'll check the generated code (-O2) for all three methods for a generic
(artificially) high-dimensional multi-dimensional extents.

Consider a generic case:

  using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>;

  int stride_left(const std::layout_left::mapping<Extents>& m, size_t r)
  { return m.stride(r); }

The code generated prior to this commit:

  4f0:  66 0f 6f 05 00 00 00   movdqa xmm0,XMMWORD PTR [rip+0x0]        # 4f8
  4f7:  00
  4f8:  48 83 c6 01            add    rsi,0x1
  4fc:  48 c7 44 24 e8 ff ff   mov    QWORD PTR [rsp-0x18],0xffffffffffffffff
  503:  ff ff
  505:  48 8d 04 f5 00 00 00   lea    rax,[rsi*8+0x0]
  50c:  00
  50d:  0f 29 44 24 b8         movaps XMMWORD PTR [rsp-0x48],xmm0
  512:  66 0f 76 c0            pcmpeqd xmm0,xmm0
  516:  0f 29 44 24 c8         movaps XMMWORD PTR [rsp-0x38],xmm0
  51b:  66 0f 6f 05 00 00 00   movdqa xmm0,XMMWORD PTR [rip+0x0]        # 523
  522:  00
  523:  0f 29 44 24 d8         movaps XMMWORD PTR [rsp-0x28],xmm0
  528:  48 83 f8 38            cmp    rax,0x38
  52c:  74 72                  je     5a0 <stride_right_E1+0xb0>
  52e:  48 8d 54 04 b8         lea    rdx,[rsp+rax*1-0x48]
  533:  4c 8d 4c 24 f0         lea    r9,[rsp-0x10]
  538:  b8 01 00 00 00         mov    eax,0x1
  53d:  0f 1f 00               nop    DWORD PTR [rax]
  540:  48 8b 0a               mov    rcx,QWORD PTR [rdx]
  543:  49 89 c0               mov    r8,rax
  546:  4c 0f af c1            imul   r8,rcx
  54a:  48 83 f9 ff            cmp    rcx,0xffffffffffffffff
  54e:  49 0f 45 c0            cmovne rax,r8
  552:  48 83 c2 08            add    rdx,0x8
  556:  49 39 d1               cmp    r9,rdx
  559:  75 e5                  jne    540 <stride_right_E1+0x50>
  55b:  48 85 c0               test   rax,rax
  55e:  74 38                  je     598 <stride_right_E1+0xa8>
  560:  48 8b 14 f5 00 00 00   mov    rdx,QWORD PTR [rsi*8+0x0]
  567:  00
  568:  48 c1 e2 02            shl    rdx,0x2
  56c:  48 83 fa 10            cmp    rdx,0x10
  570:  74 1e                  je     590 <stride_right_E1+0xa0>
  572:  48 8d 4f 10            lea    rcx,[rdi+0x10]
  576:  48 01 d7               add    rdi,rdx
  579:  0f 1f 80 00 00 00 00   nop    DWORD PTR [rax+0x0]
  580:  48 63 17               movsxd rdx,DWORD PTR [rdi]
  583:  48 83 c7 04            add    rdi,0x4
  587:  48 0f af c2            imul   rax,rdx
  58b:  48 39 f9               cmp    rcx,rdi
  58e:  75 f0                  jne    580 <stride_right_E1+0x90>
  590:  c3                     ret
  591:  0f 1f 80 00 00 00 00   nop    DWORD PTR [rax+0x0]
  598:  c3                     ret
  599:  0f 1f 80 00 00 00 00   nop    DWORD PTR [rax+0x0]
  5a0:  b8 01 00 00 00         mov    eax,0x1
  5a5:  eb b9                  jmp    560 <stride_right_E1+0x70>
  5a7:  66 0f 1f 84 00 00 00   nop    WORD PTR [rax+rax*1+0x0]
  5ae:  00 00

which seems to be performing:

  preparatory_work();
  ret = 1
  for(i = 0; i < rank; ++i)
    tmp = ret * E[i]
    if E[i] != -1
      ret = tmp
  for(i = 0; i < rank_dynamic; ++i)
    ret *= D[i]

This commit reduces it down to:

  270:  48 8b 04 f5 00 00 00   mov    rax,QWORD PTR [rsi*8+0x0]
  277:  00
  278:  31 d2                  xor    edx,edx
  27a:  48 85 c0               test   rax,rax
  27d:  74 33                  je     2b2 <stride_right_E1+0x42>
  27f:  48 8b 14 f5 00 00 00   mov    rdx,QWORD PTR [rsi*8+0x0]
  286:  00
  287:  48 c1 e2 02            shl    rdx,0x2
  28b:  48 83 fa 10            cmp    rdx,0x10
  28f:  74 1f                  je     2b0 <stride_right_E1+0x40>
  291:  48 8d 4f 10            lea    rcx,[rdi+0x10]
  295:  48 01 d7               add    rdi,rdx
  298:  0f 1f 84 00 00 00 00   nop    DWORD PTR [rax+rax*1+0x0]
  29f:  00
  2a0:  48 63 17               movsxd rdx,DWORD PTR [rdi]
  2a3:  48 83 c7 04            add    rdi,0x4
  2a7:  48 0f af c2            imul   rax,rdx
  2ab:  48 39 f9               cmp    rcx,rdi
  2ae:  75 f0                  jne    2a0 <stride_right_E1+0x30>
  2b0:  89 c2                  mov    edx,eax
  2b2:  89 d0                  mov    eax,edx
  2b4:  c3                     ret

Loosely speaking this does the following:

  1. Load the starting position k in the array of dynamic extents; and
     return if possible.
  2. Load the partial product of static extents.
  3. Computes the \prod_{i = k}^d D[i] where d is the number of
  dynamic extents in a loop.

It shows that the span used for passing in the dynamic extents is
completely eliminated; and the fact that the product always runs to the
end of the array of dynamic extents is used by the compiler to eliminate
one indirection to determine the end position in the array of dynamic
extents.

The analogous code is generated for layout_left.

Next, consider

  using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>;
  int size2(const std::mdspan<double, E2>& md)
  { return md.size(); }

on immediately preceding commit the generated code is

  10:  66 0f 6f 05 00 00 00   movdqa xmm0,XMMWORD PTR [rip+0x0]        # 18
  17:  00
  18:  49 89 f8               mov    r8,rdi
  1b:  48 8d 44 24 b8         lea    rax,[rsp-0x48]
  20:  48 c7 44 24 e8 0b 00   mov    QWORD PTR [rsp-0x18],0xb
  27:  00 00
  29:  48 8d 7c 24 f0         lea    rdi,[rsp-0x10]
  2e:  ba 01 00 00 00         mov    edx,0x1
  33:  0f 29 44 24 b8         movaps XMMWORD PTR [rsp-0x48],xmm0
  38:  66 0f 76 c0            pcmpeqd xmm0,xmm0
  3c:  0f 29 44 24 c8         movaps XMMWORD PTR [rsp-0x38],xmm0
  41:  66 0f 6f 05 00 00 00   movdqa xmm0,XMMWORD PTR [rip+0x0]        # 49
  48:  00
  49:  0f 29 44 24 d8         movaps XMMWORD PTR [rsp-0x28],xmm0
  4e:  66 66 2e 0f 1f 84 00   data16 cs nop WORD PTR [rax+rax*1+0x0]
  55:  00 00 00 00
  59:  0f 1f 80 00 00 00 00   nop    DWORD PTR [rax+0x0]
  60:  48 8b 08               mov    rcx,QWORD PTR [rax]
  63:  48 89 d6               mov    rsi,rdx
  66:  48 0f af f1            imul   rsi,rcx
  6a:  48 83 f9 ff            cmp    rcx,0xffffffffffffffff
  6e:  48 0f 45 d6            cmovne rdx,rsi
  72:  48 83 c0 08            add    rax,0x8
  76:  48 39 c7               cmp    rdi,rax
  79:  75 e5                  jne    60 <size2+0x50>
  7b:  48 85 d2               test   rdx,rdx
  7e:  74 18                  je     98 <size2+0x88>
  80:  49 63 00               movsxd rax,DWORD PTR [r8]
  83:  49 63 48 04            movsxd rcx,DWORD PTR [r8+0x4]
  87:  48 0f af c1            imul   rax,rcx
  8b:  41 0f af 40 08         imul   eax,DWORD PTR [r8+0x8]
  90:  0f af c2               imul   eax,edx
  93:  c3                     ret
  94:  0f 1f 40 00            nop    DWORD PTR [rax+0x0]
  98:  31 c0                  xor    eax,eax
  9a:  c3                     ret

which is needlessly long. The current commit reduces it down to:

  10:  48 63 07               movsxd rax,DWORD PTR [rdi]
  13:  48 63 57 04            movsxd rdx,DWORD PTR [rdi+0x4]
  17:  48 0f af c2            imul   rax,rdx
  1b:  0f af 47 08            imul   eax,DWORD PTR [rdi+0x8]
  1f:  69 c0 83 04 00 00      imul   eax,eax,0x483
  25:  c3                     ret

Which simply computes the product:

  D[0] * D[1] * D[2] * const

where const is the product of all static extents. Meaning the loop to
compute the product of dynamic extents has been fully unrolled and
all constants are perfectly precomputed.

The size of the object file described in the previous commit reduces
by 17% from 55.8kB to 46.0kB.

libstdc++-v3/ChangeLog:

        * include/std/mdspan (__mdspan::__static_prod): New function.
        (__mdspan::__fwd_partial_prods): Constexpr array of partial
        forward products.
        (__mdspan::__fwd_partial_prods): Same for reverse partial
        products.
        (__mdspan::__static_extents_prod): Delete function.
        (__mdspan::__extents_prod): Renamed from __exts_prod and refactored.
        include/std/mdspan (__mdspan::__fwd_prod): Compute as the
        product of pre-computed static static and the product of dynamic
        extents.
        (__mdspan::__rev_prod): Ditto.

Reviewed-by: Tomasz Kamiński <tkami...@redhat.com>
Signed-off-by: Luc Grosheintz <luc.groshei...@gmail.com>
---
v3 stores reference to __sta_exts instead of making copy.

 libstdc++-v3/include/std/mdspan | 77 ++++++++++++++++++++++-----------
 1 file changed, 52 insertions(+), 25 deletions(-)

diff --git a/libstdc++-v3/include/std/mdspan b/libstdc++-v3/include/std/mdspan
index 08e6b8ff518..dd53e60237b 100644
--- a/libstdc++-v3/include/std/mdspan
+++ b/libstdc++-v3/include/std/mdspan
@@ -202,6 +202,41 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       __static_extents() noexcept
       { return _Extents::_S_storage::_S_static_extents(); }
 
+    template<array _Extents>
+      consteval size_t
+      __static_prod(size_t __begin, size_t __end)
+      {
+       size_t __prod = 1;
+       for(size_t __i = __begin; __i < __end; ++__i)
+         {
+           auto __ext = _Extents[__i];
+           __prod *= __ext == dynamic_extent ? size_t(1) : __ext;
+         }
+       return __prod;
+      }
+
+    // Pre-compute: \prod_{i = 0}^r _Extents[i], for r = 0,..., n (exclusive)
+    template<array _Extents>
+      constexpr auto __fwd_partial_prods = [] consteval
+       {
+         constexpr size_t __rank = _Extents.size();
+         std::array<size_t, __rank + 1> __ret;
+         for(size_t __r = 0; __r < __rank + 1; ++__r)
+           __ret[__r] = __static_prod<_Extents>(0, __r);
+         return __ret;
+       }();
+
+    // Pre-compute: \prod_{i = r+1}^{n-1} _Extents[i]
+    template<array _Extents>
+      constexpr auto __rev_partial_prods = [] consteval
+       {
+         constexpr size_t __rank = _Extents.size();
+         std::array<size_t, __rank> __ret;
+         for(size_t __r = 0; __r < __rank; ++__r)
+           __ret[__r] = __static_prod<_Extents>(__r + 1, __rank);
+         return __ret;
+       }();
+
     template<typename _Extents>
       constexpr span<const typename _Extents::index_type>
       __dynamic_extents(const _Extents& __exts, size_t __begin = 0,
@@ -369,47 +404,39 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
          return false;
       }
 
-    constexpr size_t
-    __static_extents_prod(const auto& __sta_exts) noexcept
-    {
-      size_t __ret = 1;
-      for (auto __factor : __sta_exts)
-       if (__factor != dynamic_extent)
-         __ret *= __factor;
-      return __ret;
-    }
-
     template<typename _Extents>
       constexpr typename _Extents::index_type
-      __exts_prod(const _Extents& __exts, size_t __begin, size_t __end) 
noexcept
+      __extents_prod(const _Extents& __exts, size_t __sta_prod, size_t __begin,
+                    size_t __end) noexcept
       {
-       using _IndexType = typename _Extents::index_type;
-
-       size_t __ret = 1;
-       if constexpr (_Extents::rank_dynamic() != _Extents::rank())
-         {
-           std::span<const size_t> __sta_exts = __static_extents<_Extents>();
-           __ret = __static_extents_prod(__sta_exts.subspan(__begin,
-                                                            __end - __begin));
-           if (__ret == 0)
-             return 0;
-         }
+       if (__sta_prod == 0)
+         return 0;
 
+       size_t __ret = __sta_prod;
        if constexpr (_Extents::rank_dynamic() > 0)
          for (auto __factor : __dynamic_extents(__exts, __begin, __end))
            __ret *= size_t(__factor);
-       return _IndexType(__ret);
+       return static_cast<typename _Extents::index_type>(__ret);
       }
 
     template<typename _Extents>
       constexpr typename _Extents::index_type
       __fwd_prod(const _Extents& __exts, size_t __r) noexcept
-      { return __exts_prod(__exts, 0, __r); }
+      {
+       constexpr auto& __sta_exts = __static_extents<_Extents>();
+       size_t __sta_prod = __fwd_partial_prods<__sta_exts>[__r];
+       return __extents_prod(__exts, __sta_prod, 0, __r);
+      }
 
     template<typename _Extents>
       constexpr typename _Extents::index_type
       __rev_prod(const _Extents& __exts, size_t __r) noexcept
-      { return __exts_prod(__exts, __r + 1, __exts.rank()); }
+      {
+       constexpr auto& __sta_exts = __static_extents<_Extents>();
+       constexpr size_t __rank = _Extents::rank();
+       size_t __sta_prod = __rev_partial_prods<__sta_exts>[__r];
+       return __extents_prod(__exts, __sta_prod, __r + 1, __rank);
+      }
 
     template<typename _Extents>
       constexpr typename _Extents::index_type
-- 
2.49.0

Reply via email to