From: Luc Grosheintz <luc.groshei...@gmail.com>

In both fully static and dynamic extents the comparison
  static_extent(i) == dynamic_extent
is known at compile time. As a result, extents::extent doesn't
need to perform the check at runtime.

An illustrative example is:

  using E = std::extents<int, 3, 5, 7, 11, 13, 17>;
  int required_span_size(const typename Layout::mapping<E>& m)
  { return m.required_span_size(); }

Prior to this commit the generated code (on -O2) is:

 2a0:  b9 01 00 00 00         mov    ecx,0x1
 2a5:  31 d2                  xor    edx,edx
 2a7:  66 66 2e 0f 1f 84 00   data16 cs nop WORD PTR [rax+rax*1+0x0]
 2ae:  00 00 00 00
 2b2:  66 66 2e 0f 1f 84 00   data16 cs nop WORD PTR [rax+rax*1+0x0]
 2b9:  00 00 00 00
 2bd:  0f 1f 00               nop    DWORD PTR [rax]
 2c0:  48 8b 04 d5 00 00 00   mov    rax,QWORD PTR [rdx*8+0x0]
 2c7:  00
 2c8:  48 83 f8 ff            cmp    rax,0xffffffffffffffff
 2cc:  0f 84 00 00 00 00      je     2d2 <required_span_size_6d_static+0x32>
 2d2:  83 e8 01               sub    eax,0x1
 2d5:  0f af 04 97            imul   eax,DWORD PTR [rdi+rdx*4]
 2d9:  48 83 c2 01            add    rdx,0x1
 2dd:  01 c1                  add    ecx,eax
 2df:  48 83 fa 06            cmp    rdx,0x6
 2e3:  75 db                  jne    2c0 <required_span_size_6d_static+0x20>
 2e5:  89 c8                  mov    eax,ecx
 2e7:  c3                     ret

which is a scalar loop, and notably includes the check

  308:  48 83 f8 ff            cmp    rax,0xffffffffffffffff

to assert that the static extent is indeed not -1. Note, that on -O3 the
optimizer eliminates the comparison; and generates a sequence of scalar
operations: lea, shl, add and mov. The aim of this commit is to
eliminate this comparison also for -O2. With the optimization applied we
get:

  2e0:  f3 0f 6f 0f            movdqu xmm1,XMMWORD PTR [rdi]
  2e4:  66 0f 6f 15 00 00 00   movdqa xmm2,XMMWORD PTR [rip+0x0]
  2eb:  00
  2ec:  8b 57 10               mov    edx,DWORD PTR [rdi+0x10]
  2ef:  66 0f 6f c1            movdqa xmm0,xmm1
  2f3:  66 0f 73 d1 20         psrlq  xmm1,0x20
  2f8:  66 0f f4 c2            pmuludq xmm0,xmm2
  2fc:  66 0f 73 d2 20         psrlq  xmm2,0x20
  301:  8d 14 52               lea    edx,[rdx+rdx*2]
  304:  66 0f f4 ca            pmuludq xmm1,xmm2
  308:  66 0f 70 c0 08         pshufd xmm0,xmm0,0x8
  30d:  66 0f 70 c9 08         pshufd xmm1,xmm1,0x8
  312:  66 0f 62 c1            punpckldq xmm0,xmm1
  316:  66 0f 6f c8            movdqa xmm1,xmm0
  31a:  66 0f 73 d9 08         psrldq xmm1,0x8
  31f:  66 0f fe c1            paddd  xmm0,xmm1
  323:  66 0f 6f c8            movdqa xmm1,xmm0
  327:  66 0f 73 d9 04         psrldq xmm1,0x4
  32c:  66 0f fe c1            paddd  xmm0,xmm1
  330:  66 0f 7e c0            movd   eax,xmm0
  334:  8d 54 90 01            lea    edx,[rax+rdx*4+0x1]
  338:  8b 47 14               mov    eax,DWORD PTR [rdi+0x14]
  33b:  c1 e0 04               shl    eax,0x4
  33e:  01 d0                  add    eax,edx
  340:  c3                     ret

Which shows eliminating the trivial comparison, unlocks a new set of
optimizations, i.e. SIMD-vectorization. In particular, the loop has been
vectorized by loading the first four constants from aligned memory; the
first four strides from non-aligned memory, then computes the product
and reduction. It interleaves the above with computing 1 + 12*S[4] +
16*S[5] (as scalar operations) and then finishes the reduction.

A similar effect can be observed for fully dynamic extents.

libstdc++-v3/ChangeLog:

        * include/std/mdspan (__mdspan::__all_static): New function.
        (__mdspan::_StaticExtents::_S_is_dyn): Inline and eliminate.
        (__mdspan::_ExtentsStorage::_S_is_dynamic): New method.
        (__mdspan::_ExtentsStorage::_M_extent): Use _S_is_dynamic.

Reviewed-by: Tomasz Kamiński <tkami...@redhat.com>
Signed-off-by: Luc Grosheintz <luc.groshei...@gmail.com>
---
v3 changes __all_static to use span.

 libstdc++-v3/include/std/mdspan | 35 +++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/libstdc++-v3/include/std/mdspan b/libstdc++-v3/include/std/mdspan
index a9168af4acc..4b271116a02 100644
--- a/libstdc++-v3/include/std/mdspan
+++ b/libstdc++-v3/include/std/mdspan
@@ -49,6 +49,15 @@ namespace std _GLIBCXX_VISIBILITY(default)
 _GLIBCXX_BEGIN_NAMESPACE_VERSION
   namespace __mdspan
   {
+    consteval bool
+    __all_static(std::span<const size_t> __extents)
+    {
+      for(auto __ext : __extents)
+       if (__ext == dynamic_extent)
+         return false;
+      return true;
+    }
+
     consteval bool
     __all_dynamic(std::span<const size_t> __extents)
     {
@@ -62,10 +71,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       class _StaticExtents
       {
       public:
-       static consteval bool
-       _S_is_dyn(size_t __ext) noexcept
-       { return __ext == dynamic_extent; }
-
        static constexpr size_t _S_rank = _Extents.size();
 
        // For __r in [0, _S_rank], _S_dynamic_index(__r) is the number
@@ -85,7 +90,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
          for (size_t __i = 0; __i < _S_rank; ++__i)
            {
              __ret[__i] = __dyn;
-             __dyn += _S_is_dyn(_Extents[__i]);
+             __dyn += (_Extents[__i] == dynamic_extent);
            }
          __ret[_S_rank] = __dyn;
          return __ret;
@@ -103,7 +108,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        {
          array<size_t, _S_rank_dynamic> __ret;
          for (size_t __i = 0, __r = 0; __i < _S_rank; ++__i)
-           if (_S_is_dyn(_Extents[__i]))
+           if (_Extents[__i] == dynamic_extent)
              __ret[__r++] = __i;
          return __ret;
        }();
@@ -148,6 +153,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        using _S_base::_S_dynamic_index_inv;
        using _S_base::_S_static_extent;
 
+       static constexpr bool
+       _S_is_dynamic(size_t __r) noexcept
+       {
+         if constexpr (__all_static(_Extents))
+           return false;
+         else if constexpr (__all_dynamic(_Extents))
+           return true;
+         else
+           return _Extents[__r] == dynamic_extent;
+       }
+
        template<typename _OIndexType>
          static constexpr _IndexType
          _S_int_cast(const _OIndexType& __other) noexcept
@@ -156,11 +172,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        constexpr _IndexType
        _M_extent(size_t __r) const noexcept
        {
-         auto __se = _Extents[__r];
-         if (__se == dynamic_extent)
+         if (_S_is_dynamic(__r))
            return _M_dyn_exts[_S_dynamic_index(__r)];
          else
-           return __se;
+           return _S_static_extent(__r);
        }
 
        template<size_t _OtherRank, typename _GetOtherExtent>
@@ -169,7 +184,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
          {
            if constexpr (_OtherRank == _S_rank)
              for (size_t __i = 0; __i < _S_rank; ++__i)
-               if (_Extents[__i] != dynamic_extent
+               if (!_S_is_dynamic(__i)
                    && !cmp_equal(_Extents[__i], 
_S_int_cast(__get_extent(__i))))
                  return false;
            return true;
-- 
2.49.0

Reply via email to