On Sun, Aug 3, 2025 at 11:16 PM Luc Grosheintz <luc.groshei...@gmail.com>
wrote:

> In both fully static and dynamic extents the comparison
>   static_extent(i) == dynamic_extent
> is known at compile time. As a result, extents::extent doesn't
> need to perform the check at runtime.
>
> An illustrative example is:
>
>   using E = std::extents<int, 3, 5, 7, 11, 13, 17>;
>   int required_span_size(const typename Layout::mapping<E>& m)
>   { return m.required_span_size(); }
>
> Prior to this commit the generated code (on -O2) is:
>
>  2a0:  b9 01 00 00 00         mov    ecx,0x1
>  2a5:  31 d2                  xor    edx,edx
>  2a7:  66 66 2e 0f 1f 84 00   data16 cs nop WORD PTR [rax+rax*1+0x0]
>  2ae:  00 00 00 00
>  2b2:  66 66 2e 0f 1f 84 00   data16 cs nop WORD PTR [rax+rax*1+0x0]
>  2b9:  00 00 00 00
>  2bd:  0f 1f 00               nop    DWORD PTR [rax]
>  2c0:  48 8b 04 d5 00 00 00   mov    rax,QWORD PTR [rdx*8+0x0]
>  2c7:  00
>  2c8:  48 83 f8 ff            cmp    rax,0xffffffffffffffff
>  2cc:  0f 84 00 00 00 00      je     2d2
> <required_span_size_6d_static+0x32>
>  2d2:  83 e8 01               sub    eax,0x1
>  2d5:  0f af 04 97            imul   eax,DWORD PTR [rdi+rdx*4]
>  2d9:  48 83 c2 01            add    rdx,0x1
>  2dd:  01 c1                  add    ecx,eax
>  2df:  48 83 fa 06            cmp    rdx,0x6
>  2e3:  75 db                  jne    2c0
> <required_span_size_6d_static+0x20>
>  2e5:  89 c8                  mov    eax,ecx
>  2e7:  c3                     ret
>
> which is a scalar loop, and notably includes the check
>
>   308:  48 83 f8 ff            cmp    rax,0xffffffffffffffff
>
> to assert that the static extent is indeed not -1. Note, that on -O3 the
> optimizer eliminates the comparison; and generates a sequence of scalar
> operations: lea, shl, add and mov. The aim of this commit is to
> eliminate this comparison also for -O2. With the optimization applied we
> get:
>
>   2e0:  f3 0f 6f 0f            movdqu xmm1,XMMWORD PTR [rdi]
>   2e4:  66 0f 6f 15 00 00 00   movdqa xmm2,XMMWORD PTR [rip+0x0]
>   2eb:  00
>   2ec:  8b 57 10               mov    edx,DWORD PTR [rdi+0x10]
>   2ef:  66 0f 6f c1            movdqa xmm0,xmm1
>   2f3:  66 0f 73 d1 20         psrlq  xmm1,0x20
>   2f8:  66 0f f4 c2            pmuludq xmm0,xmm2
>   2fc:  66 0f 73 d2 20         psrlq  xmm2,0x20
>   301:  8d 14 52               lea    edx,[rdx+rdx*2]
>   304:  66 0f f4 ca            pmuludq xmm1,xmm2
>   308:  66 0f 70 c0 08         pshufd xmm0,xmm0,0x8
>   30d:  66 0f 70 c9 08         pshufd xmm1,xmm1,0x8
>   312:  66 0f 62 c1            punpckldq xmm0,xmm1
>   316:  66 0f 6f c8            movdqa xmm1,xmm0
>   31a:  66 0f 73 d9 08         psrldq xmm1,0x8
>   31f:  66 0f fe c1            paddd  xmm0,xmm1
>   323:  66 0f 6f c8            movdqa xmm1,xmm0
>   327:  66 0f 73 d9 04         psrldq xmm1,0x4
>   32c:  66 0f fe c1            paddd  xmm0,xmm1
>   330:  66 0f 7e c0            movd   eax,xmm0
>   334:  8d 54 90 01            lea    edx,[rax+rdx*4+0x1]
>   338:  8b 47 14               mov    eax,DWORD PTR [rdi+0x14]
>   33b:  c1 e0 04               shl    eax,0x4
>   33e:  01 d0                  add    eax,edx
>   340:  c3                     ret
>
> Which shows eliminating the trivial comparison, unlocks a new set of
> optimizations, i.e. SIMD-vectorization. In particular, the loop has been
> vectorized by loading the first four constants from aligned memory; the
> first four strides from non-aligned memory, then computes the product
> and reduction. It interleaves the above with computing 1 + 12*S[4] +
> 16*S[5] (as scalar operations) and then finishes the reduction.
>
> A similar effect can be observed for fully dynamic extents.
>
> libstdc++-v3/ChangeLog:
>
>         * include/std/mdspan (__mdspan::__all_static): New function.
>         (__mdspan::_StaticExtents::_S_is_dyn): Inline and eliminate.
>         (__mdspan::_ExtentsStorage::_S_is_dynamic): New method.
>         (__mdspan::_ExtentsStorage::_M_extent): Use _S_is_dynamic.
>
> Signed-off-by: Luc Grosheintz <luc.groshei...@gmail.com>
> ---
>
Again applying locally small changes.

>  libstdc++-v3/include/std/mdspan | 35 +++++++++++++++++++++++----------
>  1 file changed, 25 insertions(+), 10 deletions(-)
>
> diff --git a/libstdc++-v3/include/std/mdspan
> b/libstdc++-v3/include/std/mdspan
> index ccf1028466f..ee0c2b65c7f 100644
> --- a/libstdc++-v3/include/std/mdspan
> +++ b/libstdc++-v3/include/std/mdspan
> @@ -49,6 +49,15 @@ namespace std _GLIBCXX_VISIBILITY(default)
>  _GLIBCXX_BEGIN_NAMESPACE_VERSION
>    namespace __mdspan
>    {
> +    template<array _Extents>
> +      consteval bool
> +      __all_static(size_t __begin = 0, size_t __end = _Extents.size())
>
I pass span here.

> +      {
> +       for(size_t __i = __begin; __i < __end; ++__i)
> +         if (_Extents[__i] == dynamic_extent)
> +           return false;
> +       return true;
> +      }
>
>      template<array _Extents>
>        consteval bool
> @@ -64,10 +73,6 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>        class _StaticExtents
>        {
>        public:
> -       static consteval bool
> -       _S_is_dyn(size_t __ext) noexcept
> -       { return __ext == dynamic_extent; }
> -
>         static constexpr size_t _S_rank = _Extents.size();
>
>         // For __r in [0, _S_rank], _S_dynamic_index(__r) is the number
> @@ -87,7 +92,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>           for (size_t __i = 0; __i < _S_rank; ++__i)
>             {
>               __ret[__i] = __dyn;
> -             __dyn += _S_is_dyn(_Extents[__i]);
> +             __dyn += _Extents[__i] == dynamic_extent;
>
Added parenthesis here:
 __dyn += (_Extents[__i] == dynamic_extent);

>             }
>           __ret[_S_rank] = __dyn;
>           return __ret;
> @@ -105,7 +110,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>         {
>           array<size_t, _S_rank_dynamic> __ret;
>           for (size_t __i = 0, __r = 0; __i < _S_rank; ++__i)
> -           if (_S_is_dyn(_Extents[__i]))
> +           if (_Extents[__i] == dynamic_extent)
>               __ret[__r++] = __i;
>           return __ret;
>         }();
> @@ -150,6 +155,17 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>         using _S_base::_S_dynamic_index_inv;
>         using _S_base::_S_static_extent;
>
> +       static constexpr bool
> +       _S_is_dynamic(size_t __r) noexcept
> +       {
> +         if constexpr (__all_static<_Extents>())
> +           return false;
> +         else if constexpr (__all_dynamic<_Extents>())
> +           return true;
> +         else
> +           return _Extents[__r] == dynamic_extent;
> +       }
>
And this is now:
        static constexpr bool
        _S_is_dynamic(size_t __r) noexcept
        {
          if constexpr (__all_static(_Extents))
            return false;


          else if constexpr (__all_dynamic(_Extents))


            return true;
          else
            return _Extents[__r] == dynamic_extent;
        }

> +
>         template<typename _OIndexType>
>           static constexpr _IndexType
>           _S_int_cast(const _OIndexType& __other) noexcept
> @@ -158,11 +174,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>         constexpr _IndexType
>         _M_extent(size_t __r) const noexcept
>         {
> -         auto __se = _Extents[__r];
> -         if (__se == dynamic_extent)
> +         if (_S_is_dynamic(__r))
>             return _M_dyn_exts[_S_dynamic_index(__r)];
>           else
> -           return __se;
> +           return _S_static_extent(__r);
>         }
>
>         template<size_t _OtherRank, typename _GetOtherExtent>
> @@ -171,7 +186,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>           {
>             if constexpr (_OtherRank == _S_rank)
>               for (size_t __i = 0; __i < _S_rank; ++__i)
> -               if (_Extents[__i] != dynamic_extent
> +               if (!_S_is_dynamic(__i)
>                     && !cmp_equal(_Extents[__i],
> _S_int_cast(__get_extent(__i))))
>                   return false;
>             return true;
> --
> 2.50.0
>
>

Reply via email to