On Fri, 27 Dec 2024 at 20:13, Jan Hubicka <hubi...@ucw.cz> wrote:
>
> Hi,
> the following testcase:
>
>   bool f(const std::vector<bool>& v, std::size_t x) {
>     return v[x];
>   }
>
> is compiled as:
>
> f(std::vector<bool, std::allocator<bool> > const&, unsigned long):
>         testq   %rsi, %rsi
>         leaq    63(%rsi), %rax
>         movq    (%rdi), %rdx
>         cmovns  %rsi, %rax
>         sarq    $6, %rax
>         leaq    (%rdx,%rax,8), %rdx
>         movq    %rsi, %rax
>         sarq    $63, %rax
>         shrq    $58, %rax
>         addq    %rax, %rsi
>         andl    $63, %esi
>         subq    %rax, %rsi
>         jns     .L2
>         addq    $64, %rsi
>         subq    $8, %rdx
> .L2:
>         movl    $1, %eax
>         shlx    %rsi, %rax, %rax
>         andq    (%rdx), %rax
>         setne   %al
>         ret
>
> which is quite expensive for simple bit access in a bitmap.  The reason is 
> that
> the bit access is implemented using iterators
>         return begin()[__n];
> Which in turn cares about situation where __n is negative yielding the extra
> conditional.
>
>     _GLIBCXX20_CONSTEXPR
>     void
>     _M_incr(ptrdiff_t __i)
>     {
>       _M_assume_normalized();
>       difference_type __n = __i + _M_offset;
>       _M_p += __n / int(_S_word_bit);
>       __n = __n % int(_S_word_bit);
>       if (__n < 0)
>         {
>           __n += int(_S_word_bit);
>           --_M_p;
>         }
>       _M_offset = static_cast<unsigned int>(__n);
>     }
>
> While we can use __builtin_unreachable to declare that __n is in range
> 0...max_size () but I think it is better to implement it directly, since
> resulting code is shorter and much easier to optimize.

Yeah I think that change makes sense, OK for trunk, thanks!

>
> We now porduce:
> .LFB1248:
>         .cfi_startproc
>         movq    (%rdi), %rax
>         movq    %rsi, %rdx
>         shrq    $6, %rdx
>         andq    (%rax,%rdx,8), %rsi
>         andl    $63, %esi
>         setne   %al
>         ret
>
> Testcase suggests
>         movq    (%rdi), %rax
>         movl    %esi, %ecx
>         shrq    $5, %rsi        # does still need to be 64-bit
>         movl    (%rax,%rsi,4), %eax
>         btl     %ecx, %eax
>         setb    %al
>         retq
> Which is still one instruction shorter.
>
> Bootstrapped/regtested x86_64-linux, OK?
>
> libstdc++-v3/ChangeLog:
>
>         PR target/80813
>         * include/bits/stl_bvector.h (vector<bool, _Alloc>::operator []): Do
>         not use iterators.
>
> gcc/testsuite/ChangeLog:
>
>         PR target/80813
>         * g++.dg/tree-ssa/bvector-3.C: New test.
>
> diff --git a/gcc/testsuite/g++.dg/tree-ssa/bvector-3.C 
> b/gcc/testsuite/g++.dg/tree-ssa/bvector-3.C
> new file mode 100644
> index 00000000000..feae791b20d
> --- /dev/null
> +++ b/gcc/testsuite/g++.dg/tree-ssa/bvector-3.C
> @@ -0,0 +1,10 @@
> +// { dg-do compile }
> +// { dg-options "-O2 -fdump-tree-optimized"  }
> +// { dg-skip-if "requires hosted libstdc++ for vector" { ! hostedlib } }
> +
> +#include <vector>
> +bool f(const std::vector<bool>& v, std::size_t x) {
> +           return v[x];
> +}
> +// All references to src should be optimized out, so there should be no name 
> for it
> +// { dg-final { scan-tree-dump-not "if \\("  optimized } }
> diff --git a/libstdc++-v3/include/bits/stl_bvector.h 
> b/libstdc++-v3/include/bits/stl_bvector.h
> index 341eee33b21..975857bfdbd 100644
> --- a/libstdc++-v3/include/bits/stl_bvector.h
> +++ b/libstdc++-v3/include/bits/stl_bvector.h
> @@ -1132,7 +1141,9 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>        operator[](size_type __n)
>        {
>         __glibcxx_requires_subscript(__n);
> -       return begin()[__n];
> +       return _Bit_reference (this->_M_impl._M_start._M_p
> +                              + __n / int(_S_word_bit),
> +                              1UL << __n % int(_S_word_bit));
>        }
>
>        _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
> @@ -1140,7 +1151,9 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>        operator[](size_type __n) const
>        {
>         __glibcxx_requires_subscript(__n);
> -       return begin()[__n];
> +       return _Bit_reference (this->_M_impl._M_start._M_p
> +                              + __n / int(_S_word_bit),
> +                              1UL << __n % int(_S_word_bit));
>        }
>
>      protected:
>

Reply via email to