Hi,
the following testcase:

  // Minimal reproducer: a single bit lookup in a packed std::vector<bool>.
  // v[x] goes through begin()[__n], i.e. the iterator's _M_incr path shown
  // below, which is what produces the branchy code in the first listing.
  bool f(const std::vector<bool>& v, std::size_t x) {
    return v[x];
  }

is compiled as:

f(std::vector<bool, std::allocator<bool> > const&, unsigned long):
        testq   %rsi, %rsi
        leaq    63(%rsi), %rax
        movq    (%rdi), %rdx
        cmovns  %rsi, %rax
        sarq    $6, %rax
        leaq    (%rdx,%rax,8), %rdx
        movq    %rsi, %rax
        sarq    $63, %rax
        shrq    $58, %rax
        addq    %rax, %rsi
        andl    $63, %esi
        subq    %rax, %rsi
        jns     .L2
        addq    $64, %rsi
        subq    $8, %rdx
.L2:
        movl    $1, %eax
        shlx    %rsi, %rax, %rax
        andq    (%rdx), %rax
        setne   %al
        ret

which is quite expensive for simple bit access in a bitmap.  The reason is that
the bit access is implemented using iterators
        return begin()[__n];
Which in turn cares about situation where __n is negative yielding the extra
conditional.

    // Advance the bit iterator by __i bits (__i may be negative), then
    // re-normalize so that _M_offset ends up in [0, _S_word_bit).
    _GLIBCXX20_CONSTEXPR
    void
    _M_incr(ptrdiff_t __i)
    {
      _M_assume_normalized();
      // Total signed bit displacement measured from the current word base.
      difference_type __n = __i + _M_offset;
      _M_p += __n / int(_S_word_bit);
      __n = __n % int(_S_word_bit);
      // C++ division truncates toward zero, so a negative __n leaves a
      // negative remainder here; step back one word to restore the
      // 0 <= _M_offset < _S_word_bit invariant.  This branch is the extra
      // conditional visible in the generated assembly above.
      if (__n < 0)
        {
          __n += int(_S_word_bit);
          --_M_p;
        }
      _M_offset = static_cast<unsigned int>(__n);
    }

We could use __builtin_unreachable to declare that __n is in the range
0...max_size(), but I think it is better to implement the access directly,
since the resulting code is shorter and much easier to optimize.

We now produce:
.LFB1248:
        .cfi_startproc
        movq    (%rdi), %rax
        movq    %rsi, %rdx
        shrq    $6, %rdx
        andq    (%rax,%rdx,8), %rsi
        andl    $63, %esi
        setne   %al
        ret

Testcase suggests
        movq    (%rdi), %rax
        movl    %esi, %ecx
        shrq    $5, %rsi        # does still need to be 64-bit
        movl    (%rax,%rsi,4), %eax
        btl     %ecx, %eax
        setb    %al
        retq
Which is still one instruction shorter.

Bootstrapped/regtested x86_64-linux, OK?

libstdc++-v3/ChangeLog:

        PR target/80813
        * include/bits/stl_bvector.h (vector<bool, _Alloc>::operator []): Do
        not use iterators.

gcc/testsuite/ChangeLog:

        PR target/80813
        * g++.dg/tree-ssa/bvector-3.C: New test.

diff --git a/gcc/testsuite/g++.dg/tree-ssa/bvector-3.C 
b/gcc/testsuite/g++.dg/tree-ssa/bvector-3.C
new file mode 100644
index 00000000000..feae791b20d
--- /dev/null
+++ b/gcc/testsuite/g++.dg/tree-ssa/bvector-3.C
@@ -0,0 +1,10 @@
+// { dg-do compile }
+// { dg-options "-O2 -fdump-tree-optimized"  }
+// { dg-skip-if "requires hosted libstdc++ for vector" { ! hostedlib } }
+
+#include <vector>
+bool f(const std::vector<bool>& v, std::size_t x) {
+           return v[x];
+}
+// The bit access should lower to branch-free code, so no "if (" remains.
+// { dg-final { scan-tree-dump-not "if \\("  optimized } }
diff --git a/libstdc++-v3/include/bits/stl_bvector.h 
b/libstdc++-v3/include/bits/stl_bvector.h
index 341eee33b21..975857bfdbd 100644
--- a/libstdc++-v3/include/bits/stl_bvector.h
+++ b/libstdc++-v3/include/bits/stl_bvector.h
@@ -1132,7 +1141,9 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
       operator[](size_type __n)
       {
        __glibcxx_requires_subscript(__n);
-       return begin()[__n];
+       return _Bit_reference (this->_M_impl._M_start._M_p
+                              + __n / int(_S_word_bit),
+                              1UL << __n % int(_S_word_bit));
       }
 
       _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
@@ -1140,7 +1151,9 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
       operator[](size_type __n) const
       {
        __glibcxx_requires_subscript(__n);
-       return begin()[__n];
+       return _Bit_reference (this->_M_impl._M_start._M_p
+                              + __n / int(_S_word_bit),
+                              1UL << __n % int(_S_word_bit));
       }
 
     protected:

Reply via email to