On Sun, 8 Dec 2024 at 15:36, Jan Hubicka <hubi...@ucw.cz> wrote:
>
> Hi,
> std::vector has independent implementation for bool which has its won
> size/capacity functions.  I updated them to add __builtin_unreachable to
> announce that size is never more than max_size.  However while testing the 
> code
> I noticed that even construction of unused copy is not optimized out.  Main
> problem is that the vector copying loops copies the tail of vector using loop
> that copies bit by bit.  We eventually pattern match it to bit operations 
> (that
> surprises me) but we need to unroll it and run through loop optimization that
> happens late.
>
> This patch also updates copy_aglined to use bit operations. However for = 
> operation
> we can do better since it is not necessary to preserve original bits (those 
> are
> undefined anyway). So I added copy_aglined_trail (better name would be 
> welcome)
> that does not care about trailing bits of the copied block.
>
> As a result the following functions are optimized to empty functions:
> #include <vector>
> bool
> empty(std::vector<bool> src)
> {
>         std::vector <bool> data=src;
>         return false;
> }
> bool
> empty2(std::vector<bool> src)
> {
>         std::vector <bool> data;
>         data.reserve(1);
>         return data.size ();
> }
> bool
> empty3()
> {
>         std::vector <bool> data;
>         data.push_back (true);
>         return data[0];
> }
>
> Finally I mirrored changes to push_back from normal vectors to bit vectors.
> This involve separating append from insert and breaking out the resizing (and
> cold) path to separate function.
>
> Here are two little benchmarks on push_back:
>
> #include <vector>
>
> __attribute__ ((noipa))
> std::vector<bool>
> test()
> {
>         std::vector <bool> t;
>         t.push_back (1);
>         t.push_back (2);
>         return t;
> }
> int
> main()
> {
>         for (int i = 0; i < 100000000; i++)
>         {
>                 test();
>         }
>         return 0;
> }
>
>                                                  runtime(s) .text size of 
> a.out
> gcc -O2                                             1.04    1606
> gcc -O2 + patch                                     0.98    1315
> gcc -O3                                             0.98    1249
> gcc -O3 + patch                                     0.96    1138
> gcc -O3 + patch --param max-inline-insns-auto=5000  0.96    1138
> clang -O2                                           1.56    1823
> clang -O3                                           1.56    1839
> clang -O2 + libc++                                  2.31    4272
> clang -O3 + libc++                                  2.76    4262
>
> push_back is still too large to inline fully at -O3.  If parameters are 
> bumped up
> for small vectors this makes it possible to propagate bit positions.
> This variant of benchmark
>
> #include <vector>
>
> __attribute__ ((noipa))
> std::vector<bool>
> test()
> {
>         std::vector <bool> t;
>         t.push_back (1);
>         t.push_back (2);
>         return t;
> }
> int
> main()
> {
>         for (int i = 0; i < 100000000; i++)
>         {
>                 test();
>         }
>         return 0;
> }
>
>
>                                                  runtime(s) .text size of 
> a.out
> gcc -O2                                             1.48    1574
> gcc -O2 + patch                                     1.30    1177
> gcc -O3                                             1.46    1515
> gcc -O3 + patch                                     1.27    1069
> gcc -O3 + patch --param max-inline-insns-auto=5000  1.10    388
> clang -O2                                           1.48    1823
> clang -O3                                           1.46    1711
> clang -O2 + libc++                                  1.20    1001
> clang -O3 + libc++                                  1.17    1001
>
> Note that clang does not suport noipa attribute, so it has some advantage 
> here.
> Bootstrapped/regtested x86_64-linux, OK?
>
> libstdc++-v3/ChangeLog:
>
>         * include/bits/stl_bvector.h (vector<bool, _Alloc>::operator=): Use
>         _M_copy_aligned_trail.
>         (vector<bool, _Alloc>::copy_aglined_trail): New function.

Spelling

>         (vector<bool, _Alloc>::copy_aglined): Implement copying of tail 
> manually.

Spelling

>         (vector<bool, _Alloc>::size,vector<bool, _Alloc>::capacity): Add check
>         that size is at most max_size.
>         (vector<bool, _Alloc>::size,vector<bool, _Alloc>::push_back): Use
>         _M_append_aux.
>         (vector<bool, _Alloc>::size,vector<bool, _Alloc>::_M_append_aux): 
> Declare.
>         (vector<bool, _Alloc>::size,vector<bool, 
> _Alloc>::_M_realloc_append_aux): Declare.
>         (vector<bool, _Alloc>::size,vector<bool, 
> _Alloc>::_M_realloc_insert_aux): Declare.
>         * include/bits/vector.tcc
>         (vector<bool, _Alloc>::size,vector<bool, _Alloc>::_M_append_aux): 
> Implement.
>         (vector<bool, _Alloc>::size,vector<bool, 
> _Alloc>::_M_realloc_append_aux): Implement.
>         (vector<bool, _Alloc>::size,vector<bool, 
> _Alloc>::_M_realloc_insert_aux): Break out from ...
>         (vector<bool, _Alloc>::size,vector<bool, _Alloc>::_M_insert_aux): ... 
> here.
>
> gcc/testsuite/ChangeLog:
>
>         * g++.dg/tree-ssa/bvector-1.C: New test.
>
> diff --git a/gcc/testsuite/g++.dg/tree-ssa/bvector-1.C 
> b/gcc/testsuite/g++.dg/tree-ssa/bvector-1.C
> new file mode 100644
> index 00000000000..4197ff8c4b8
> --- /dev/null
> +++ b/gcc/testsuite/g++.dg/tree-ssa/bvector-1.C
> @@ -0,0 +1,28 @@
> +// { dg-do compile }
> +// { dg-options "-O2 -fdump-tree-optimized"  }
> +// { dg-skip-if "requires hosted libstdc++ for vector" { ! hostedlib } }
> +#include <vector>
> +bool
> +empty(std::vector<bool> src)
> +{
> +       std::vector <bool> data=src;
> +       return false;
> +}
> +bool
> +empty2(std::vector<bool> src)
> +{
> +       std::vector <bool> data;
> +       data.reserve(1);
> +       return data.size ();
> +}
> +bool
> +empty3()
> +{
> +       std::vector <bool> data;
> +       data.push_back (true);
> +       return data[0];
> +}
> +// All references to src should be optimized out, so there should be no name 
> for it
> +// { dg-final { scan-tree-dump-not "src_"  optimized } }
> +// { dg-final { scan-tree-dump-not "data_"  optimized } }
> +// { dg-final { scan-tree-dump-not "new"  optimized } }
> diff --git a/libstdc++-v3/include/bits/stl_bvector.h 
> b/libstdc++-v3/include/bits/stl_bvector.h
> index 341eee33b21..ce1c5121b38 100644
> --- a/libstdc++-v3/include/bits/stl_bvector.h
> +++ b/libstdc++-v3/include/bits/stl_bvector.h
> @@ -946,8 +946,8 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>             this->_M_deallocate();
>             _M_initialize(__x.size());
>           }
> -       this->_M_impl._M_finish = _M_copy_aligned(__x.begin(), __x.end(),
> -                                                 begin());
> +       this->_M_impl._M_finish = _M_copy_aligned_trail(__x.begin(), 
> __x.end(),
> +                                                       begin());
>         return *this;
>        }
>
> @@ -971,8 +971,8 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>                 this->_M_deallocate();
>                 _M_initialize(__x.size());
>               }
> -           this->_M_impl._M_finish = _M_copy_aligned(__x.begin(), __x.end(),
> -                                                     begin());
> +           this->_M_impl._M_finish = _M_copy_aligned_trail(__x.begin(), 
> __x.end(),
> +                                                             begin());
>             __x.clear();
>           }
>         return *this;
> @@ -1101,7 +1101,12 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>        _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
>        size_type
>        size() const _GLIBCXX_NOEXCEPT
> -      { return size_type(end() - begin()); }
> +      {
> +       const size_type __sz = size_type (end() - begin());
> +       if (__sz > max_size ())
> +          __builtin_unreachable ();
> +       return __sz;
> +      }
>
>        _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
>        size_type
> @@ -1119,8 +1124,12 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>        _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
>        size_type
>        capacity() const _GLIBCXX_NOEXCEPT
> -      { return size_type(const_iterator(this->_M_impl._M_end_addr(), 0)
> -                        - begin()); }
> +      {
> +       const size_type __sz = 
> size_type(const_iterator(this->_M_impl._M_end_addr(), 0) - begin());
> +       if (__sz > max_size ())
> +          __builtin_unreachable ();
> +       return __sz;
> +      }
>
>        _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
>        bool
> @@ -1221,7 +1230,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>         if (this->_M_impl._M_finish._M_p != this->_M_impl._M_end_addr())
>           *this->_M_impl._M_finish++ = __x;
>         else
> -         _M_insert_aux(end(), __x);
> +         _M_append_aux(__x);
>        }
>
>        _GLIBCXX20_CONSTEXPR
> @@ -1484,8 +1493,19 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>                       iterator __result)
>        {
>         _Bit_type* __q = std::copy(__first._M_p, __last._M_p, __result._M_p);
> -       return std::copy(const_iterator(__last._M_p, 0), __last,
> -                        iterator(__q, 0));
> +       if (!__last._M_offset)
> +         return iterator(__q, 0);
> +       _Bit_type __mask = ~0ul << __last._M_offset;
> +       *__q = (*__q & __mask) | (*__last._M_p & ~__mask);
> +       return iterator(__q, __last._M_offset);
> +      }

Newline between functions please.

This has the same precondition as _M_copy_aligned, right? It should
repeat the comment above _M_copy_aligned. It would also be nice to
have a comment saying what it does, and I agree a better name would
help.

Maybe "Copies [first,last2) to result, where last2 is the first word
boundary not less than last" or something like that?

How about _M_copy_aligned_ceil ? Since it can be considered to do a
"ceil" operation on the _M_last iterator, copying past its offset when
non-zero?


> +      _GLIBCXX20_CONSTEXPR
> +      iterator
> +      _M_copy_aligned_trail(const_iterator __first, const_iterator __last,
> +                           iterator __result)
> +      {
> +       _Bit_type* __q = std::copy(__first._M_p, __last._M_p + 
> (__last._M_offset != 0), __result._M_p);
> +       return iterator(__q - (__last._M_offset != 0), __last._M_offset);
>        }
>
>        _GLIBCXX20_CONSTEXPR
> @@ -1669,6 +1689,18 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>        void
>        _M_insert_aux(iterator __position, bool __x);
>
> +      _GLIBCXX20_CONSTEXPR
> +      void
> +      _M_realloc_insert_aux(iterator __position, bool __x);
> +
> +      _GLIBCXX20_CONSTEXPR
> +      void
> +      _M_append_aux(bool __x);
> +
> +      _GLIBCXX20_CONSTEXPR
> +      void
> +      _M_realloc_append_aux(bool __x);
> +
>        _GLIBCXX20_CONSTEXPR
>        size_type
>        _M_check_len(size_type __n, const char* __s) const
> diff --git a/libstdc++-v3/include/bits/vector.tcc 
> b/libstdc++-v3/include/bits/vector.tcc
> index 542a66173a3..f9cc78d6dcf 100644
> --- a/libstdc++-v3/include/bits/vector.tcc
> +++ b/libstdc++-v3/include/bits/vector.tcc
> @@ -1182,6 +1182,24 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>               }
>           }
>        }
> +  template<typename _Alloc>
> +    _GLIBCXX20_CONSTEXPR
> +    void
> +    vector<bool, _Alloc>::
> +    _M_realloc_insert_aux(iterator __position, bool __x)
> +    {
> +      const size_type __len =
> +       _M_check_len(size_type(1), "vector<bool>::_M_realloc_insert_aux");
> +      _Bit_pointer __q = this->_M_allocate(__len);
> +      iterator __start(std::__addressof(*__q), 0);
> +      iterator __i = _M_copy_aligned(begin(), __position, __start);
> +      *__i++ = __x;
> +      iterator __finish = std::copy(__position, end(), __i);
> +      this->_M_deallocate();
> +      this->_M_impl._M_end_of_storage = __q + _S_nword(__len);
> +      this->_M_impl._M_start = __start;
> +      this->_M_impl._M_finish = __finish;
> +    }
>
>    template<typename _Alloc>
>      _GLIBCXX20_CONSTEXPR
> @@ -1197,19 +1215,40 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
>           ++this->_M_impl._M_finish;
>         }
>        else
> +       _M_realloc_insert_aux (__position, __x);
> +    }
> +
> +  template<typename _Alloc>
> +    _GLIBCXX20_CONSTEXPR
> +    void
> +    vector<bool, _Alloc>::
> +    _M_realloc_append_aux(bool __x)
> +    {
> +      const size_type __len =
> +       _M_check_len(size_type(1), "vector<bool>::_M_realloc_append_aux");
> +      _Bit_pointer __q = this->_M_allocate(__len);
> +      iterator __start(std::__addressof(*__q), 0);
> +      iterator __i = _M_copy_aligned_trail(begin(), end(), __start);
> +      *__i++ = __x;
> +      this->_M_deallocate();
> +      this->_M_impl._M_end_of_storage = __q + _S_nword(__len);
> +      this->_M_impl._M_start = __start;
> +      this->_M_impl._M_finish = __i;
> +    }
> +
> +  template<typename _Alloc>
> +    _GLIBCXX20_CONSTEXPR
> +    void
> +    vector<bool, _Alloc>::
> +    _M_append_aux(bool __x)
> +    {
> +      if (this->_M_impl._M_finish._M_p != this->_M_impl._M_end_addr())
>         {
> -         const size_type __len =
> -           _M_check_len(size_type(1), "vector<bool>::_M_insert_aux");
> -         _Bit_pointer __q = this->_M_allocate(__len);
> -         iterator __start(std::__addressof(*__q), 0);
> -         iterator __i = _M_copy_aligned(begin(), __position, __start);
> -         *__i++ = __x;
> -         iterator __finish = std::copy(__position, end(), __i);
> -         this->_M_deallocate();
> -         this->_M_impl._M_end_of_storage = __q + _S_nword(__len);
> -         this->_M_impl._M_start = __start;
> -         this->_M_impl._M_finish = __finish;
> +         *end() = __x;
> +         ++this->_M_impl._M_finish;
>         }
> +      else
> +       _M_realloc_append_aux (__x);
>      }
>
>    template<typename _Alloc>
>

Reply via email to