On Fri, 10 Jan 2025 at 14:51, Jonathan Wakely <jwak...@redhat.com> wrote:
>
> On Sun, 8 Dec 2024 at 15:36, Jan Hubicka <hubi...@ucw.cz> wrote:
> >
> > Hi,
> > std::vector has independent implementation for bool which has its won
> > size/capacity functions.  I updated them to add __builtin_unreachable to
> > announce that size is never more than max_size.  However while testing the 
> > code
> > I noticed that even construction of unused copy is not optimized out.  Main
> > problem is that the vector copying loops copies the tail of vector using 
> > loop
> > that copies bit by bit.  We eventually pattern match it to bit operations 
> > (that
> > surprises me) but we need to unroll it and run through loop optimization 
> > that
> > happens late.
> >
> > This patch also updates copy_aglined to use bit operations. However for = 
> > operation
> > we can do better since it is not necessary to preserve original bits (those 
> > are
> > undefined anyway). So I added copy_aglined_trail (better name would be 
> > welcome)
> > that does not care about trailing bits of the copied block.
> >
> > As a result the following functions are optimized to empty functions:
> > #include <vector>
> > bool
> > empty(std::vector<bool> src)
> > {
> >         std::vector <bool> data=src;
> >         return false;
> > }
> > bool
> > empty2(std::vector<bool> src)
> > {
> >         std::vector <bool> data;
> >         data.reserve(1);
> >         return data.size ();
> > }
> > bool
> > empty3()
> > {
> >         std::vector <bool> data;
> >         data.push_back (true);
> >         return data[0];
> > }
> >
> > Finally I mirrored changes to push_back from normal vectors to bit vectors.
> > This involve separating append from insert and breaking out the resizing 
> > (and
> > cold) path to separate function.
> >
> > Here are two little benchmarks on push_back:
> >
> > #include <vector>
> >
> > __attribute__ ((noipa))
> > std::vector<bool>
> > test()
> > {
> >         std::vector <bool> t;
> >         t.push_back (1);
> >         t.push_back (2);
> >         return t;
> > }
> > int
> > main()
> > {
> >         for (int i = 0; i < 100000000; i++)
> >         {
> >                 test();
> >         }
> >         return 0;
> > }
> >
> >                                                  runtime(s) .text size of 
> > a.out
> > gcc -O2                                             1.04    1606
> > gcc -O2 + patch                                     0.98    1315
> > gcc -O3                                             0.98    1249
> > gcc -O3 + patch                                     0.96    1138
> > gcc -O3 + patch --param max-inline-insns-auto=5000  0.96    1138
> > clang -O2                                           1.56    1823
> > clang -O3                                           1.56    1839
> > clang -O2 + libc++                                  2.31    4272
> > clang -O3 + libc++                                  2.76    4262
> >
> > push_back is still too large to inline fully at -O3.  If parameters are 
> > bumped up
> > for small vectors this makes it possible to propagate bit positions.
> > This variant of benchmark
> >
> > #include <vector>
> >
> > __attribute__ ((noipa))
> > std::vector<bool>
> > test()
> > {
> >         std::vector <bool> t;
> >         t.push_back (1);
> >         t.push_back (2);
> >         return t;
> > }
> > int
> > main()
> > {
> >         for (int i = 0; i < 100000000; i++)
> >         {
> >                 test();
> >         }
> >         return 0;
> > }
> >
> >
> >                                                  runtime(s) .text size of 
> > a.out
> > gcc -O2                                             1.48    1574
> > gcc -O2 + patch                                     1.30    1177
> > gcc -O3                                             1.46    1515
> > gcc -O3 + patch                                     1.27    1069
> > gcc -O3 + patch --param max-inline-insns-auto=5000  1.10    388
> > clang -O2                                           1.48    1823
> > clang -O3                                           1.46    1711
> > clang -O2 + libc++                                  1.20    1001
> > clang -O3 + libc++                                  1.17    1001
> >
> > Note that clang does not suport noipa attribute, so it has some advantage 
> > here.
> > Bootstrapped/regtested x86_64-linux, OK?
> >
> > libstdc++-v3/ChangeLog:
> >
> >         * include/bits/stl_bvector.h (vector<bool, _Alloc>::operator=): Use
> >         _M_copy_aligned_trail.
> >         (vector<bool, _Alloc>::copy_aglined_trail): New function.
>
> Spelling
>
> >         (vector<bool, _Alloc>::copy_aglined): Implement copying of tail 
> > manually.
>
> Spelling
>
> >         (vector<bool, _Alloc>::size,vector<bool, _Alloc>::capacity): Add 
> > check
> >         that size is at most max_size.
> >         (vector<bool, _Alloc>::size,vector<bool, _Alloc>::push_back): Use
> >         _M_append_aux.
> >         (vector<bool, _Alloc>::size,vector<bool, _Alloc>::_M_append_aux): 
> > Declare.
> >         (vector<bool, _Alloc>::size,vector<bool, 
> > _Alloc>::_M_realloc_append_aux): Declare.
> >         (vector<bool, _Alloc>::size,vector<bool, 
> > _Alloc>::_M_realloc_insert_aux): Declare.
> >         * include/bits/vector.tcc
> >         (vector<bool, _Alloc>::size,vector<bool, _Alloc>::_M_append_aux): 
> > Implement.
> >         (vector<bool, _Alloc>::size,vector<bool, 
> > _Alloc>::_M_realloc_append_aux): Implement.
> >         (vector<bool, _Alloc>::size,vector<bool, 
> > _Alloc>::_M_realloc_insert_aux): Break out from ...
> >         (vector<bool, _Alloc>::size,vector<bool, _Alloc>::_M_insert_aux): 
> > ... here.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * g++.dg/tree-ssa/bvector-1.C: New test.
> >
> > diff --git a/gcc/testsuite/g++.dg/tree-ssa/bvector-1.C 
> > b/gcc/testsuite/g++.dg/tree-ssa/bvector-1.C
> > new file mode 100644
> > index 00000000000..4197ff8c4b8
> > --- /dev/null
> > +++ b/gcc/testsuite/g++.dg/tree-ssa/bvector-1.C
> > @@ -0,0 +1,28 @@
> > +// { dg-do compile }
> > +// { dg-options "-O2 -fdump-tree-optimized"  }
> > +// { dg-skip-if "requires hosted libstdc++ for vector" { ! hostedlib } }
> > +#include <vector>
> > +bool
> > +empty(std::vector<bool> src)
> > +{
> > +       std::vector <bool> data=src;
> > +       return false;
> > +}
> > +bool
> > +empty2(std::vector<bool> src)
> > +{
> > +       std::vector <bool> data;
> > +       data.reserve(1);
> > +       return data.size ();
> > +}
> > +bool
> > +empty3()
> > +{
> > +       std::vector <bool> data;
> > +       data.push_back (true);
> > +       return data[0];
> > +}
> > +// All references to src should be optimized out, so there should be no 
> > name for it
> > +// { dg-final { scan-tree-dump-not "src_"  optimized } }
> > +// { dg-final { scan-tree-dump-not "data_"  optimized } }
> > +// { dg-final { scan-tree-dump-not "new"  optimized } }
> > diff --git a/libstdc++-v3/include/bits/stl_bvector.h 
> > b/libstdc++-v3/include/bits/stl_bvector.h
> > index 341eee33b21..ce1c5121b38 100644
> > --- a/libstdc++-v3/include/bits/stl_bvector.h
> > +++ b/libstdc++-v3/include/bits/stl_bvector.h
> > @@ -946,8 +946,8 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
> >             this->_M_deallocate();
> >             _M_initialize(__x.size());
> >           }
> > -       this->_M_impl._M_finish = _M_copy_aligned(__x.begin(), __x.end(),
> > -                                                 begin());
> > +       this->_M_impl._M_finish = _M_copy_aligned_trail(__x.begin(), 
> > __x.end(),
> > +                                                       begin());
> >         return *this;
> >        }
> >
> > @@ -971,8 +971,8 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
> >                 this->_M_deallocate();
> >                 _M_initialize(__x.size());
> >               }
> > -           this->_M_impl._M_finish = _M_copy_aligned(__x.begin(), 
> > __x.end(),
> > -                                                     begin());
> > +           this->_M_impl._M_finish = _M_copy_aligned_trail(__x.begin(), 
> > __x.end(),
> > +                                                             begin());
> >             __x.clear();
> >           }
> >         return *this;
> > @@ -1101,7 +1101,12 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
> >        _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
> >        size_type
> >        size() const _GLIBCXX_NOEXCEPT
> > -      { return size_type(end() - begin()); }
> > +      {
> > +       const size_type __sz = size_type (end() - begin());
> > +       if (__sz > max_size ())
> > +          __builtin_unreachable ();
> > +       return __sz;
> > +      }
> >
> >        _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
> >        size_type
> > @@ -1119,8 +1124,12 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
> >        _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
> >        size_type
> >        capacity() const _GLIBCXX_NOEXCEPT
> > -      { return size_type(const_iterator(this->_M_impl._M_end_addr(), 0)
> > -                        - begin()); }
> > +      {
> > +       const size_type __sz = 
> > size_type(const_iterator(this->_M_impl._M_end_addr(), 0) - begin());
> > +       if (__sz > max_size ())
> > +          __builtin_unreachable ();
> > +       return __sz;
> > +      }
> >
> >        _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR
> >        bool
> > @@ -1221,7 +1230,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
> >         if (this->_M_impl._M_finish._M_p != this->_M_impl._M_end_addr())
> >           *this->_M_impl._M_finish++ = __x;
> >         else
> > -         _M_insert_aux(end(), __x);
> > +         _M_append_aux(__x);
> >        }
> >
> >        _GLIBCXX20_CONSTEXPR
> > @@ -1484,8 +1493,19 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
> >                       iterator __result)
> >        {
> >         _Bit_type* __q = std::copy(__first._M_p, __last._M_p, 
> > __result._M_p);
> > -       return std::copy(const_iterator(__last._M_p, 0), __last,
> > -                        iterator(__q, 0));
> > +       if (!__last._M_offset)
> > +         return iterator(__q, 0);
> > +       _Bit_type __mask = ~0ul << __last._M_offset;
> > +       *__q = (*__q & __mask) | (*__last._M_p & ~__mask);
> > +       return iterator(__q, __last._M_offset);
> > +      }
>
> Newline between functions please.
>
> This has the same precondition as _M_copy_aligned, right? It should
> repeat the comment above _M_copy_aligned. It would also be nice to
> have a comment saying what it does, and I agree a better name would
> help.
>
> Maybe "Copies [first,last2) to result, where last2 is the first word
> boundary not less than last" or something like that?

And maybe mention that it copies "unused" capacity in [last,last2)
which are values we don't actually care about, but are safe to copy,
and doing so is faster than shifting and masking.

>
> How about _M_copy_aligned_ceil ? Since it can be considered to do a
> "ceil" operation on the _M_last iterator, copying past its offset when
> non-zero?
>
>
> > +      _GLIBCXX20_CONSTEXPR
> > +      iterator
> > +      _M_copy_aligned_trail(const_iterator __first, const_iterator __last,
> > +                           iterator __result)
> > +      {
> > +       _Bit_type* __q = std::copy(__first._M_p, __last._M_p + 
> > (__last._M_offset != 0), __result._M_p);
> > +       return iterator(__q - (__last._M_offset != 0), __last._M_offset);
> >        }
> >
> >        _GLIBCXX20_CONSTEXPR
> > @@ -1669,6 +1689,18 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
> >        void
> >        _M_insert_aux(iterator __position, bool __x);
> >
> > +      _GLIBCXX20_CONSTEXPR
> > +      void
> > +      _M_realloc_insert_aux(iterator __position, bool __x);
> > +
> > +      _GLIBCXX20_CONSTEXPR
> > +      void
> > +      _M_append_aux(bool __x);
> > +
> > +      _GLIBCXX20_CONSTEXPR
> > +      void
> > +      _M_realloc_append_aux(bool __x);
> > +
> >        _GLIBCXX20_CONSTEXPR
> >        size_type
> >        _M_check_len(size_type __n, const char* __s) const
> > diff --git a/libstdc++-v3/include/bits/vector.tcc 
> > b/libstdc++-v3/include/bits/vector.tcc
> > index 542a66173a3..f9cc78d6dcf 100644
> > --- a/libstdc++-v3/include/bits/vector.tcc
> > +++ b/libstdc++-v3/include/bits/vector.tcc
> > @@ -1182,6 +1182,24 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
> >               }
> >           }
> >        }
> > +  template<typename _Alloc>
> > +    _GLIBCXX20_CONSTEXPR
> > +    void
> > +    vector<bool, _Alloc>::
> > +    _M_realloc_insert_aux(iterator __position, bool __x)
> > +    {
> > +      const size_type __len =
> > +       _M_check_len(size_type(1), "vector<bool>::_M_realloc_insert_aux");
> > +      _Bit_pointer __q = this->_M_allocate(__len);
> > +      iterator __start(std::__addressof(*__q), 0);
> > +      iterator __i = _M_copy_aligned(begin(), __position, __start);
> > +      *__i++ = __x;
> > +      iterator __finish = std::copy(__position, end(), __i);
> > +      this->_M_deallocate();
> > +      this->_M_impl._M_end_of_storage = __q + _S_nword(__len);
> > +      this->_M_impl._M_start = __start;
> > +      this->_M_impl._M_finish = __finish;
> > +    }
> >
> >    template<typename _Alloc>
> >      _GLIBCXX20_CONSTEXPR
> > @@ -1197,19 +1215,40 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER
> >           ++this->_M_impl._M_finish;
> >         }
> >        else
> > +       _M_realloc_insert_aux (__position, __x);
> > +    }
> > +
> > +  template<typename _Alloc>
> > +    _GLIBCXX20_CONSTEXPR
> > +    void
> > +    vector<bool, _Alloc>::
> > +    _M_realloc_append_aux(bool __x)
> > +    {
> > +      const size_type __len =
> > +       _M_check_len(size_type(1), "vector<bool>::_M_realloc_append_aux");
> > +      _Bit_pointer __q = this->_M_allocate(__len);
> > +      iterator __start(std::__addressof(*__q), 0);
> > +      iterator __i = _M_copy_aligned_trail(begin(), end(), __start);
> > +      *__i++ = __x;
> > +      this->_M_deallocate();
> > +      this->_M_impl._M_end_of_storage = __q + _S_nword(__len);
> > +      this->_M_impl._M_start = __start;
> > +      this->_M_impl._M_finish = __i;
> > +    }
> > +
> > +  template<typename _Alloc>
> > +    _GLIBCXX20_CONSTEXPR
> > +    void
> > +    vector<bool, _Alloc>::
> > +    _M_append_aux(bool __x)
> > +    {
> > +      if (this->_M_impl._M_finish._M_p != this->_M_impl._M_end_addr())
> >         {
> > -         const size_type __len =
> > -           _M_check_len(size_type(1), "vector<bool>::_M_insert_aux");
> > -         _Bit_pointer __q = this->_M_allocate(__len);
> > -         iterator __start(std::__addressof(*__q), 0);
> > -         iterator __i = _M_copy_aligned(begin(), __position, __start);
> > -         *__i++ = __x;
> > -         iterator __finish = std::copy(__position, end(), __i);
> > -         this->_M_deallocate();
> > -         this->_M_impl._M_end_of_storage = __q + _S_nword(__len);
> > -         this->_M_impl._M_start = __start;
> > -         this->_M_impl._M_finish = __finish;
> > +         *end() = __x;
> > +         ++this->_M_impl._M_finish;
> >         }
> > +      else
> > +       _M_realloc_append_aux (__x);
> >      }
> >
> >    template<typename _Alloc>
> >

Reply via email to