On Sun, 8 Dec 2024 at 15:36, Jan Hubicka <hubi...@ucw.cz> wrote: > > Hi, > std::vector has independent implementation for bool which has its won > size/capacity functions. I updated them to add __builtin_unreachable to > announce that size is never more than max_size. However while testing the > code > I noticed that even construction of unused copy is not optimized out. Main > problem is that the vector copying loops copies the tail of vector using loop > that copies bit by bit. We eventually pattern match it to bit operations > (that > surprises me) but we need to unroll it and run through loop optimization that > happens late. > > This patch also updates copy_aglined to use bit operations. However for = > operation > we can do better since it is not necessary to preserve original bits (those > are > undefined anyway). So I added copy_aglined_trail (better name would be > welcome) > that does not care about trailing bits of the copied block. > > As a result the following functions are optimized to empty functions: > #include <vector> > bool > empty(std::vector<bool> src) > { > std::vector <bool> data=src; > return false; > } > bool > empty2(std::vector<bool> src) > { > std::vector <bool> data; > data.reserve(1); > return data.size (); > } > bool > empty3() > { > std::vector <bool> data; > data.push_back (true); > return data[0]; > } > > Finally I mirrored changes to push_back from normal vectors to bit vectors. > This involve separating append from insert and breaking out the resizing (and > cold) path to separate function. > > Here are two little benchmarks on push_back: > > #include <vector> > > __attribute__ ((noipa)) > std::vector<bool> > test() > { > std::vector <bool> t; > t.push_back (1); > t.push_back (2); > return t; > } > int > main() > { > for (int i = 0; i < 100000000; i++) > { > test(); > } > return 0; > } > > runtime(s) .text size of > a.out > gcc -O2 1.04 1606 > gcc -O2 + patch 0.98 1315 > gcc -O3 0.98 1249 > gcc -O3 + patch 0.96 1138 > gcc -O3 + patch --param max-inline-insns-auto=5000 0.96 1138 > clang -O2 1.56 1823 > clang -O3 1.56 1839 > clang -O2 + libc++ 2.31 4272 > clang -O3 + libc++ 2.76 4262 > > push_back is still too large to inline fully at -O3. If parameters are > bumped up > for small vectors this makes it possible to propagate bit positions. > This variant of benchmark > > #include <vector> > > __attribute__ ((noipa)) > std::vector<bool> > test() > { > std::vector <bool> t; > t.push_back (1); > t.push_back (2); > return t; > } > int > main() > { > for (int i = 0; i < 100000000; i++) > { > test(); > } > return 0; > } > > > runtime(s) .text size of > a.out > gcc -O2 1.48 1574 > gcc -O2 + patch 1.30 1177 > gcc -O3 1.46 1515 > gcc -O3 + patch 1.27 1069 > gcc -O3 + patch --param max-inline-insns-auto=5000 1.10 388 > clang -O2 1.48 1823 > clang -O3 1.46 1711 > clang -O2 + libc++ 1.20 1001 > clang -O3 + libc++ 1.17 1001 > > Note that clang does not suport noipa attribute, so it has some advantage > here. > Bootstrapped/regtested x86_64-linux, OK? > > libstdc++-v3/ChangeLog: > > * include/bits/stl_bvector.h (vector<bool, _Alloc>::operator=): Use > _M_copy_aligned_trail. > (vector<bool, _Alloc>::copy_aglined_trail): New function.
Spelling > (vector<bool, _Alloc>::copy_aglined): Implement copying of tail > manually. Spelling > (vector<bool, _Alloc>::size,vector<bool, _Alloc>::capacity): Add check > that size is at most max_size. > (vector<bool, _Alloc>::size,vector<bool, _Alloc>::push_back): Use > _M_append_aux. > (vector<bool, _Alloc>::size,vector<bool, _Alloc>::_M_append_aux): > Declare. > (vector<bool, _Alloc>::size,vector<bool, > _Alloc>::_M_realloc_append_aux): Declare. > (vector<bool, _Alloc>::size,vector<bool, > _Alloc>::_M_realloc_insert_aux): Declare. > * include/bits/vector.tcc > (vector<bool, _Alloc>::size,vector<bool, _Alloc>::_M_append_aux): > Implement. > (vector<bool, _Alloc>::size,vector<bool, > _Alloc>::_M_realloc_append_aux): Implement. > (vector<bool, _Alloc>::size,vector<bool, > _Alloc>::_M_realloc_insert_aux): Break out from ... > (vector<bool, _Alloc>::size,vector<bool, _Alloc>::_M_insert_aux): ... > here. > > gcc/testsuite/ChangeLog: > > * g++.dg/tree-ssa/bvector-1.C: New test. > > diff --git a/gcc/testsuite/g++.dg/tree-ssa/bvector-1.C > b/gcc/testsuite/g++.dg/tree-ssa/bvector-1.C > new file mode 100644 > index 00000000000..4197ff8c4b8 > --- /dev/null > +++ b/gcc/testsuite/g++.dg/tree-ssa/bvector-1.C > @@ -0,0 +1,28 @@ > +// { dg-do compile } > +// { dg-options "-O2 -fdump-tree-optimized" } > +// { dg-skip-if "requires hosted libstdc++ for vector" { ! hostedlib } } > +#include <vector> > +bool > +empty(std::vector<bool> src) > +{ > + std::vector <bool> data=src; > + return false; > +} > +bool > +empty2(std::vector<bool> src) > +{ > + std::vector <bool> data; > + data.reserve(1); > + return data.size (); > +} > +bool > +empty3() > +{ > + std::vector <bool> data; > + data.push_back (true); > + return data[0]; > +} > +// All references to src should be optimized out, so there should be no name > for it > +// { dg-final { scan-tree-dump-not "src_" optimized } } > +// { dg-final { scan-tree-dump-not "data_" optimized } } > +// { dg-final { scan-tree-dump-not "new" optimized } } > diff --git a/libstdc++-v3/include/bits/stl_bvector.h > b/libstdc++-v3/include/bits/stl_bvector.h > index 341eee33b21..ce1c5121b38 100644 > --- a/libstdc++-v3/include/bits/stl_bvector.h > +++ b/libstdc++-v3/include/bits/stl_bvector.h > @@ -946,8 +946,8 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER > this->_M_deallocate(); > _M_initialize(__x.size()); > } > - this->_M_impl._M_finish = _M_copy_aligned(__x.begin(), __x.end(), > - begin()); > + this->_M_impl._M_finish = _M_copy_aligned_trail(__x.begin(), > __x.end(), > + begin()); > return *this; > } > > @@ -971,8 +971,8 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER > this->_M_deallocate(); > _M_initialize(__x.size()); > } > - this->_M_impl._M_finish = _M_copy_aligned(__x.begin(), __x.end(), > - begin()); > + this->_M_impl._M_finish = _M_copy_aligned_trail(__x.begin(), > __x.end(), > + begin()); > __x.clear(); > } > return *this; > @@ -1101,7 +1101,12 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER > _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR > size_type > size() const _GLIBCXX_NOEXCEPT > - { return size_type(end() - begin()); } > + { > + const size_type __sz = size_type (end() - begin()); > + if (__sz > max_size ()) > + __builtin_unreachable (); > + return __sz; > + } > > _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR > size_type > @@ -1119,8 +1124,12 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER > _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR > size_type > capacity() const _GLIBCXX_NOEXCEPT > - { return size_type(const_iterator(this->_M_impl._M_end_addr(), 0) > - - begin()); } > + { > + const size_type __sz = > size_type(const_iterator(this->_M_impl._M_end_addr(), 0) - begin()); > + if (__sz > max_size ()) > + __builtin_unreachable (); > + return __sz; > + } > > _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR > bool > @@ -1221,7 +1230,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER > if (this->_M_impl._M_finish._M_p != this->_M_impl._M_end_addr()) > *this->_M_impl._M_finish++ = __x; > else > - _M_insert_aux(end(), __x); > + _M_append_aux(__x); > } > > _GLIBCXX20_CONSTEXPR > @@ -1484,8 +1493,19 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER > iterator __result) > { > _Bit_type* __q = std::copy(__first._M_p, __last._M_p, __result._M_p); > - return std::copy(const_iterator(__last._M_p, 0), __last, > - iterator(__q, 0)); > + if (!__last._M_offset) > + return iterator(__q, 0); > + _Bit_type __mask = ~0ul << __last._M_offset; > + *__q = (*__q & __mask) | (*__last._M_p & ~__mask); > + return iterator(__q, __last._M_offset); > + } Newline between functions please. This has the same precondition as _M_copy_aligned, right? It should repeat the comment above _M_copy_aligned. It would also be nice to have a comment saying what it does, and I agree a better name would help. Maybe "Copies [first,last2) to result, where last2 is the first word boundary not less than last" or something like that? How about _M_copy_aligned_ceil ? Since it can be considered to do a "ceil" operation on the _M_last iterator, copying past its offset when non-zero? > + _GLIBCXX20_CONSTEXPR > + iterator > + _M_copy_aligned_trail(const_iterator __first, const_iterator __last, > + iterator __result) > + { > + _Bit_type* __q = std::copy(__first._M_p, __last._M_p + > (__last._M_offset != 0), __result._M_p); > + return iterator(__q - (__last._M_offset != 0), __last._M_offset); > } > > _GLIBCXX20_CONSTEXPR > @@ -1669,6 +1689,18 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER > void > _M_insert_aux(iterator __position, bool __x); > > + _GLIBCXX20_CONSTEXPR > + void > + _M_realloc_insert_aux(iterator __position, bool __x); > + > + _GLIBCXX20_CONSTEXPR > + void > + _M_append_aux(bool __x); > + > + _GLIBCXX20_CONSTEXPR > + void > + _M_realloc_append_aux(bool __x); > + > _GLIBCXX20_CONSTEXPR > size_type > _M_check_len(size_type __n, const char* __s) const > diff --git a/libstdc++-v3/include/bits/vector.tcc > b/libstdc++-v3/include/bits/vector.tcc > index 542a66173a3..f9cc78d6dcf 100644 > --- a/libstdc++-v3/include/bits/vector.tcc > +++ b/libstdc++-v3/include/bits/vector.tcc > @@ -1182,6 +1182,24 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER > } > } > } > + template<typename _Alloc> > + _GLIBCXX20_CONSTEXPR > + void > + vector<bool, _Alloc>:: > + _M_realloc_insert_aux(iterator __position, bool __x) > + { > + const size_type __len = > + _M_check_len(size_type(1), "vector<bool>::_M_realloc_insert_aux"); > + _Bit_pointer __q = this->_M_allocate(__len); > + iterator __start(std::__addressof(*__q), 0); > + iterator __i = _M_copy_aligned(begin(), __position, __start); > + *__i++ = __x; > + iterator __finish = std::copy(__position, end(), __i); > + this->_M_deallocate(); > + this->_M_impl._M_end_of_storage = __q + _S_nword(__len); > + this->_M_impl._M_start = __start; > + this->_M_impl._M_finish = __finish; > + } > > template<typename _Alloc> > _GLIBCXX20_CONSTEXPR > @@ -1197,19 +1215,40 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER > ++this->_M_impl._M_finish; > } > else > + _M_realloc_insert_aux (__position, __x); > + } > + > + template<typename _Alloc> > + _GLIBCXX20_CONSTEXPR > + void > + vector<bool, _Alloc>:: > + _M_realloc_append_aux(bool __x) > + { > + const size_type __len = > + _M_check_len(size_type(1), "vector<bool>::_M_realloc_append_aux"); > + _Bit_pointer __q = this->_M_allocate(__len); > + iterator __start(std::__addressof(*__q), 0); > + iterator __i = _M_copy_aligned_trail(begin(), end(), __start); > + *__i++ = __x; > + this->_M_deallocate(); > + this->_M_impl._M_end_of_storage = __q + _S_nword(__len); > + this->_M_impl._M_start = __start; > + this->_M_impl._M_finish = __i; > + } > + > + template<typename _Alloc> > + _GLIBCXX20_CONSTEXPR > + void > + vector<bool, _Alloc>:: > + _M_append_aux(bool __x) > + { > + if (this->_M_impl._M_finish._M_p != this->_M_impl._M_end_addr()) > { > - const size_type __len = > - _M_check_len(size_type(1), "vector<bool>::_M_insert_aux"); > - _Bit_pointer __q = this->_M_allocate(__len); > - iterator __start(std::__addressof(*__q), 0); > - iterator __i = _M_copy_aligned(begin(), __position, __start); > - *__i++ = __x; > - iterator __finish = std::copy(__position, end(), __i); > - this->_M_deallocate(); > - this->_M_impl._M_end_of_storage = __q + _S_nword(__len); > - this->_M_impl._M_start = __start; > - this->_M_impl._M_finish = __finish; > + *end() = __x; > + ++this->_M_impl._M_finish; > } > + else > + _M_realloc_append_aux (__x); > } > > template<typename _Alloc> >