On Fri, 10 Jan 2025 at 14:51, Jonathan Wakely <jwak...@redhat.com> wrote: > > On Sun, 8 Dec 2024 at 15:36, Jan Hubicka <hubi...@ucw.cz> wrote: > > > > Hi, > > std::vector has independent implementation for bool which has its won > > size/capacity functions. I updated them to add __builtin_unreachable to > > announce that size is never more than max_size. However while testing the > > code > > I noticed that even construction of unused copy is not optimized out. Main > > problem is that the vector copying loops copies the tail of vector using > > loop > > that copies bit by bit. We eventually pattern match it to bit operations > > (that > > surprises me) but we need to unroll it and run through loop optimization > > that > > happens late. > > > > This patch also updates copy_aglined to use bit operations. However for = > > operation > > we can do better since it is not necessary to preserve original bits (those > > are > > undefined anyway). So I added copy_aglined_trail (better name would be > > welcome) > > that does not care about trailing bits of the copied block. > > > > As a result the following functions are optimized to empty functions: > > #include <vector> > > bool > > empty(std::vector<bool> src) > > { > > std::vector <bool> data=src; > > return false; > > } > > bool > > empty2(std::vector<bool> src) > > { > > std::vector <bool> data; > > data.reserve(1); > > return data.size (); > > } > > bool > > empty3() > > { > > std::vector <bool> data; > > data.push_back (true); > > return data[0]; > > } > > > > Finally I mirrored changes to push_back from normal vectors to bit vectors. > > This involve separating append from insert and breaking out the resizing > > (and > > cold) path to separate function. > > > > Here are two little benchmarks on push_back: > > > > #include <vector> > > > > __attribute__ ((noipa)) > > std::vector<bool> > > test() > > { > > std::vector <bool> t; > > t.push_back (1); > > t.push_back (2); > > return t; > > } > > int > > main() > > { > > for (int i = 0; i < 100000000; i++) > > { > > test(); > > } > > return 0; > > } > > > > runtime(s) .text size of > > a.out > > gcc -O2 1.04 1606 > > gcc -O2 + patch 0.98 1315 > > gcc -O3 0.98 1249 > > gcc -O3 + patch 0.96 1138 > > gcc -O3 + patch --param max-inline-insns-auto=5000 0.96 1138 > > clang -O2 1.56 1823 > > clang -O3 1.56 1839 > > clang -O2 + libc++ 2.31 4272 > > clang -O3 + libc++ 2.76 4262 > > > > push_back is still too large to inline fully at -O3. If parameters are > > bumped up > > for small vectors this makes it possible to propagate bit positions. > > This variant of benchmark > > > > #include <vector> > > > > __attribute__ ((noipa)) > > std::vector<bool> > > test() > > { > > std::vector <bool> t; > > t.push_back (1); > > t.push_back (2); > > return t; > > } > > int > > main() > > { > > for (int i = 0; i < 100000000; i++) > > { > > test(); > > } > > return 0; > > } > > > > > > runtime(s) .text size of > > a.out > > gcc -O2 1.48 1574 > > gcc -O2 + patch 1.30 1177 > > gcc -O3 1.46 1515 > > gcc -O3 + patch 1.27 1069 > > gcc -O3 + patch --param max-inline-insns-auto=5000 1.10 388 > > clang -O2 1.48 1823 > > clang -O3 1.46 1711 > > clang -O2 + libc++ 1.20 1001 > > clang -O3 + libc++ 1.17 1001 > > > > Note that clang does not suport noipa attribute, so it has some advantage > > here. > > Bootstrapped/regtested x86_64-linux, OK? > > > > libstdc++-v3/ChangeLog: > > > > * include/bits/stl_bvector.h (vector<bool, _Alloc>::operator=): Use > > _M_copy_aligned_trail. > > (vector<bool, _Alloc>::copy_aglined_trail): New function. > > Spelling > > > (vector<bool, _Alloc>::copy_aglined): Implement copying of tail > > manually. > > Spelling > > > (vector<bool, _Alloc>::size,vector<bool, _Alloc>::capacity): Add > > check > > that size is at most max_size. > > (vector<bool, _Alloc>::size,vector<bool, _Alloc>::push_back): Use > > _M_append_aux. > > (vector<bool, _Alloc>::size,vector<bool, _Alloc>::_M_append_aux): > > Declare. > > (vector<bool, _Alloc>::size,vector<bool, > > _Alloc>::_M_realloc_append_aux): Declare. > > (vector<bool, _Alloc>::size,vector<bool, > > _Alloc>::_M_realloc_insert_aux): Declare. > > * include/bits/vector.tcc > > (vector<bool, _Alloc>::size,vector<bool, _Alloc>::_M_append_aux): > > Implement. > > (vector<bool, _Alloc>::size,vector<bool, > > _Alloc>::_M_realloc_append_aux): Implement. > > (vector<bool, _Alloc>::size,vector<bool, > > _Alloc>::_M_realloc_insert_aux): Break out from ... > > (vector<bool, _Alloc>::size,vector<bool, _Alloc>::_M_insert_aux): > > ... here. > > > > gcc/testsuite/ChangeLog: > > > > * g++.dg/tree-ssa/bvector-1.C: New test. > > > > diff --git a/gcc/testsuite/g++.dg/tree-ssa/bvector-1.C > > b/gcc/testsuite/g++.dg/tree-ssa/bvector-1.C > > new file mode 100644 > > index 00000000000..4197ff8c4b8 > > --- /dev/null > > +++ b/gcc/testsuite/g++.dg/tree-ssa/bvector-1.C > > @@ -0,0 +1,28 @@ > > +// { dg-do compile } > > +// { dg-options "-O2 -fdump-tree-optimized" } > > +// { dg-skip-if "requires hosted libstdc++ for vector" { ! hostedlib } } > > +#include <vector> > > +bool > > +empty(std::vector<bool> src) > > +{ > > + std::vector <bool> data=src; > > + return false; > > +} > > +bool > > +empty2(std::vector<bool> src) > > +{ > > + std::vector <bool> data; > > + data.reserve(1); > > + return data.size (); > > +} > > +bool > > +empty3() > > +{ > > + std::vector <bool> data; > > + data.push_back (true); > > + return data[0]; > > +} > > +// All references to src should be optimized out, so there should be no > > name for it > > +// { dg-final { scan-tree-dump-not "src_" optimized } } > > +// { dg-final { scan-tree-dump-not "data_" optimized } } > > +// { dg-final { scan-tree-dump-not "new" optimized } } > > diff --git a/libstdc++-v3/include/bits/stl_bvector.h > > b/libstdc++-v3/include/bits/stl_bvector.h > > index 341eee33b21..ce1c5121b38 100644 > > --- a/libstdc++-v3/include/bits/stl_bvector.h > > +++ b/libstdc++-v3/include/bits/stl_bvector.h > > @@ -946,8 +946,8 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER > > this->_M_deallocate(); > > _M_initialize(__x.size()); > > } > > - this->_M_impl._M_finish = _M_copy_aligned(__x.begin(), __x.end(), > > - begin()); > > + this->_M_impl._M_finish = _M_copy_aligned_trail(__x.begin(), > > __x.end(), > > + begin()); > > return *this; > > } > > > > @@ -971,8 +971,8 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER > > this->_M_deallocate(); > > _M_initialize(__x.size()); > > } > > - this->_M_impl._M_finish = _M_copy_aligned(__x.begin(), > > __x.end(), > > - begin()); > > + this->_M_impl._M_finish = _M_copy_aligned_trail(__x.begin(), > > __x.end(), > > + begin()); > > __x.clear(); > > } > > return *this; > > @@ -1101,7 +1101,12 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER > > _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR > > size_type > > size() const _GLIBCXX_NOEXCEPT > > - { return size_type(end() - begin()); } > > + { > > + const size_type __sz = size_type (end() - begin()); > > + if (__sz > max_size ()) > > + __builtin_unreachable (); > > + return __sz; > > + } > > > > _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR > > size_type > > @@ -1119,8 +1124,12 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER > > _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR > > size_type > > capacity() const _GLIBCXX_NOEXCEPT > > - { return size_type(const_iterator(this->_M_impl._M_end_addr(), 0) > > - - begin()); } > > + { > > + const size_type __sz = > > size_type(const_iterator(this->_M_impl._M_end_addr(), 0) - begin()); > > + if (__sz > max_size ()) > > + __builtin_unreachable (); > > + return __sz; > > + } > > > > _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR > > bool > > @@ -1221,7 +1230,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER > > if (this->_M_impl._M_finish._M_p != this->_M_impl._M_end_addr()) > > *this->_M_impl._M_finish++ = __x; > > else > > - _M_insert_aux(end(), __x); > > + _M_append_aux(__x); > > } > > > > _GLIBCXX20_CONSTEXPR > > @@ -1484,8 +1493,19 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER > > iterator __result) > > { > > _Bit_type* __q = std::copy(__first._M_p, __last._M_p, > > __result._M_p); > > - return std::copy(const_iterator(__last._M_p, 0), __last, > > - iterator(__q, 0)); > > + if (!__last._M_offset) > > + return iterator(__q, 0); > > + _Bit_type __mask = ~0ul << __last._M_offset; > > + *__q = (*__q & __mask) | (*__last._M_p & ~__mask); > > + return iterator(__q, __last._M_offset); > > + } > > Newline between functions please. > > This has the same precondition as _M_copy_aligned, right? It should > repeat the comment above _M_copy_aligned. It would also be nice to > have a comment saying what it does, and I agree a better name would > help. > > Maybe "Copies [first,last2) to result, where last2 is the first word > boundary not less than last" or something like that?
And maybe mention that it copies "unused" capacity in [last,last2) which are values we don't actually care about, but are safe to copy, and doing so is faster than shifting and masking. > > How about _M_copy_aligned_ceil ? Since it can be considered to do a > "ceil" operation on the _M_last iterator, copying past its offset when > non-zero? > > > > + _GLIBCXX20_CONSTEXPR > > + iterator > > + _M_copy_aligned_trail(const_iterator __first, const_iterator __last, > > + iterator __result) > > + { > > + _Bit_type* __q = std::copy(__first._M_p, __last._M_p + > > (__last._M_offset != 0), __result._M_p); > > + return iterator(__q - (__last._M_offset != 0), __last._M_offset); > > } > > > > _GLIBCXX20_CONSTEXPR > > @@ -1669,6 +1689,18 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER > > void > > _M_insert_aux(iterator __position, bool __x); > > > > + _GLIBCXX20_CONSTEXPR > > + void > > + _M_realloc_insert_aux(iterator __position, bool __x); > > + > > + _GLIBCXX20_CONSTEXPR > > + void > > + _M_append_aux(bool __x); > > + > > + _GLIBCXX20_CONSTEXPR > > + void > > + _M_realloc_append_aux(bool __x); > > + > > _GLIBCXX20_CONSTEXPR > > size_type > > _M_check_len(size_type __n, const char* __s) const > > diff --git a/libstdc++-v3/include/bits/vector.tcc > > b/libstdc++-v3/include/bits/vector.tcc > > index 542a66173a3..f9cc78d6dcf 100644 > > --- a/libstdc++-v3/include/bits/vector.tcc > > +++ b/libstdc++-v3/include/bits/vector.tcc > > @@ -1182,6 +1182,24 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER > > } > > } > > } > > + template<typename _Alloc> > > + _GLIBCXX20_CONSTEXPR > > + void > > + vector<bool, _Alloc>:: > > + _M_realloc_insert_aux(iterator __position, bool __x) > > + { > > + const size_type __len = > > + _M_check_len(size_type(1), "vector<bool>::_M_realloc_insert_aux"); > > + _Bit_pointer __q = this->_M_allocate(__len); > > + iterator __start(std::__addressof(*__q), 0); > > + iterator __i = _M_copy_aligned(begin(), __position, __start); > > + *__i++ = __x; > > + iterator __finish = std::copy(__position, end(), __i); > > + this->_M_deallocate(); > > + this->_M_impl._M_end_of_storage = __q + _S_nword(__len); > > + this->_M_impl._M_start = __start; > > + this->_M_impl._M_finish = __finish; > > + } > > > > template<typename _Alloc> > > _GLIBCXX20_CONSTEXPR > > @@ -1197,19 +1215,40 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER > > ++this->_M_impl._M_finish; > > } > > else > > + _M_realloc_insert_aux (__position, __x); > > + } > > + > > + template<typename _Alloc> > > + _GLIBCXX20_CONSTEXPR > > + void > > + vector<bool, _Alloc>:: > > + _M_realloc_append_aux(bool __x) > > + { > > + const size_type __len = > > + _M_check_len(size_type(1), "vector<bool>::_M_realloc_append_aux"); > > + _Bit_pointer __q = this->_M_allocate(__len); > > + iterator __start(std::__addressof(*__q), 0); > > + iterator __i = _M_copy_aligned_trail(begin(), end(), __start); > > + *__i++ = __x; > > + this->_M_deallocate(); > > + this->_M_impl._M_end_of_storage = __q + _S_nword(__len); > > + this->_M_impl._M_start = __start; > > + this->_M_impl._M_finish = __i; > > + } > > + > > + template<typename _Alloc> > > + _GLIBCXX20_CONSTEXPR > > + void > > + vector<bool, _Alloc>:: > > + _M_append_aux(bool __x) > > + { > > + if (this->_M_impl._M_finish._M_p != this->_M_impl._M_end_addr()) > > { > > - const size_type __len = > > - _M_check_len(size_type(1), "vector<bool>::_M_insert_aux"); > > - _Bit_pointer __q = this->_M_allocate(__len); > > - iterator __start(std::__addressof(*__q), 0); > > - iterator __i = _M_copy_aligned(begin(), __position, __start); > > - *__i++ = __x; > > - iterator __finish = std::copy(__position, end(), __i); > > - this->_M_deallocate(); > > - this->_M_impl._M_end_of_storage = __q + _S_nword(__len); > > - this->_M_impl._M_start = __start; > > - this->_M_impl._M_finish = __finish; > > + *end() = __x; > > + ++this->_M_impl._M_finish; > > } > > + else > > + _M_realloc_append_aux (__x); > > } > > > > template<typename _Alloc> > >