Hi, std::vector has independent implementation for bool which has its won size/capacity functions. I updated them to add __builtin_unreachable to announce that size is never more than max_size. However while testing the code I noticed that even construction of unused copy is not optimized out. Main problem is that the vector copying loops copies the tail of vector using loop that copies bit by bit. We eventually pattern match it to bit operations (that surprises me) but we need to unroll it and run through loop optimization that happens late.
This patch also updates copy_aglined to use bit operations. However for = operation we can do better since it is not necessary to preserve original bits (those are undefined anyway). So I added copy_aglined_trail (better name would be welcome) that does not care about trailing bits of the copied block. As a result the following functions are optimized to empty functions: #include <vector> bool empty(std::vector<bool> src) { std::vector <bool> data=src; return false; } bool empty2(std::vector<bool> src) { std::vector <bool> data; data.reserve(1); return data.size (); } bool empty3() { std::vector <bool> data; data.push_back (true); return data[0]; } Finally I mirrored changes to push_back from normal vectors to bit vectors. This involve separating append from insert and breaking out the resizing (and cold) path to separate function. Here are two little benchmarks on push_back: #include <vector> __attribute__ ((noipa)) std::vector<bool> test() { std::vector <bool> t; t.push_back (1); t.push_back (2); return t; } int main() { for (int i = 0; i < 100000000; i++) { test(); } return 0; } runtime(s) .text size of a.out gcc -O2 1.04 1606 gcc -O2 + patch 0.98 1315 gcc -O3 0.98 1249 gcc -O3 + patch 0.96 1138 gcc -O3 + patch --param max-inline-insns-auto=5000 0.96 1138 clang -O2 1.56 1823 clang -O3 1.56 1839 clang -O2 + libc++ 2.31 4272 clang -O3 + libc++ 2.76 4262 push_back is still too large to inline fully at -O3. If parameters are bumped up for small vectors this makes it possible to propagate bit positions. This variant of benchmark #include <vector> __attribute__ ((noipa)) std::vector<bool> test() { std::vector <bool> t; t.push_back (1); t.push_back (2); return t; } int main() { for (int i = 0; i < 100000000; i++) { test(); } return 0; } runtime(s) .text size of a.out gcc -O2 1.48 1574 gcc -O2 + patch 1.30 1177 gcc -O3 1.46 1515 gcc -O3 + patch 1.27 1069 gcc -O3 + patch --param max-inline-insns-auto=5000 1.10 388 clang -O2 1.48 1823 clang -O3 1.46 1711 clang -O2 + libc++ 1.20 1001 clang -O3 + libc++ 1.17 1001 Note that clang does not suport noipa attribute, so it has some advantage here. Bootstrapped/regtested x86_64-linux, OK? libstdc++-v3/ChangeLog: * include/bits/stl_bvector.h (vector<bool, _Alloc>::operator=): Use _M_copy_aligned_trail. (vector<bool, _Alloc>::copy_aglined_trail): New function. (vector<bool, _Alloc>::copy_aglined): Implement copying of tail manually. (vector<bool, _Alloc>::size,vector<bool, _Alloc>::capacity): Add check that size is at most max_size. (vector<bool, _Alloc>::size,vector<bool, _Alloc>::push_back): Use _M_append_aux. (vector<bool, _Alloc>::size,vector<bool, _Alloc>::_M_append_aux): Declare. (vector<bool, _Alloc>::size,vector<bool, _Alloc>::_M_realloc_append_aux): Declare. (vector<bool, _Alloc>::size,vector<bool, _Alloc>::_M_realloc_insert_aux): Declare. * include/bits/vector.tcc (vector<bool, _Alloc>::size,vector<bool, _Alloc>::_M_append_aux): Implement. (vector<bool, _Alloc>::size,vector<bool, _Alloc>::_M_realloc_append_aux): Implement. (vector<bool, _Alloc>::size,vector<bool, _Alloc>::_M_realloc_insert_aux): Break out from ... (vector<bool, _Alloc>::size,vector<bool, _Alloc>::_M_insert_aux): ... here. gcc/testsuite/ChangeLog: * g++.dg/tree-ssa/bvector-1.C: New test. diff --git a/gcc/testsuite/g++.dg/tree-ssa/bvector-1.C b/gcc/testsuite/g++.dg/tree-ssa/bvector-1.C new file mode 100644 index 00000000000..4197ff8c4b8 --- /dev/null +++ b/gcc/testsuite/g++.dg/tree-ssa/bvector-1.C @@ -0,0 +1,28 @@ +// { dg-do compile } +// { dg-options "-O2 -fdump-tree-optimized" } +// { dg-skip-if "requires hosted libstdc++ for vector" { ! hostedlib } } +#include <vector> +bool +empty(std::vector<bool> src) +{ + std::vector <bool> data=src; + return false; +} +bool +empty2(std::vector<bool> src) +{ + std::vector <bool> data; + data.reserve(1); + return data.size (); +} +bool +empty3() +{ + std::vector <bool> data; + data.push_back (true); + return data[0]; +} +// All references to src should be optimized out, so there should be no name for it +// { dg-final { scan-tree-dump-not "src_" optimized } } +// { dg-final { scan-tree-dump-not "data_" optimized } } +// { dg-final { scan-tree-dump-not "new" optimized } } diff --git a/libstdc++-v3/include/bits/stl_bvector.h b/libstdc++-v3/include/bits/stl_bvector.h index 341eee33b21..ce1c5121b38 100644 --- a/libstdc++-v3/include/bits/stl_bvector.h +++ b/libstdc++-v3/include/bits/stl_bvector.h @@ -946,8 +946,8 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER this->_M_deallocate(); _M_initialize(__x.size()); } - this->_M_impl._M_finish = _M_copy_aligned(__x.begin(), __x.end(), - begin()); + this->_M_impl._M_finish = _M_copy_aligned_trail(__x.begin(), __x.end(), + begin()); return *this; } @@ -971,8 +971,8 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER this->_M_deallocate(); _M_initialize(__x.size()); } - this->_M_impl._M_finish = _M_copy_aligned(__x.begin(), __x.end(), - begin()); + this->_M_impl._M_finish = _M_copy_aligned_trail(__x.begin(), __x.end(), + begin()); __x.clear(); } return *this; @@ -1101,7 +1101,12 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR size_type size() const _GLIBCXX_NOEXCEPT - { return size_type(end() - begin()); } + { + const size_type __sz = size_type (end() - begin()); + if (__sz > max_size ()) + __builtin_unreachable (); + return __sz; + } _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR size_type @@ -1119,8 +1124,12 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR size_type capacity() const _GLIBCXX_NOEXCEPT - { return size_type(const_iterator(this->_M_impl._M_end_addr(), 0) - - begin()); } + { + const size_type __sz = size_type(const_iterator(this->_M_impl._M_end_addr(), 0) - begin()); + if (__sz > max_size ()) + __builtin_unreachable (); + return __sz; + } _GLIBCXX_NODISCARD _GLIBCXX20_CONSTEXPR bool @@ -1221,7 +1230,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER if (this->_M_impl._M_finish._M_p != this->_M_impl._M_end_addr()) *this->_M_impl._M_finish++ = __x; else - _M_insert_aux(end(), __x); + _M_append_aux(__x); } _GLIBCXX20_CONSTEXPR @@ -1484,8 +1493,19 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER iterator __result) { _Bit_type* __q = std::copy(__first._M_p, __last._M_p, __result._M_p); - return std::copy(const_iterator(__last._M_p, 0), __last, - iterator(__q, 0)); + if (!__last._M_offset) + return iterator(__q, 0); + _Bit_type __mask = ~0ul << __last._M_offset; + *__q = (*__q & __mask) | (*__last._M_p & ~__mask); + return iterator(__q, __last._M_offset); + } + _GLIBCXX20_CONSTEXPR + iterator + _M_copy_aligned_trail(const_iterator __first, const_iterator __last, + iterator __result) + { + _Bit_type* __q = std::copy(__first._M_p, __last._M_p + (__last._M_offset != 0), __result._M_p); + return iterator(__q - (__last._M_offset != 0), __last._M_offset); } _GLIBCXX20_CONSTEXPR @@ -1669,6 +1689,18 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER void _M_insert_aux(iterator __position, bool __x); + _GLIBCXX20_CONSTEXPR + void + _M_realloc_insert_aux(iterator __position, bool __x); + + _GLIBCXX20_CONSTEXPR + void + _M_append_aux(bool __x); + + _GLIBCXX20_CONSTEXPR + void + _M_realloc_append_aux(bool __x); + _GLIBCXX20_CONSTEXPR size_type _M_check_len(size_type __n, const char* __s) const diff --git a/libstdc++-v3/include/bits/vector.tcc b/libstdc++-v3/include/bits/vector.tcc index 542a66173a3..f9cc78d6dcf 100644 --- a/libstdc++-v3/include/bits/vector.tcc +++ b/libstdc++-v3/include/bits/vector.tcc @@ -1182,6 +1182,24 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER } } } + template<typename _Alloc> + _GLIBCXX20_CONSTEXPR + void + vector<bool, _Alloc>:: + _M_realloc_insert_aux(iterator __position, bool __x) + { + const size_type __len = + _M_check_len(size_type(1), "vector<bool>::_M_realloc_insert_aux"); + _Bit_pointer __q = this->_M_allocate(__len); + iterator __start(std::__addressof(*__q), 0); + iterator __i = _M_copy_aligned(begin(), __position, __start); + *__i++ = __x; + iterator __finish = std::copy(__position, end(), __i); + this->_M_deallocate(); + this->_M_impl._M_end_of_storage = __q + _S_nword(__len); + this->_M_impl._M_start = __start; + this->_M_impl._M_finish = __finish; + } template<typename _Alloc> _GLIBCXX20_CONSTEXPR @@ -1197,19 +1215,40 @@ _GLIBCXX_BEGIN_NAMESPACE_CONTAINER ++this->_M_impl._M_finish; } else + _M_realloc_insert_aux (__position, __x); + } + + template<typename _Alloc> + _GLIBCXX20_CONSTEXPR + void + vector<bool, _Alloc>:: + _M_realloc_append_aux(bool __x) + { + const size_type __len = + _M_check_len(size_type(1), "vector<bool>::_M_realloc_append_aux"); + _Bit_pointer __q = this->_M_allocate(__len); + iterator __start(std::__addressof(*__q), 0); + iterator __i = _M_copy_aligned_trail(begin(), end(), __start); + *__i++ = __x; + this->_M_deallocate(); + this->_M_impl._M_end_of_storage = __q + _S_nword(__len); + this->_M_impl._M_start = __start; + this->_M_impl._M_finish = __i; + } + + template<typename _Alloc> + _GLIBCXX20_CONSTEXPR + void + vector<bool, _Alloc>:: + _M_append_aux(bool __x) + { + if (this->_M_impl._M_finish._M_p != this->_M_impl._M_end_addr()) { - const size_type __len = - _M_check_len(size_type(1), "vector<bool>::_M_insert_aux"); - _Bit_pointer __q = this->_M_allocate(__len); - iterator __start(std::__addressof(*__q), 0); - iterator __i = _M_copy_aligned(begin(), __position, __start); - *__i++ = __x; - iterator __finish = std::copy(__position, end(), __i); - this->_M_deallocate(); - this->_M_impl._M_end_of_storage = __q + _S_nword(__len); - this->_M_impl._M_start = __start; - this->_M_impl._M_finish = __finish; + *end() = __x; + ++this->_M_impl._M_finish; } + else + _M_realloc_append_aux (__x); } template<typename _Alloc>