I started considering PR libstdc++/68303.
First thing was to write a dedicated performance test case, it is the
unordered_small_size.cc I'd like to add with this patch.
The first runs show a major difference between tr1 and std
implementations, tr1 being much better:
std::tr1::unordered_set<int> without hash code cached: 1st insert
9r 9u 1s 14725920mem 0pf
std::tr1::unordered_set<int> with hash code cached: 1st insert
8r 9u 0s 14719680mem 0pf
std::unordered_set<int> without hash code cached: 1st insert 17r
17u 0s 16640080mem 0pf
std::unordered_set<int> with hash code cached: 1st insert 14r
14u 0s 16638656mem 0pf
I had a look in gdb to find out why and the answer was quite obvious.
For 20 insertions tr1 implementation bucket count goes through [11, 23]
whereas for std it is [2, 5, 11, 23], so 2 more expensive rehash.
As unordered containers are dedicated to rather important number of
elements I propose to review the rehash policy with this patch so that
std also starts at 11 on the 1st insertion. After the patch figures are:
std::tr1::unordered_set<int> without hash code cached: 1st insert
9r 9u 0s 14725920mem 0pf
std::tr1::unordered_set<int> with hash code cached: 1st insert
8r 8u 0s 14719680mem 0pf
std::unordered_set<int> without hash code cached: 1st insert 15r
15u 0s 16640128mem 0pf
std::unordered_set<int> with hash code cached: 1st insert 12r
12u 0s 16638688mem 0pf
Moreover I noticed that performance tests are built with -O2, is that
intentional ? The std implementation uses more abstractions than tr1,
looks like building with -O3 optimizes away most of those abstractions
making tr1 and std implementation much closer:
std::tr1::unordered_set<int> without hash code cached: 1st insert
2r 1u 1s 14725952mem 0pf
std::tr1::unordered_set<int> with hash code cached: 1st insert
2r 1u 0s 14719536mem 0pf
std::unordered_set<int> without hash code cached: 1st insert 2r
2u 0s 16640064mem 0pf
std::unordered_set<int> with hash code cached: 1st insert 2r
2u 0s 16638608mem 0pf
Note that this patch also rework the alternative rehash policy based on
powers of 2 so that it also starts with a larger number of bucket (16)
and respects LWG2156.
Last I had to wider the memory column so that alignment is preserved
even when memory diff is negative.
Tested under Linux x86_64.
Ok to commit ?
François
diff --git a/libstdc++-v3/include/bits/hashtable_policy.h b/libstdc++-v3/include/bits/hashtable_policy.h
index 66fbfbe5f21..70d3c5c8194 100644
--- a/libstdc++-v3/include/bits/hashtable_policy.h
+++ b/libstdc++-v3/include/bits/hashtable_policy.h
@@ -536,6 +536,13 @@ namespace __detail
std::size_t
_M_next_bkt(std::size_t __n) noexcept
{
+ if (__n < 0x11)
+ {
+ _M_next_resize =
+ __builtin_floor(0x10 * (long double)_M_max_load_factor);
+ return 0x10;
+ }
+
const auto __max_width = std::min<size_t>(sizeof(size_t), 8);
const auto __max_bkt = size_t(1) << (__max_width * __CHAR_BIT__ - 1);
std::size_t __res = __clp2(__n);
@@ -553,7 +560,7 @@ namespace __detail
_M_next_resize = std::size_t(-1);
else
_M_next_resize
- = __builtin_ceil(__res * (long double)_M_max_load_factor);
+ = __builtin_floor(__res * (long double)_M_max_load_factor);
return __res;
}
@@ -571,7 +578,7 @@ namespace __detail
_M_need_rehash(std::size_t __n_bkt, std::size_t __n_elt,
std::size_t __n_ins) noexcept
{
- if (__n_elt + __n_ins >= _M_next_resize)
+ if (__n_elt + __n_ins > _M_next_resize)
{
long double __min_bkts = (__n_elt + __n_ins)
/ (long double)_M_max_load_factor;
diff --git a/libstdc++-v3/src/c++11/hashtable_c++0x.cc b/libstdc++-v3/src/c++11/hashtable_c++0x.cc
index b9b11ff4385..6df9775b954 100644
--- a/libstdc++-v3/src/c++11/hashtable_c++0x.cc
+++ b/libstdc++-v3/src/c++11/hashtable_c++0x.cc
@@ -46,14 +46,11 @@ namespace __detail
{
// Optimize lookups involving the first elements of __prime_list.
// (useful to speed-up, eg, constructors)
- static const unsigned char __fast_bkt[]
- = { 2, 2, 2, 3, 5, 5, 7, 7, 11, 11, 11, 11, 13, 13 };
-
- if (__n < sizeof(__fast_bkt))
+ if (__n < 12)
{
_M_next_resize =
- __builtin_floor(__fast_bkt[__n] * (long double)_M_max_load_factor);
- return __fast_bkt[__n];
+ __builtin_floor(11 * (long double)_M_max_load_factor);
+ return 11;
}
// Number of primes (without sentinel).
@@ -66,7 +63,7 @@ namespace __detail
constexpr auto __last_prime = __prime_list + __n_primes - 1;
const unsigned long* __next_bkt =
- std::lower_bound(__prime_list + 6, __last_prime, __n);
+ std::lower_bound(__prime_list + 5, __last_prime, __n);
if (__next_bkt == __last_prime)
// Set next resize to the max value so that we never try to rehash again
diff --git a/libstdc++-v3/testsuite/performance/23_containers/insert_erase/unordered_small_size.cc b/libstdc++-v3/testsuite/performance/23_containers/insert_erase/unordered_small_size.cc
new file mode 100644
index 00000000000..a2f44c07082
--- /dev/null
+++ b/libstdc++-v3/testsuite/performance/23_containers/insert_erase/unordered_small_size.cc
@@ -0,0 +1,269 @@
+// { dg-do run { target c++11 } }
+
+// Copyright (C) 2018 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+#include <sstream>
+#include <vector>
+#include <tr1/unordered_set>
+#include <unordered_set>
+#include <testsuite_performance.h>
+namespace
+{
+ const int nb = 20;
+ const int nb_insts = 10000;
+
+ // Bench using an unordered_set<int>. Hash functor for int is quite
+ // predictable so it helps bench very specific use cases.
+ template<typename _ContType>
+ void bench(const char* desc)
+ {
+ using namespace __gnu_test;
+
+ time_counter time;
+ resource_counter resource;
+
+ std::vector<_ContType> insts(nb_insts);
+ for (int j = 0; j != nb_insts; ++j)
+ insts.emplace_back();
+
+ start_counters(time, resource);
+
+ for (auto& us : insts)
+ for (int i = 0; i != nb; ++i)
+ us.insert(i);
+
+ stop_counters(time, resource);
+
+ std::ostringstream ostr;
+ ostr << desc << " 1st insert";
+ report_performance(__FILE__, ostr.str().c_str(), time, resource);
+
+ start_counters(time, resource);
+
+ // Here is the worst erase use case when hashtable implementation was
+ // something like vector<forward_list<>>. Erasing from the end was very
+ // costly because we need to return the iterator following the erased
+ // one, as the hashtable is getting emptier at each step there are
+ // more and more empty bucket to loop through to reach the end of the
+ // container and find out that it was in fact the last element.
+ for (auto& us : insts)
+ for (int i = nb - 1; i >= 0; --i)
+ {
+ auto it = us.find(i);
+ if (it != us.end())
+ us.erase(it);
+ }
+
+ stop_counters(time, resource);
+
+ ostr.str("");
+ ostr << desc << " erase it ";
+ report_performance(__FILE__, ostr.str().c_str(), time, resource);
+
+ start_counters(time, resource);
+
+ // This is a worst insertion use case for the current implementation as
+ // we insert an element at the beginning of the hashtable and then we
+ // insert starting at the end so that each time we need to seek up to the
+ // first bucket to find the first non-empty one.
+ for (auto& us : insts)
+ {
+ us.insert(0);
+ for (int i = nb - 1; i >= 0; --i)
+ us.insert(i);
+ }
+
+ stop_counters(time, resource);
+ ostr.str("");
+ ostr << desc << " 2nd insert";
+ report_performance(__FILE__, ostr.str().c_str(), time, resource);
+
+ start_counters(time, resource);
+
+ for (auto& us : insts)
+ for (int j = nb - 1; j >= 0; --j)
+ us.erase(j);
+
+ stop_counters(time, resource);
+ ostr.str("");
+ ostr << desc << " erase key ";
+ report_performance(__FILE__, ostr.str().c_str(), time, resource);
+ }
+
+ // Bench using unordered_set<string> that show how important it is to cache
+ // hash code as computing string hash code is quite expensive compared to
+ // computing it for int.
+ template<typename _ContType>
+ void bench_str(const char* desc)
+ {
+ using namespace __gnu_test;
+
+ time_counter time;
+ resource_counter resource;
+
+ // First generate once strings that are going to be used throughout the
+ // bench:
+ std::ostringstream ostr;
+ std::vector<std::string> strs;
+ {
+ strs.reserve(nb);
+ for (int i = 0; i != nb; ++i)
+ {
+ ostr.str("");
+ ostr << "string #" << i;
+ strs.push_back(ostr.str());
+ }
+ }
+
+ std::vector<_ContType> insts(nb_insts);
+ for (int j = 0; j != nb_insts; ++j)
+ insts.emplace_back();
+
+ start_counters(time, resource);
+
+ for (auto& us : insts)
+ for (int i = 0; i != nb; ++i)
+ us.insert(strs[i]);
+
+ stop_counters(time, resource);
+
+ ostr.str("");
+ ostr << desc << " 1st insert";
+ report_performance(__FILE__, ostr.str().c_str(), time, resource);
+
+ start_counters(time, resource);
+
+ for (auto& us : insts)
+ for (int j = nb - 1; j >= 0; --j)
+ {
+ auto it = us.find(strs[j]);
+ if (it != us.end())
+ us.erase(it);
+ }
+
+ stop_counters(time, resource);
+
+ ostr.str("");
+ ostr << desc << " erase iter";
+ report_performance(__FILE__, ostr.str().c_str(), time, resource);
+
+ start_counters(time, resource);
+
+ for (auto& us : insts)
+ {
+ us.insert(strs[0]);
+ for (int i = nb - 1; i >= 0; --i)
+ us.insert(strs[i]);
+ }
+
+ stop_counters(time, resource);
+ ostr.str("");
+ ostr << desc << " 2nd insert";
+ report_performance(__FILE__, ostr.str().c_str(), time, resource);
+
+ start_counters(time, resource);
+
+ for (auto& us : insts)
+ for (int j = nb - 1; j >= 0; --j)
+ us.erase(strs[j]);
+
+ stop_counters(time, resource);
+ ostr.str("");
+ ostr << desc << " erase key ";
+ report_performance(__FILE__, ostr.str().c_str(), time, resource);
+ }
+}
+
+template<bool cache>
+ using __uset =
+ std::__uset_hashtable<int, std::hash<int>, std::equal_to<int>,
+ std::allocator<int>,
+ std::__uset_traits<cache>>;
+
+template<bool cache>
+ using __tr1_uset =
+ std::tr1::__unordered_set<int, std::hash<int>, std::equal_to<int>,
+ std::allocator<int>,
+ cache>;
+
+template<bool cache>
+ using __uset2 =
+ std::_Hashtable<int, int, std::allocator<int>,
+ std::__detail::_Identity,
+ std::equal_to<int>, std::hash<int>,
+ std::__detail::_Mask_range_hashing,
+ std::__detail::_Default_ranged_hash,
+ std::__detail::_Power2_rehash_policy,
+ std::__uset_traits<cache>>;
+
+template<bool cache>
+ using __str_uset =
+ std::__uset_hashtable<std::string, std::hash<std::string>,
+ std::equal_to<std::string>,
+ std::allocator<std::string>,
+ std::__uset_traits<cache>>;
+
+template<bool cache>
+ using __tr1_str_uset =
+ std::tr1::__unordered_set<std::string, std::hash<std::string>,
+ std::equal_to<std::string>,
+ std::allocator<std::string>,
+ cache>;
+
+template<bool cache>
+ using __str_uset2 =
+ std::_Hashtable<std::string, std::string, std::allocator<std::string>,
+ std::__detail::_Identity,
+ std::equal_to<std::string>, std::hash<std::string>,
+ std::__detail::_Mask_range_hashing,
+ std::__detail::_Default_ranged_hash,
+ std::__detail::_Power2_rehash_policy,
+ std::__uset_traits<cache>>;
+
+int main()
+{
+ bench<__tr1_uset<false>>(
+ "std::tr1::unordered_set<int> without hash code cached: ");
+ bench<__tr1_uset<true>>(
+ "std::tr1::unordered_set<int> with hash code cached: ");
+ bench<__uset<false>>(
+ "std::unordered_set<int> without hash code cached: ");
+ bench<__uset<true>>(
+ "std::unordered_set<int> with hash code cached: ");
+ bench<std::unordered_set<int>>(
+ "std::unordered_set<int> default cache: ");
+ bench<__uset2<false>>(
+ "std::unordered_set2<int> without hash code cached: ");
+ bench<__uset2<true>>(
+ "std::unordered_set2<int> with hash code cached: ");
+ bench_str<__tr1_str_uset<false>>(
+ "std::tr1::unordered_set<string> without hash code cached:");
+ bench_str<__tr1_str_uset<true>>(
+ "std::tr1::unordered_set<string> with hash code cached: ");
+ bench_str<__str_uset<false>>(
+ "std::unordered_set<string> without hash code cached: ");
+ bench_str<__str_uset<true>>(
+ "std::unordered_set<string> with hash code cached: ");
+ bench_str<std::unordered_set<std::string>>(
+ "std::unordered_set<string> default cache: ");
+ bench_str<__str_uset2<false>>(
+ "std::unordered_set2<string> without hash code cached: ");
+ bench_str<__str_uset2<true>>(
+ "std::unordered_set2<string> with hash code cached: ");
+ return 0;
+}
diff --git a/libstdc++-v3/testsuite/util/testsuite_performance.h b/libstdc++-v3/testsuite/util/testsuite_performance.h
index 3dfd471221d..7ebf2d5b849 100644
--- a/libstdc++-v3/testsuite/util/testsuite_performance.h
+++ b/libstdc++-v3/testsuite/util/testsuite_performance.h
@@ -224,7 +224,7 @@ namespace __gnu_test
out << std::setw(4) << t.real_time() << "r" << space;
out << std::setw(4) << t.user_time() << "u" << space;
out << std::setw(4) << t.system_time() << "s" << space;
- out << std::setw(8) << r.allocated_memory() << "mem" << space;
+ out << std::setw(9) << r.allocated_memory() << "mem" << space;
out << std::setw(4) << r.hard_page_fault() << "pf" << space;
out << std::endl;