I knew you were going to ask for it but was to impatient to propose those patches to wait anymore.

Attached you'll find what I start to work on. But I am quite disappointed by the results. At least it's showing  that results are not worst.

To be honest I was also hoping some feedback from potential users interesting in testing those patches. And maybe there are some well known (and free) benches that I could challenge ?

François

On 21/06/22 19:12, Jonathan Wakely wrote:
On Mon, 20 Jun 2022 at 17:58, François Dumont via Libstdc++
<libstd...@gcc.gnu.org> wrote:
Hi

Here is a series of patch to enhance _Hashtable behavior mostly in the
context of range insertion. I also start considering the problem of
memory fragmentation in this container with 2 objectives:

- It is easier to find out when you're done with the elements of a
bucket if the last node of the bucket N is the before-begin node of
bucket N + 1.

- It is faster to loop through nodes of a bucket if those node are close
in memory, ultimately we should have addressof(Node + 1) ==
addressof(Node) + 1
Have these changes been profiled or benchmarked? Is it measurably
faster? By how much?


[1/5] Make more use of user hints as both insertion and allocation hints.

[2/5] Introduce a new method to check if we are still looping through
the same bucket's nodes

[3/5] Consider that all initializer_list elements are going to be inserted

[4/5] Introduce a before-begin cache policy to remember which bucket is
currently pointing on it

[5/5] Prealloc nodes on _Hashtable copy and introduce a new assignment
method which replicate buckets data structure

François

diff --git a/libstdc++-v3/testsuite/performance/23_containers/insert/54075.cc b/libstdc++-v3/testsuite/performance/23_containers/insert/54075.cc
index 8aa0cd20193..bb5778a257c 100644
--- a/libstdc++-v3/testsuite/performance/23_containers/insert/54075.cc
+++ b/libstdc++-v3/testsuite/performance/23_containers/insert/54075.cc
@@ -17,12 +17,14 @@
 
 // { dg-do run { target c++11 } }
 
-#include <testsuite_performance.h>
+#include <string>
 #include <random>
 #include <sstream>
 #include <tr1/unordered_set>
 #include <unordered_set>
 
+#include <testsuite_performance.h>
+
 #define USE_MY_FOO 1
 
 struct Foo
@@ -71,10 +73,13 @@ struct HashFunction
 };
 
 const int sz = 300000;
+const int usz = sz / 2;
 
 template<typename _ContType>
   void
-  bench(const char* container_desc, const typename _ContType::value_type* foos)
+  bench(const char* container_desc,
+	const typename _ContType::value_type* foos,
+	const typename _ContType::value_type* ufoos)
   {
     using namespace __gnu_test;
 
@@ -106,6 +111,51 @@ template<typename _ContType>
     ostr << container_desc << nb_loop << " times insertion of "
 	 << sz << " elements";
     report_performance(__FILE__, ostr.str().c_str(), time, resource);
+
+    // Try to lookup for mostly unknown entries.
+    start_counters(time, resource);
+
+    int fcount = 0;
+    for (int j = 0; j != nb_loop; ++j)
+      for (int i = 0; i != usz; ++i)
+	fcount += s.find(ufoos[i]) != s.end() ? 1 : 0;
+
+    stop_counters(time, resource);
+    ostr.str("");
+    ostr << container_desc << nb_loop << " times lookup of "
+	 << usz << " elements " << fcount / nb_loop << " found";
+    report_performance(__FILE__, ostr.str().c_str(), time, resource);
+
+    // Try again the previous operations but on a copy with potentially
+    // less memory fragmentation.
+    _ContType scopy(s);
+
+    // Try to insert again to check performance of collision detection
+    start_counters(time, resource);
+
+    for (int j = 0; j != nb_loop; ++j)
+      for (int i = 0; i != sz; ++i)
+	scopy.insert(foos[i]);
+
+    stop_counters(time, resource);
+    ostr.str("");
+    ostr << container_desc << nb_loop << " times insertion of "
+	 << sz << " elements in copy";
+    report_performance(__FILE__, ostr.str().c_str(), time, resource);
+
+    // Try to lookup for mostly unknown entries.
+    start_counters(time, resource);
+
+    fcount = 0;
+    for (int j = 0; j != nb_loop; ++j)
+      for (int i = 0; i != usz; ++i)
+	fcount += scopy.find(ufoos[i]) != scopy.end() ? 1 : 0;
+
+    stop_counters(time, resource);
+    ostr.str("");
+    ostr << container_desc << nb_loop << " times lookup of "
+	 << usz << " elements " << fcount / nb_loop << " found";
+    report_performance(__FILE__, ostr.str().c_str(), time, resource);
   }
 
 template<bool cache>
@@ -155,67 +205,78 @@ int main()
 
   {
     int bars[sz];
+    int ubars[usz];
     for (int i = 0; i != sz; ++i)
       bars[i] = i;
+    for (int i = 0; i != usz; ++i)
+      ubars[i] = sz + i;
     bench<std::tr1::unordered_set<int>>(
-	"std::tr1::unordered_set<int> ", bars);
+      "std::tr1::unordered_set<int> ", bars, ubars);
     bench<std::unordered_set<int>>(
-	"std::unordered_set<int> ", bars);
+      "std::unordered_set<int> ", bars, ubars);
   }
 
-  Foo foos[sz];
-#if USE_MY_FOO
   {
-    std::random_device randev;
-    for (int i = 0; i != sz; ++i)
-      foos[i].init(randev);
-  }
+    Foo foos[sz];
+    Foo ufoos[usz];
+#if USE_MY_FOO
+    {
+      std::random_device randev;
+      for (int i = 0; i != sz; ++i)
+	foos[i].init(randev);
+      for (int i = 0; i != usz; ++i)
+	ufoos[i].init(randev);
+    }
 #endif
 
-  time_counter time;
-  resource_counter resource;
-  start_counters(time, resource);
-
-  bench<__tr1_uset<false>>(
-	"std::tr1::unordered_set without hash code cached ", foos);
-  bench<__tr1_uset<true>>(
-	"std::tr1::unordered_set with hash code cached ", foos);
-  bench<__tr1_umset<false>>(
-	"std::tr1::unordered_multiset without hash code cached ", foos);
-  bench<__tr1_umset<true>>(
-	"std::tr1::unordered_multiset with hash code cached ", foos);
-
-  stop_counters(time, resource);
-  report_performance(__FILE__, "tr1 benches", time, resource);
-
-  start_counters(time, resource);
-  bench<__uset<false>>(
-	"std::unordered_set without hash code cached ", foos);
-  bench<__uset<true>>(
-	"std::unordered_set with hash code cached ", foos);
-  bench<__umset<false>>(
-	"std::unordered_multiset without hash code cached ", foos);
-  bench<__umset<true>>(
-	"std::unordered_multiset with hash code cached ", foos);
-
-  stop_counters(time, resource);
-  report_performance(__FILE__, "std benches", time, resource);
-
-  start_counters(time, resource);
-  bench<__uset2<false>>(
-	"std::unordered_set2 without hash code cached ", foos);
-  bench<__uset2<true>>(
-	"std::unordered_set2 with hash code cached ", foos);
-  bench<__umset2<false>>(
-	"std::unordered_multiset2 without hash code cached ", foos);
-  bench<__umset2<true>>(
-	"std::unordered_multiset2 with hash code cached ", foos);
-
-  stop_counters(time, resource);
-  report_performance(__FILE__, "std2 benches", time, resource);
-
-  bench<std::unordered_set<Foo, HashFunction>>(
-	"std::unordered_set default cache ", foos);
-  bench<std::unordered_multiset<Foo, HashFunction>>(
-	"std::unordered_multiset default cache ", foos);
+    time_counter time;
+    resource_counter resource;
+    start_counters(time, resource);
+
+    bench<__tr1_uset<false>>(
+      "std::tr1::unordered_set without hash code cached ", foos, ufoos);
+    bench<__tr1_uset<true>>(
+      "std::tr1::unordered_set with hash code cached ", foos, ufoos);
+    bench<__tr1_umset<false>>(
+      "std::tr1::unordered_multiset without hash code cached ", foos, ufoos);
+    bench<__tr1_umset<true>>(
+      "std::tr1::unordered_multiset with hash code cached ", foos, ufoos);
+
+    stop_counters(time, resource);
+    report_performance(__FILE__, "tr1 benches", time, resource);
+
+    start_counters(time, resource);
+    bench<__uset<false>>(
+      "std::unordered_set without hash code cached ", foos, ufoos);
+    bench<__uset<true>>(
+      "std::unordered_set with hash code cached ", foos, ufoos);
+    bench<__umset<false>>(
+      "std::unordered_multiset without hash code cached ", foos, ufoos);
+    bench<__umset<true>>(
+      "std::unordered_multiset with hash code cached ", foos, ufoos);
+
+    stop_counters(time, resource);
+    report_performance(__FILE__, "std benches", time, resource);
+
+    start_counters(time, resource);
+    bench<__uset2<false>>(
+      "std::unordered_set2 without hash code cached ", foos, ufoos);
+    bench<__uset2<true>>(
+      "std::unordered_set2 with hash code cached ", foos, ufoos);
+    bench<__umset2<false>>(
+      "std::unordered_multiset2 without hash code cached ", foos, ufoos);
+    bench<__umset2<true>>(
+      "std::unordered_multiset2 with hash code cached ", foos, ufoos);
+
+    stop_counters(time, resource);
+    report_performance(__FILE__, "std2 benches", time, resource);
+
+    bench<std::unordered_set<Foo, HashFunction>>(
+      "std::unordered_set default cache ", foos, ufoos);
+    bench<std::unordered_multiset<Foo, HashFunction>>(
+      "std::unordered_multiset default cache ", foos, ufoos);
+  }
+
+  {
+  }
 }
diff --git a/libstdc++-v3/testsuite/performance/23_containers/insert_erase/unordered_small_size.cc b/libstdc++-v3/testsuite/performance/23_containers/insert_erase/unordered_small_size.cc
index ae63c15b5da..a23b20bf69d 100644
--- a/libstdc++-v3/testsuite/performance/23_containers/insert_erase/unordered_small_size.cc
+++ b/libstdc++-v3/testsuite/performance/23_containers/insert_erase/unordered_small_size.cc
@@ -29,7 +29,7 @@ namespace
   const int nb_insts = 150000;
 
   template<typename _ElemType>
-    void bench(const char* desc, const std::vector<_ElemType>& elems)
+    void bench(const char* desc, const std::vector<_ElemType>& elems, bool with_copy)
     {
       using namespace __gnu_test;
 
@@ -52,6 +52,19 @@ namespace
       ostr << desc << " 1st insert";
       report_performance(__FILE__, ostr.str().c_str(), time, resource);
 
+      if (with_copy)
+	{
+	  start_counters(time, resource);
+
+	  std::vector<std::unordered_set<_ElemType>>(insts).swap(insts);
+
+	  stop_counters(time, resource);
+
+	  ostr.str("");
+	  ostr << desc << " copy";
+	  report_performance(__FILE__, ostr.str().c_str(), time, resource);
+	}
+
       start_counters(time, resource);
 
       for (auto& us : insts)
@@ -103,7 +116,8 @@ int main()
     for (int i = 0; i != nb_elements; ++i)
       elems.push_back(i);
 
-    bench("std::unordered_set<int>:    ", elems);
+    bench("std::unordered_set<int>:    ", elems, false);
+    bench("std::unordered_set<int>:    ", elems, true);
   }
 
   {
@@ -118,7 +132,8 @@ int main()
 	}
     }
 
-    bench("std::unordered_set<string>: ", elems);
+    bench("std::unordered_set<string>: ", elems, false);
+    bench("std::unordered_set<string>: ", elems, true);
   }
 
   return 0;

Reply via email to