Hi All, We are currently generating a loop which has more comparisons than you'd typically need as the probablities on the small size loop are such that it assumes the likely case is that an element is not found.
This again generates a pattern that's harder for branch predictors to follow, but also just generates more instructions for the what one could say is the typical case: That your hashtable contains the entry you are looking for. This patch adds a __builtin_expect to indicate that it's likely that you'd find the element that's being searched for. The second change is in _M_find_before_node where at the moment the loop is optimized for the case where we don't do any iterations. A simple testcase is: #include <stdbool.h> bool foo (int **a, int n, int val, int *tkn) { for (int i = 0; i < n; i++) { if (!a[i] || a[i]==tkn) return false; if (*a[i] == val) return true; } } which generataes: foo: cmp w1, 0 ble .L1 add x1, x0, w1, uxtw 3 b .L4 .L9: ldr w4, [x4] cmp w4, w2 beq .L6 cmp x0, x1 beq .L1 .L4: ldr x4, [x0] add x0, x0, 8 cmp x4, 0 ccmp x4, x3, 4, ne bne .L9 mov w0, 0 .L1: ret .L6: mov w0, 1 ret i.e. BB rotation makes is generate an unconditional branch to a conditional branch. However this method is only called when the size is above a certain threshold, and so it's likely that we have to do that first iteration. Adding: #include <stdbool.h> bool foo (int **a, int n, int val, int *tkn) { for (int i = 0; i < n; i++) { if (__builtin_expect(!a[i] || a[i]==tkn, 0)) return false; if (*a[i] == val) return true; } } to indicate that we will likely do an iteration more generates: foo: cmp w1, 0 ble .L1 add x1, x0, w1, uxtw 3 .L4: ldr x4, [x0] add x0, x0, 8 cmp x4, 0 ccmp x4, x3, 4, ne beq .L5 ldr w4, [x4] cmp w4, w2 beq .L6 cmp x0, x1 bne .L4 .L1: ret .L5: mov w0, 0 ret .L6: mov w0, 1 ret which results in ~0-20% extra on top of the previous patch. In table form: +-----------+--------------------+-------+----------+----------------+ | Group | Benchmark | Size | % Inline | % Unlikely all | +-----------+--------------------+-------+----------+----------------+ | Find | unord<uint64_t | 11264 | 53.52% | 64.81% | | Find | unord<uint64_t | 11264 | 47.98% | 62.21% | | Find Many | unord<uint64_t | 11 | 47.62% | 45.32% | | Find Many | unord<uint64_t | 11 | 40.90% | 41.99% | | Find Many | unord<std::string | 11 | 44.94% | 41.25% | | Find Many | unord<std::string | 11 | 44.89% | 41.19% | | Find | unord<NonSSOString | 11 | 31.17% | 35.17% | | Find | unord<NonSSOString | 11 | 31.80% | 33.77% | | Find | unord<uint64_t | 352 | 28.27% | 30.79% | | Find Many | unord<uint64_t | 352 | 30.57% | 30.57% | | Find Many | unord<uint64_t | 11264 | 20.71% | 30.34% | | Find Many | unord<uint64_t | 11264 | 21.33% | 29.87% | | Find Many | unord<NonSSOString | 352 | 25.93% | 29.09% | | Find Many | unord<NonSSOString | 352 | 26.59% | 28.07% | | Find Many | unord<uint64_t | 352 | 26.80% | 26.67% | | Find | unord<std::string | 11 | 25.66% | 24.44% | | Find Many | unord<std::string | 352 | 23.12% | 24.05% | | Find Many | unord<NonSSOString | 11 | 23.21% | 23.08% | | Find Many | unord<std::string | 352 | 19.23% | 22.08% | | Find Many | unord<NonSSOString | 11 | 22.04% | 22.04% | | Find | unord<uint64_t | 352 | 15.43% | 19.37% | | Find | unord<std::string | 11 | 20.36% | 17.48% | | Find | unord<std::string | 352 | 18.59% | 14.94% | | Find | unord<NonSSOString | 352 | 13.79% | 13.76% | | Find | unord<NonSSOString | 11264 | 7.04% | 13.38% | | Find | unord<NonSSOString | 11264 | 9.19% | 13.24% | | Find | unord<std::string | 11264 | 11.80% | 12.73% | | Find Many | unord<std::string | 11264 | 8.78% | 12.44% | | Find Many | unord<std::string | 11264 | 5.30% | 10.86% | | Find Many | unord<NonSSOString | 11264 | 6.15% | 8.85% | | Find Many | unord<NonSSOString | 11264 | 5.90% | 8.49% | | Find | unord<NonSSOString | 352 | 9.69% | 8.30% | | Find | unord<std::string | 11264 | 9.97% | 6.30% | | Find | vec<uint64_t | 352 | 6.20% | 6.13% | +-----------+--------------------+-------+----------+----------------+ Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? and possible backports? Thanks, Tamar libstdc++-v3/ChangeLog: * include/bits/hashtable.h (_M_locate): Make it likely that we find an element. (_M_find_before_node): Make it likely that the map has at least one entry and so we do at least one iteration. --- diff --git a/libstdc++-v3/include/bits/hashtable.h b/libstdc++-v3/include/bits/hashtable.h index e791e52ec329277474f3218d8a44cd37ded14ac3..8101d868d0c5f7ac4f97931affcf71d826c88094 100644 --- a/libstdc++-v3/include/bits/hashtable.h +++ b/libstdc++-v3/include/bits/hashtable.h @@ -2171,7 +2171,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION if (this->_M_equals(__k, __code, *__p)) return __prev_p; - if (!__p->_M_nxt || _M_bucket_index(*__p->_M_next()) != __bkt) + if (__builtin_expect (!__p->_M_nxt || _M_bucket_index(*__p->_M_next()) != __bkt, 0)) break; __prev_p = __p; } @@ -2201,7 +2201,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION if (this->_M_equals_tr(__k, __code, *__p)) return __prev_p; - if (!__p->_M_nxt || _M_bucket_index(*__p->_M_next()) != __bkt) + if (__builtin_expect (!__p->_M_nxt || _M_bucket_index(*__p->_M_next()) != __bkt, 0)) break; __prev_p = __p; } @@ -2228,7 +2228,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION pointer_to(const_cast<__node_base&>(_M_before_begin)); while (__loc._M_before->_M_nxt) { - if (this->_M_key_equals(__k, *__loc._M_node())) + if (__builtin_expect (this->_M_key_equals(__k, *__loc._M_node()), 1)) return __loc; __loc._M_before = __loc._M_before->_M_nxt; } --
diff --git a/libstdc++-v3/include/bits/hashtable.h b/libstdc++-v3/include/bits/hashtable.h index e791e52ec329277474f3218d8a44cd37ded14ac3..8101d868d0c5f7ac4f97931affcf71d826c88094 100644 --- a/libstdc++-v3/include/bits/hashtable.h +++ b/libstdc++-v3/include/bits/hashtable.h @@ -2171,7 +2171,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION if (this->_M_equals(__k, __code, *__p)) return __prev_p; - if (!__p->_M_nxt || _M_bucket_index(*__p->_M_next()) != __bkt) + if (__builtin_expect (!__p->_M_nxt || _M_bucket_index(*__p->_M_next()) != __bkt, 0)) break; __prev_p = __p; } @@ -2201,7 +2201,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION if (this->_M_equals_tr(__k, __code, *__p)) return __prev_p; - if (!__p->_M_nxt || _M_bucket_index(*__p->_M_next()) != __bkt) + if (__builtin_expect (!__p->_M_nxt || _M_bucket_index(*__p->_M_next()) != __bkt, 0)) break; __prev_p = __p; } @@ -2228,7 +2228,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION pointer_to(const_cast<__node_base&>(_M_before_begin)); while (__loc._M_before->_M_nxt) { - if (this->_M_key_equals(__k, *__loc._M_node())) + if (__builtin_expect (this->_M_key_equals(__k, *__loc._M_node()), 1)) return __loc; __loc._M_before = __loc._M_before->_M_nxt; }