On Fri, 13 Dec 2024 at 17:13, Tamar Christina <tamar.christ...@arm.com> wrote:
>
> Hi All,
>
> We are currently generating a loop which has more comparisons than you'd
> typically need as the probabilities on the small size loop are such that it
> assumes the likely case is that an element is not found.
>
> This again generates a pattern that's harder for branch predictors to follow,
> but also just generates more instructions for what one could say is the
> typical case: That your hashtable contains the entry you are looking for.
>
> This patch adds a __builtin_expect to indicate that it's likely that you'd
> find the element that's being searched for.
>
> The second change is in _M_find_before_node where at the moment the loop
> is optimized for the case where we don't do any iterations.
>
> A simple testcase is:
>
> #include <stdbool.h>
>
> bool foo (int **a, int n, int val, int *tkn)
> {
>     for (int i = 0; i < n; i++)
>     {
>         if (!a[i] || a[i]==tkn)
>           return false;
>
>         if (*a[i] == val)
>           return true;
>     }
> }
>
> which generates:
>
> foo:
>         cmp     w1, 0
>         ble     .L1
>         add     x1, x0, w1, uxtw 3
>         b       .L4
> .L9:
>         ldr     w4, [x4]
>         cmp     w4, w2
>         beq     .L6
>         cmp     x0, x1
>         beq     .L1
> .L4:
>         ldr     x4, [x0]
>         add     x0, x0, 8
>         cmp     x4, 0
>         ccmp    x4, x3, 4, ne
>         bne     .L9
>         mov     w0, 0
> .L1:
>         ret
> .L6:
>         mov     w0, 1
>         ret
>
> i.e. BB rotation makes it generate an unconditional branch to a conditional
> branch. However this method is only called when the size is above a certain
> threshold, and so it's likely that we have to do that first iteration.
>
> Adding:
>
> #include <stdbool.h>
>
> bool foo (int **a, int n, int val, int *tkn)
> {
>     for (int i = 0; i < n; i++)
>     {
>         if (__builtin_expect(!a[i] || a[i]==tkn, 0))
>           return false;
>
>         if (*a[i] == val)
>           return true;
>     }
> }
>
> to indicate that we will likely do an iteration more generates:
>
> foo:
>         cmp     w1, 0
>         ble     .L1
>         add     x1, x0, w1, uxtw 3
> .L4:
>         ldr     x4, [x0]
>         add     x0, x0, 8
>         cmp     x4, 0
>         ccmp    x4, x3, 4, ne
>         beq     .L5
>         ldr     w4, [x4]
>         cmp     w4, w2
>         beq     .L6
>         cmp     x0, x1
>         bne     .L4
> .L1:
>         ret
> .L5:
>         mov     w0, 0
>         ret
> .L6:
>         mov     w0, 1
>         ret
>
> which results in ~0-20% extra on top of the previous patch.
>
> In table form:
>
> +-----------+--------------------+-------+----------+----------------+
> | Group     | Benchmark          | Size  | % Inline | % Unlikely all |
> +-----------+--------------------+-------+----------+----------------+
> | Find      | unord<uint64_t     | 11264 | 53.52%   | 64.81%         |
> | Find      | unord<uint64_t     | 11264 | 47.98%   | 62.21%         |
> | Find Many | unord<uint64_t     | 11    | 47.62%   | 45.32%         |
> | Find Many | unord<uint64_t     | 11    | 40.90%   | 41.99%         |
> | Find Many | unord<std::string  | 11    | 44.94%   | 41.25%         |
> | Find Many | unord<std::string  | 11    | 44.89%   | 41.19%         |
> | Find      | unord<NonSSOString | 11    | 31.17%   | 35.17%         |
> | Find      | unord<NonSSOString | 11    | 31.80%   | 33.77%         |
> | Find      | unord<uint64_t     | 352   | 28.27%   | 30.79%         |
> | Find Many | unord<uint64_t     | 352   | 30.57%   | 30.57%         |
> | Find Many | unord<uint64_t     | 11264 | 20.71%   | 30.34%         |
> | Find Many | unord<uint64_t     | 11264 | 21.33%   | 29.87%         |
> | Find Many | unord<NonSSOString | 352   | 25.93%   | 29.09%         |
> | Find Many | unord<NonSSOString | 352   | 26.59%   | 28.07%         |
> | Find Many | unord<uint64_t     | 352   | 26.80%   | 26.67%         |
> | Find      | unord<std::string  | 11    | 25.66%   | 24.44%         |
> | Find Many | unord<std::string  | 352   | 23.12%   | 24.05%         |
> | Find Many | unord<NonSSOString | 11    | 23.21%   | 23.08%         |
> | Find Many | unord<std::string  | 352   | 19.23%   | 22.08%         |
> | Find Many | unord<NonSSOString | 11    | 22.04%   | 22.04%         |
> | Find      | unord<uint64_t     | 352   | 15.43%   | 19.37%         |
> | Find      | unord<std::string  | 11    | 20.36%   | 17.48%         |
> | Find      | unord<std::string  | 352   | 18.59%   | 14.94%         |
> | Find      | unord<NonSSOString | 352   | 13.79%   | 13.76%         |
> | Find      | unord<NonSSOString | 11264 | 7.04%    | 13.38%         |
> | Find      | unord<NonSSOString | 11264 | 9.19%    | 13.24%         |
> | Find      | unord<std::string  | 11264 | 11.80%   | 12.73%         |
> | Find Many | unord<std::string  | 11264 | 8.78%    | 12.44%         |
> | Find Many | unord<std::string  | 11264 | 5.30%    | 10.86%         |
> | Find Many | unord<NonSSOString | 11264 | 6.15%    | 8.85%          |
> | Find Many | unord<NonSSOString | 11264 | 5.90%    | 8.49%          |
> | Find      | unord<NonSSOString | 352   | 9.69%    | 8.30%          |
> | Find      | unord<std::string  | 11264 | 9.97%    | 6.30%          |
> | Find      | vec<uint64_t       | 352   | 6.20%    | 6.13%          |
> +-----------+--------------------+-------+----------+----------------+
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master? and possible backports?
>
> Thanks,
> Tamar
>
> libstdc++-v3/ChangeLog:
>
>         * include/bits/hashtable.h (_M_locate): Make it likely that we find an
>         element.
>         (_M_find_before_node): Make it likely that the map has at least one
>         entry and so we do at least one iteration.
>
> ---
> diff --git a/libstdc++-v3/include/bits/hashtable.h 
> b/libstdc++-v3/include/bits/hashtable.h
> index 
> e791e52ec329277474f3218d8a44cd37ded14ac3..8101d868d0c5f7ac4f97931affcf71d826c88094
>  100644
> --- a/libstdc++-v3/include/bits/hashtable.h
> +++ b/libstdc++-v3/include/bits/hashtable.h
> @@ -2171,7 +2171,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>           if (this->_M_equals(__k, __code, *__p))
>             return __prev_p;
>
> -         if (!__p->_M_nxt || _M_bucket_index(*__p->_M_next()) != __bkt)
> +         if (__builtin_expect (!__p->_M_nxt || 
> _M_bucket_index(*__p->_M_next()) != __bkt, 0))
>             break;
>           __prev_p = __p;
>         }
> @@ -2201,7 +2201,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>             if (this->_M_equals_tr(__k, __code, *__p))
>               return __prev_p;
>
> -           if (!__p->_M_nxt || _M_bucket_index(*__p->_M_next()) != __bkt)
> +           if (__builtin_expect (!__p->_M_nxt || 
> _M_bucket_index(*__p->_M_next()) != __bkt, 0))
>               break;
>             __prev_p = __p;
>           }
> @@ -2228,7 +2228,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>                pointer_to(const_cast<__node_base&>(_M_before_begin));
>           while (__loc._M_before->_M_nxt)
>             {
> -             if (this->_M_key_equals(__k, *__loc._M_node()))
> +             if (__builtin_expect (this->_M_key_equals(__k, 
> *__loc._M_node()), 1))
>                 return __loc;
>               __loc._M_before = __loc._M_before->_M_nxt;
>             }

The first two changes make sense to me: we're typically going to loop
more than once when searching for an element, so the break is
unlikely. The third change is a bit more surprising, because although
it's likely we'll find an element *eventually* it is only likely once
out of N iterations, not likely on every iteration. But if the numbers
show it helps, then that just shows my intuition is probably wrong.

If this is optimized for 'find' is it possible that this change
pessimizes insertion (which also has to use _M_locate to find where to
insert)?

Reply via email to