From: yaozhongxiao <yaozhongx...@linux.alibaba.com>

The find_first_set and find_last_set methods are not optimal for NEON.
They can be improved by synthesizing the reduction with horizontal adds
(vaddv), which reduces the generated assembly code. In the following case,
vaddvq_s16 generates 2 instructions, whereas the chained vpaddq_s16 calls
generate 4 instructions:
```
 # vaddvq_s16
    vaddvq_s16(__asint);
    //  addv    h0, v1.8h
    //  smov    w1, v0.h[0]
 # vpaddq_s16
    vpaddq_s16(vpaddq_s16(vpaddq_s16(__asint, __zero), __zero), __zero)[0]
    // addp v1.8h,v1.8h,v2.8h
    // addp v1.8h,v1.8h,v2.8h
    // addp v1.8h,v1.8h,v2.8h
    // smov    w1, v1.h[0]
 #
```

libstdc++-v3/ChangeLog:
        * include/experimental/bits/simd_neon.h: Replace repeated vpadd
        calls with a single vaddv for aarch64.
---
 .../include/experimental/bits/simd_neon.h       | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/libstdc++-v3/include/experimental/bits/simd_neon.h b/libstdc++-v3/include/experimental/bits/simd_neon.h
index a3a8ffe165f..0b8ccc17513 100644
--- a/libstdc++-v3/include/experimental/bits/simd_neon.h
+++ b/libstdc++-v3/include/experimental/bits/simd_neon.h
@@ -311,8 +311,7 @@ struct _MaskImplNeonMixin
                  });
              __asint &= __bitsel;
 #ifdef __aarch64__
-             return vpaddq_s16(vpaddq_s16(vpaddq_s16(__asint, __zero), __zero),
-                               __zero)[0];
+             return vaddvq_s16(__asint);
 #else
              return vpadd_s16(
                vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero),
@@ -328,7 +327,7 @@ struct _MaskImplNeonMixin
                  });
              __asint &= __bitsel;
 #ifdef __aarch64__
-             return vpaddq_s32(vpaddq_s32(__asint, __zero), __zero)[0];
+             return vaddvq_s32(__asint);
 #else
              return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
                               __zero)[0];
@@ -351,8 +350,12 @@ struct _MaskImplNeonMixin
                    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                  });
              __asint &= __bitsel;
+#ifdef __aarch64__
+             return vaddv_s8(__asint);
+#else
              return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
                              __zero)[0];
+#endif
            }
          else if constexpr (sizeof(_Tp) == 2)
            {
@@ -362,12 +365,20 @@ struct _MaskImplNeonMixin
                    return static_cast<_I>(__i < _Np ? 1 << __i : 0);
                  });
              __asint &= __bitsel;
+#ifdef __aarch64__
+             return vaddv_s16(__asint);
+#else
              return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
+#endif
            }
          else if constexpr (sizeof(_Tp) == 4)
            {
              __asint &= __make_vector<_I>(0x1, 0x2);
+#ifdef __aarch64__
+             return vaddv_s32(__asint);
+#else
              return vpadd_s32(__asint, __zero)[0];
+#endif
            }
          else
            __assert_unreachable<_Tp>();
-- 
──────────────────────────────────────────────────────────────────────────
 Dr. Matthias Kretz                           https://mattkretz.github.io
 GSI Helmholtz Centre for Heavy Ion Research               https://gsi.de
 std::experimental::simd              https://github.com/VcDevel/std-simd
──────────────────────────────────────────────────────────────────────────




Reply via email to