Hi Paul,
On 6/29/21 1:08 PM, Paul A. Clarke via Gcc-patches wrote:
_mm_blend_epi16 and _mm_blendv_epi8 were added earlier.
Add these four to complete the set.
2021-06-29 Paul A. Clarke <p...@us.ibm.com>
gcc/ChangeLog:
* config/rs6000/smmintrin.h (_mm_blend_pd, _mm_blendv_pd,
_mm_blend_ps, _mm_blendv_ps): New.
---
gcc/config/rs6000/smmintrin.h | 46 +++++++++++++++++++++++++++++++++++
1 file changed, 46 insertions(+)
diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h
index 1b8cad135ed0..fa17a8b2f478 100644
--- a/gcc/config/rs6000/smmintrin.h
+++ b/gcc/config/rs6000/smmintrin.h
@@ -116,6 +116,52 @@ _mm_blendv_epi8 (__m128i __A, __m128i __B, __m128i __mask)
return (__m128i) vec_sel ((__v16qu) __A, (__v16qu) __B, __lmask);
}
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
Usual line length complaint. :) Here and below...
+_mm_blend_pd (__m128d __A, __m128d __B, const int __imm8)
+{
+ const signed char __tmp = (__imm8 & 0b10) * 0b01111000 |
+ (__imm8 & 0b01) * 0b00001111;
+ __v16qi __charmask = vec_splats ((signed char) __tmp);
+ __charmask = vec_gb (__charmask);
+ __v8hu __shortmask = (__v8hu) vec_unpackh (__charmask);
+ #ifdef __BIG_ENDIAN__
+ __shortmask = vec_reve (__shortmask);
+ #endif
+ return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du) __shortmask);
This seems way too complex, and needs commentary to explain what you're
doing. Doesn't this instruction just translate into some form of
xxpermdi? Different ones for BE and LE, but still just xxpermdi, I think.
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
+_mm_blendv_pd (__m128d __A, __m128d __B, __m128d __mask)
+{
+ const __v2di __zero = {0};
+ const vector __bool long long __boolmask = vec_cmplt ((__v2di) __mask,
__zero);
+ return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du) __boolmask);
+}
Okay.
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
+_mm_blend_ps (__m128 __A, __m128 __B, const int __imm8)
+{
+ const signed char __mask = (__imm8 & 0b1000) * 0b00011000 |
+ (__imm8 & 0b0100) * 0b00001100 |
+ (__imm8 & 0b0010) * 0b00000110 |
+ (__imm8 & 0b0001) * 0b00000011;
+ __v16qi __charmask = vec_splats ( __mask);
+ __charmask = vec_gb (__charmask);
+ __v8hu __shortmask = (__v8hu) vec_unpackh (__charmask);
This is a good trick, but you need comments to explain what you're
doing, including how you build __mask. I recommend you include
alternate code for P10, where you can just use vec_genwm to expand from
__mask to a mask of word elements.
I don't understand how you're getting away with a v8hu mask for word
elements. This seems wrong to me. Adequate testing?
+ #ifdef __BIG_ENDIAN__
+ __shortmask = vec_reve (__shortmask);
+ #endif
+ return (__m128) vec_sel ((__v8hu) __A, (__v8hu) __B, __shortmask);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
+_mm_blendv_ps (__m128 __A, __m128 __B, __m128 __mask)
+{
+ const __v4si __zero = {0};
+ const vector __bool int __boolmask = vec_cmplt ((__v4si) __mask, __zero);
+ return (__m128) vec_sel ((__v4su) __A, (__v4su) __B, (__v4su) __boolmask);
+}
+
Okay.
Please have a look at the above issues and resubmit. Thanks!
Bill
extern __inline int __attribute__((__gnu_inline__, __always_inline__,
__artificial__))
_mm_testz_si128 (__m128i __A, __m128i __B)
{